/src/libunistring/lib/uninorm/u-normalize-internal.h
Line | Count | Source |
1 | | /* Decomposition and composition of Unicode strings. |
2 | | Copyright (C) 2009-2026 Free Software Foundation, Inc. |
3 | | Written by Bruno Haible <bruno@clisp.org>, 2009. |
4 | | |
5 | | This file is free software: you can redistribute it and/or modify |
6 | | it under the terms of the GNU Lesser General Public License as |
7 | | published by the Free Software Foundation; either version 2.1 of the |
8 | | License, or (at your option) any later version. |
9 | | |
10 | | This file is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | GNU Lesser General Public License for more details. |
14 | | |
15 | | You should have received a copy of the GNU Lesser General Public License |
16 | | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
17 | | |
18 | | UNIT * |
19 | | FUNC (uninorm_t nf, const UNIT *s, size_t n, |
20 | | UNIT *resultbuf, size_t *lengthp) |
21 | 7.34M | { |
22 | 7.34M | int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer; |
23 | 7.34M | ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer; |
24 | | |
25 | | /* The result being accumulated. */ |
26 | 7.34M | UNIT *result; |
27 | 7.34M | size_t allocated; |
28 | 7.34M | if (resultbuf == NULL) |
29 | 7.34M | { |
30 | 7.34M | result = NULL; |
31 | 7.34M | allocated = 0; |
32 | 7.34M | } |
33 | 0 | else |
34 | 0 | { |
35 | 0 | result = resultbuf; |
36 | 0 | allocated = *lengthp; |
37 | 0 | } |
38 | 7.34M | size_t length = 0; |
39 | | |
40 | | /* The buffer for sorting. */ |
41 | 7.34M | #define SORTBUF_PREALLOCATED 64 |
42 | 7.34M | struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED]; |
43 | 7.34M | struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */ |
44 | 7.34M | sortbuf_preallocated; |
45 | 7.34M | size_t sortbuf_allocated = SORTBUF_PREALLOCATED; |
46 | 7.34M | size_t sortbuf_count = 0; |
47 | | |
48 | 7.34M | { |
49 | 7.34M | const UNIT *s_end = s + n; |
50 | | |
51 | 7.34M | for (;;) |
52 | 25.5M | { |
53 | 25.5M | int count; |
54 | 25.5M | ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH]; |
55 | 25.5M | int decomposed_count; |
56 | | |
57 | 25.5M | if (s < s_end) |
58 | 18.2M | { |
59 | | /* Fetch the next character. */ |
60 | 18.2M | count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s); |
61 | 18.2M | decomposed_count = 1; |
62 | | |
63 | | /* Decompose it, recursively. |
64 | | It would be possible to precompute the recursive decomposition |
65 | | and store it in a table. But this would significantly increase |
66 | | the size of the decomposition tables, because for example for |
67 | | U+1FC1 the recursive canonical decomposition and the recursive |
68 | | compatibility decomposition are different. */ |
69 | 45.3M | for (int curr = 0; curr < decomposed_count; ) |
70 | 27.1M | { |
71 | | /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e. |
72 | | all elements are atomic. */ |
73 | 27.1M | ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH]; |
74 | 27.1M | int curr_decomposed_count; |
75 | | |
76 | 27.1M | curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed); |
77 | 27.1M | if (curr_decomposed_count >= 0) |
78 | 2.89M | { |
79 | | /* Move curr_decomposed[0..curr_decomposed_count-1] over |
80 | | decomposed[curr], making room. It's not worth using |
81 | | memcpy() here, since the counts are so small. */ |
82 | 2.89M | int shift = curr_decomposed_count - 1; |
83 | | |
84 | 2.89M | if (shift < 0) |
85 | 0 | abort (); |
86 | 2.89M | if (shift > 0) |
87 | 2.51M | { |
88 | 2.51M | decomposed_count += shift; |
89 | 2.51M | if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH) |
90 | 0 | abort (); |
91 | 2.54M | for (int j = decomposed_count - 1 - shift; j > curr; j--) |
92 | 27.0k | decomposed[j + shift] = decomposed[j]; |
93 | 2.51M | } |
94 | 11.8M | for (; shift >= 0; shift--) |
95 | 8.93M | decomposed[curr + shift] = curr_decomposed[shift]; |
96 | 2.89M | } |
97 | 24.2M | else |
98 | 24.2M | { |
99 | | /* decomposed[curr] is atomic. */ |
100 | 24.2M | curr++; |
101 | 24.2M | } |
102 | 27.1M | } |
103 | 18.2M | } |
104 | 7.34M | else |
105 | 7.34M | { |
106 | 7.34M | count = 0; |
107 | 7.34M | decomposed_count = 0; |
108 | 7.34M | } |
109 | | |
110 | 25.5M | int i = 0; |
111 | 25.5M | for (;;) |
112 | 49.7M | { |
113 | 49.7M | ucs4_t uc; |
114 | 49.7M | int ccc; |
115 | | |
116 | 49.7M | if (s < s_end) |
117 | 42.4M | { |
118 | | /* Fetch the next character from the decomposition. */ |
119 | 42.4M | if (i == decomposed_count) |
120 | 18.2M | break; |
121 | 24.2M | uc = decomposed[i]; |
122 | 24.2M | ccc = uc_combining_class (uc); |
123 | 24.2M | } |
124 | 7.34M | else |
125 | 7.34M | { |
126 | | /* End of string reached. */ |
127 | 7.34M | uc = 0; |
128 | 7.34M | ccc = 0; |
129 | 7.34M | } |
130 | | |
131 | 31.5M | if (ccc == 0) |
132 | 28.7M | { |
133 | | /* Apply the canonical ordering algorithm to the accumulated |
134 | | sequence of characters. */ |
135 | 28.7M | if (sortbuf_count > 1) |
136 | 1.44M | gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, |
137 | 1.44M | sortbuf + sortbuf_count); |
138 | | |
139 | 28.7M | if (composer != NULL) |
140 | 28.7M | { |
141 | | /* Attempt to combine decomposed characters, as specified |
142 | | in the Unicode Standard Annex #15 "Unicode Normalization |
143 | | Forms". We need to check |
144 | | 1. whether the first accumulated character is a |
145 | | "starter" (i.e. has ccc = 0). This is usually the |
146 | | case. But when the string starts with a |
147 | | non-starter, the sortbuf also starts with a |
148 | | non-starter. Btw, this check could also be |
149 | | omitted, because the composition table has only |
150 | | entries (code1, code2) for which code1 is a |
151 | | starter; if the first accumulated character is not |
152 | | a starter, no lookup will succeed. |
153 | | 2. If the sortbuf has more than one character, check |
154 | | for each of these characters that are not "blocked" |
155 | | from the starter (i.e. have a ccc that is higher |
156 | | than the ccc of the previous character) whether it |
157 | | can be combined with the first character. |
158 | | 3. If only one character is left in sortbuf, check |
159 | | whether it can be combined with the next character |
160 | | (also a starter). */ |
161 | 28.7M | if (sortbuf_count > 0 && sortbuf[0].ccc == 0) |
162 | 21.3M | { |
163 | 24.2M | for (size_t j = 1; j < sortbuf_count; ) |
164 | 2.82M | { |
165 | 2.82M | if (sortbuf[j].ccc > sortbuf[j - 1].ccc) |
166 | 1.47M | { |
167 | 1.47M | ucs4_t combined = |
168 | 1.47M | composer (sortbuf[0].code, sortbuf[j].code); |
169 | 1.47M | if (combined) |
170 | 1.43M | { |
171 | 1.43M | sortbuf[0].code = combined; |
172 | | /* sortbuf[0].ccc = 0, still valid. */ |
173 | 3.30M | for (size_t k = j + 1; k < sortbuf_count; k++) |
174 | 1.87M | sortbuf[k - 1] = sortbuf[k]; |
175 | 1.43M | sortbuf_count--; |
176 | 1.43M | continue; |
177 | 1.43M | } |
178 | 1.47M | } |
179 | 1.39M | j++; |
180 | 1.39M | } |
181 | 21.3M | if (s < s_end && sortbuf_count == 1) |
182 | 14.0M | { |
183 | 14.0M | ucs4_t combined = |
184 | 14.0M | composer (sortbuf[0].code, uc); |
185 | 14.0M | if (combined) |
186 | 20.0k | { |
187 | 20.0k | uc = combined; |
188 | 20.0k | ccc = 0; |
189 | | /* uc could be further combined with subsequent |
190 | | characters. So don't put it into sortbuf[0] in |
191 | | this round, only in the next round. */ |
192 | 20.0k | sortbuf_count = 0; |
193 | 20.0k | } |
194 | 14.0M | } |
195 | 21.3M | } |
196 | 28.7M | } |
197 | | |
198 | 51.5M | for (size_t j = 0; j < sortbuf_count; j++) |
199 | 22.7M | { |
200 | 22.7M | ucs4_t muc = sortbuf[j].code; |
201 | | |
202 | | /* Append muc to the result accumulator. */ |
203 | 22.7M | if (length < allocated) |
204 | 15.5M | { |
205 | 15.5M | int ret = |
206 | 15.5M | U_UCTOMB (result + length, muc, allocated - length); |
207 | 15.5M | if (ret == -1) |
208 | 0 | { |
209 | 0 | errno = EINVAL; |
210 | 0 | goto fail; |
211 | 0 | } |
212 | 15.5M | if (ret >= 0) |
213 | 15.5M | { |
214 | 15.5M | length += ret; |
215 | 15.5M | goto done_appending; |
216 | 15.5M | } |
217 | 15.5M | } |
218 | 7.28M | { |
219 | 7.28M | size_t old_allocated = allocated; |
220 | 7.28M | size_t new_allocated = 2 * old_allocated; |
221 | 7.28M | if (new_allocated < 64) |
222 | 7.27M | new_allocated = 64; |
223 | 7.28M | if (new_allocated < old_allocated) /* integer overflow? */ |
224 | 0 | abort (); |
225 | 7.28M | { |
226 | 7.28M | UNIT *larger_result; |
227 | 7.28M | if (result == NULL) |
228 | 7.27M | { |
229 | 7.27M | larger_result = |
230 | 7.27M | (UNIT *) malloc (new_allocated * sizeof (UNIT)); |
231 | 7.27M | if (larger_result == NULL) |
232 | 0 | { |
233 | 0 | errno = ENOMEM; |
234 | 0 | goto fail; |
235 | 0 | } |
236 | 7.27M | } |
237 | 14.5k | else if (result == resultbuf) |
238 | 0 | { |
239 | 0 | larger_result = |
240 | 0 | (UNIT *) malloc (new_allocated * sizeof (UNIT)); |
241 | 0 | if (larger_result == NULL) |
242 | 0 | { |
243 | 0 | errno = ENOMEM; |
244 | 0 | goto fail; |
245 | 0 | } |
246 | 0 | U_CPY (larger_result, resultbuf, length); |
247 | 0 | } |
248 | 14.5k | else |
249 | 14.5k | { |
250 | 14.5k | larger_result = |
251 | 14.5k | (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); |
252 | 14.5k | if (larger_result == NULL) |
253 | 0 | { |
254 | 0 | errno = ENOMEM; |
255 | 0 | goto fail; |
256 | 0 | } |
257 | 14.5k | } |
258 | 7.28M | result = larger_result; |
259 | 7.28M | allocated = new_allocated; |
260 | 7.28M | { |
261 | 7.28M | int ret = |
262 | 7.28M | U_UCTOMB (result + length, muc, allocated - length); |
263 | 7.28M | if (ret == -1) |
264 | 0 | { |
265 | 0 | errno = EINVAL; |
266 | 0 | goto fail; |
267 | 0 | } |
268 | 7.28M | if (ret < 0) |
269 | 0 | abort (); |
270 | 7.28M | length += ret; |
271 | 7.28M | goto done_appending; |
272 | 7.28M | } |
273 | 7.28M | } |
274 | 7.28M | } |
275 | 22.7M | done_appending: ; |
276 | 22.7M | } |
277 | | |
278 | | /* sortbuf is now empty. */ |
279 | 28.7M | sortbuf_count = 0; |
280 | 28.7M | } |
281 | | |
282 | 31.5M | if (!(s < s_end)) |
283 | | /* End of string reached. */ |
284 | 7.34M | break; |
285 | | |
286 | | /* Append (uc, ccc) to sortbuf. */ |
287 | 24.2M | if (sortbuf_count == sortbuf_allocated) |
288 | 1.93k | { |
289 | 1.93k | sortbuf_allocated = 2 * sortbuf_allocated; |
290 | 1.93k | if (sortbuf_allocated < sortbuf_count) /* integer overflow? */ |
291 | 0 | abort (); |
292 | 1.93k | struct ucs4_with_ccc *new_sortbuf = |
293 | 1.93k | (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc)); |
294 | 1.93k | if (new_sortbuf == NULL) |
295 | 0 | { |
296 | 0 | errno = ENOMEM; |
297 | 0 | goto fail; |
298 | 0 | } |
299 | 1.93k | memcpy (new_sortbuf, sortbuf, |
300 | 1.93k | sortbuf_count * sizeof (struct ucs4_with_ccc)); |
301 | 1.93k | if (sortbuf != sortbuf_preallocated) |
302 | 1.93k | free (sortbuf); |
303 | 1.93k | sortbuf = new_sortbuf; |
304 | 1.93k | } |
305 | 24.2M | sortbuf[sortbuf_count].code = uc; |
306 | 24.2M | sortbuf[sortbuf_count].ccc = ccc; |
307 | 24.2M | sortbuf_count++; |
308 | | |
309 | 24.2M | i++; |
310 | 24.2M | } |
311 | | |
312 | 25.5M | if (!(s < s_end)) |
313 | | /* End of string reached. */ |
314 | 7.34M | break; |
315 | | |
316 | 18.2M | s += count; |
317 | 18.2M | } |
318 | 7.34M | } |
319 | | |
320 | 7.34M | if (length == 0) |
321 | 69.0k | { |
322 | 69.0k | if (result == NULL) |
323 | 69.0k | { |
324 | | /* Return a non-NULL value. NULL means error. */ |
325 | 69.0k | result = (UNIT *) malloc (1); |
326 | 69.0k | if (result == NULL) |
327 | 0 | { |
328 | 0 | errno = ENOMEM; |
329 | 0 | goto fail; |
330 | 0 | } |
331 | 69.0k | } |
332 | 69.0k | } |
333 | 7.27M | else if (result != resultbuf && length < allocated) |
334 | 7.27M | { |
335 | | /* Shrink the allocated memory if possible. */ |
336 | 7.27M | UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT)); |
337 | 7.27M | if (memory != NULL) |
338 | 7.27M | result = memory; |
339 | 7.27M | } |
340 | | |
341 | 7.34M | if (sortbuf_count > 0) |
342 | 0 | abort (); |
343 | 7.34M | if (sortbuf != sortbuf_preallocated) |
344 | 7.34M | free (sortbuf); |
345 | | |
346 | 7.34M | *lengthp = length; |
347 | 7.34M | return result; |
348 | | |
349 | 0 | fail: |
350 | 0 | { |
351 | 0 | int saved_errno = errno; |
352 | 0 | if (sortbuf != sortbuf_preallocated) |
353 | 0 | free (sortbuf); |
354 | 0 | if (result != resultbuf) |
355 | 0 | free (result); |
356 | 0 | errno = saved_errno; |
357 | 0 | } |
358 | | return NULL; |
359 | 7.34M | } Line | Count | Source | 21 | 2.45M | { | 22 | 2.45M | int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer; | 23 | 2.45M | ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer; | 24 | | | 25 | | /* The result being accumulated. */ | 26 | 2.45M | UNIT *result; | 27 | 2.45M | size_t allocated; | 28 | 2.45M | if (resultbuf == NULL) | 29 | 2.45M | { | 30 | 2.45M | result = NULL; | 31 | 2.45M | allocated = 0; | 32 | 2.45M | } | 33 | 0 | else | 34 | 0 | { | 35 | 0 | result = resultbuf; | 36 | 0 | allocated = *lengthp; | 37 | 0 | } | 38 | 2.45M | size_t length = 0; | 39 | | | 40 | | /* The buffer for sorting. */ | 41 | 2.45M | #define SORTBUF_PREALLOCATED 64 | 42 | 2.45M | struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED]; | 43 | 2.45M | struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */ | 44 | 2.45M | sortbuf_preallocated; | 45 | 2.45M | size_t sortbuf_allocated = SORTBUF_PREALLOCATED; | 46 | 2.45M | size_t sortbuf_count = 0; | 47 | | | 48 | 2.45M | { | 49 | 2.45M | const UNIT *s_end = s + n; | 50 | | | 51 | 2.45M | for (;;) | 52 | 13.4M | { | 53 | 13.4M | int count; | 54 | 13.4M | ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH]; | 55 | 13.4M | int decomposed_count; | 56 | | | 57 | 13.4M | if (s < s_end) | 58 | 11.0M | { | 59 | | /* Fetch the next character. */ | 60 | 11.0M | count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s); | 61 | 11.0M | decomposed_count = 1; | 62 | | | 63 | | /* Decompose it, recursively. | 64 | | It would be possible to precompute the recursive decomposition | 65 | | and store it in a table. But this would significantly increase | 66 | | the size of the decomposition tables, because for example for | 67 | | U+1FC1 the recursive canonical decomposition and the recursive | 68 | | compatibility decomposition are different. */ | 69 | 30.8M | for (int curr = 0; curr < decomposed_count; ) | 70 | 19.8M | { | 71 | | /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e. | 72 | | all elements are atomic. */ | 73 | 19.8M | ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH]; | 74 | 19.8M | int curr_decomposed_count; | 75 | | | 76 | 19.8M | curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed); | 77 | 19.8M | if (curr_decomposed_count >= 0) | 78 | 2.86M | { | 79 | | /* Move curr_decomposed[0..curr_decomposed_count-1] over | 80 | | decomposed[curr], making room. It's not worth using | 81 | | memcpy() here, since the counts are so small. */ | 82 | 2.86M | int shift = curr_decomposed_count - 1; | 83 | | | 84 | 2.86M | if (shift < 0) | 85 | 0 | abort (); | 86 | 2.86M | if (shift > 0) | 87 | 2.48M | { | 88 | 2.48M | decomposed_count += shift; | 89 | 2.48M | if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH) | 90 | 0 | abort (); | 91 | 2.50M | for (int j = decomposed_count - 1 - shift; j > curr; j--) | 92 | 16.4k | decomposed[j + shift] = decomposed[j]; | 93 | 2.48M | } | 94 | 11.7M | for (; shift >= 0; shift--) | 95 | 8.86M | decomposed[curr + shift] = curr_decomposed[shift]; | 96 | 2.86M | } | 97 | 17.0M | else | 98 | 17.0M | { | 99 | | /* decomposed[curr] is atomic. */ | 100 | 17.0M | curr++; | 101 | 17.0M | } | 102 | 19.8M | } | 103 | 11.0M | } | 104 | 2.45M | else | 105 | 2.45M | { | 106 | 2.45M | count = 0; | 107 | 2.45M | decomposed_count = 0; | 108 | 2.45M | } | 109 | | | 110 | 13.4M | int i = 0; | 111 | 13.4M | for (;;) | 112 | 30.4M | { | 113 | 30.4M | ucs4_t uc; | 114 | 30.4M | int ccc; | 115 | | | 116 | 30.4M | if (s < s_end) | 117 | 28.0M | { | 118 | | /* Fetch the next character from the decomposition. */ | 119 | 28.0M | if (i == decomposed_count) | 120 | 11.0M | break; | 121 | 17.0M | uc = decomposed[i]; | 122 | 17.0M | ccc = uc_combining_class (uc); | 123 | 17.0M | } | 124 | 2.45M | else | 125 | 2.45M | { | 126 | | /* End of string reached. */ | 127 | 2.45M | uc = 0; | 128 | 2.45M | ccc = 0; | 129 | 2.45M | } | 130 | | | 131 | 19.4M | if (ccc == 0) | 132 | 16.7M | { | 133 | | /* Apply the canonical ordering algorithm to the accumulated | 134 | | sequence of characters. */ | 135 | 16.7M | if (sortbuf_count > 1) | 136 | 1.41M | gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, | 137 | 1.41M | sortbuf + sortbuf_count); | 138 | | | 139 | 16.7M | if (composer != NULL) | 140 | 16.7M | { | 141 | | /* Attempt to combine decomposed characters, as specified | 142 | | in the Unicode Standard Annex #15 "Unicode Normalization | 143 | | Forms". We need to check | 144 | | 1. whether the first accumulated character is a | 145 | | "starter" (i.e. has ccc = 0). This is usually the | 146 | | case. But when the string starts with a | 147 | | non-starter, the sortbuf also starts with a | 148 | | non-starter. Btw, this check could also be | 149 | | omitted, because the composition table has only | 150 | | entries (code1, code2) for which code1 is a | 151 | | starter; if the first accumulated character is not | 152 | | a starter, no lookup will succeed. | 153 | | 2. If the sortbuf has more than one character, check | 154 | | for each of these characters that are not "blocked" | 155 | | from the starter (i.e. have a ccc that is higher | 156 | | than the ccc of the previous character) whether it | 157 | | can be combined with the first character. | 158 | | 3. If only one character is left in sortbuf, check | 159 | | whether it can be combined with the next character | 160 | | (also a starter). */ | 161 | 16.7M | if (sortbuf_count > 0 && sortbuf[0].ccc == 0) | 162 | 14.2M | { | 163 | 16.9M | for (size_t j = 1; j < sortbuf_count; ) | 164 | 2.73M | { | 165 | 2.73M | if (sortbuf[j].ccc > sortbuf[j - 1].ccc) | 166 | 1.43M | { | 167 | 1.43M | ucs4_t combined = | 168 | 1.43M | composer (sortbuf[0].code, sortbuf[j].code); | 169 | 1.43M | if (combined) | 170 | 1.41M | { | 171 | 1.41M | sortbuf[0].code = combined; | 172 | | /* sortbuf[0].ccc = 0, still valid. */ | 173 | 3.26M | for (size_t k = j + 1; k < sortbuf_count; k++) | 174 | 1.85M | sortbuf[k - 1] = sortbuf[k]; | 175 | 1.41M | sortbuf_count--; | 176 | 1.41M | continue; | 177 | 1.41M | } | 178 | 1.43M | } | 179 | 1.32M | j++; | 180 | 1.32M | } | 181 | 14.2M | if (s < s_end && sortbuf_count == 1) | 182 | 11.7M | { | 183 | 11.7M | ucs4_t combined = | 184 | 11.7M | composer (sortbuf[0].code, uc); | 185 | 11.7M | if (combined) | 186 | 8.37k | { | 187 | 8.37k | uc = combined; | 188 | 8.37k | ccc = 0; | 189 | | /* uc could be further combined with subsequent | 190 | | characters. So don't put it into sortbuf[0] in | 191 | | this round, only in the next round. */ | 192 | 8.37k | sortbuf_count = 0; | 193 | 8.37k | } | 194 | 11.7M | } | 195 | 14.2M | } | 196 | 16.7M | } | 197 | | | 198 | 32.3M | for (size_t j = 0; j < sortbuf_count; j++) | 199 | 15.5M | { | 200 | 15.5M | ucs4_t muc = sortbuf[j].code; | 201 | | | 202 | | /* Append muc to the result accumulator. */ | 203 | 15.5M | if (length < allocated) | 204 | 13.1M | { | 205 | 13.1M | int ret = | 206 | 13.1M | U_UCTOMB (result + length, muc, allocated - length); | 207 | 13.1M | if (ret == -1) | 208 | 0 | { | 209 | 0 | errno = EINVAL; | 210 | 0 | goto fail; | 211 | 0 | } | 212 | 13.1M | if (ret >= 0) | 213 | 13.1M | { | 214 | 13.1M | length += ret; | 215 | 13.1M | goto done_appending; | 216 | 13.1M | } | 217 | 13.1M | } | 218 | 2.46M | { | 219 | 2.46M | size_t old_allocated = allocated; | 220 | 2.46M | size_t new_allocated = 2 * old_allocated; | 221 | 2.46M | if (new_allocated < 64) | 222 | 2.45M | new_allocated = 64; | 223 | 2.46M | if (new_allocated < old_allocated) /* integer overflow? */ | 224 | 0 | abort (); | 225 | 2.46M | { | 226 | 2.46M | UNIT *larger_result; | 227 | 2.46M | if (result == NULL) | 228 | 2.45M | { | 229 | 2.45M | larger_result = | 230 | 2.45M | (UNIT *) malloc (new_allocated * sizeof (UNIT)); | 231 | 2.45M | if (larger_result == NULL) | 232 | 0 | { | 233 | 0 | errno = ENOMEM; | 234 | 0 | goto fail; | 235 | 0 | } | 236 | 2.45M | } | 237 | 11.8k | else if (result == resultbuf) | 238 | 0 | { | 239 | 0 | larger_result = | 240 | 0 | (UNIT *) malloc (new_allocated * sizeof (UNIT)); | 241 | 0 | if (larger_result == NULL) | 242 | 0 | { | 243 | 0 | errno = ENOMEM; | 244 | 0 | goto fail; | 245 | 0 | } | 246 | 0 | U_CPY (larger_result, resultbuf, length); | 247 | 0 | } | 248 | 11.8k | else | 249 | 11.8k | { | 250 | 11.8k | larger_result = | 251 | 11.8k | (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); | 252 | 11.8k | if (larger_result == NULL) | 253 | 0 | { | 254 | 0 | errno = ENOMEM; | 255 | 0 | goto fail; | 256 | 0 | } | 257 | 11.8k | } | 258 | 2.46M | result = larger_result; | 259 | 2.46M | allocated = new_allocated; | 260 | 2.46M | { | 261 | 2.46M | int ret = | 262 | 2.46M | U_UCTOMB (result + length, muc, allocated - length); | 263 | 2.46M | if (ret == -1) | 264 | 0 | { | 265 | 0 | errno = EINVAL; | 266 | 0 | goto fail; | 267 | 0 | } | 268 | 2.46M | if (ret < 0) | 269 | 0 | abort (); | 270 | 2.46M | length += ret; | 271 | 2.46M | goto done_appending; | 272 | 2.46M | } | 273 | 2.46M | } | 274 | 2.46M | } | 275 | 15.5M | done_appending: ; | 276 | 15.5M | } | 277 | | | 278 | | /* sortbuf is now empty. */ | 279 | 16.7M | sortbuf_count = 0; | 280 | 16.7M | } | 281 | | | 282 | 19.4M | if (!(s < s_end)) | 283 | | /* End of string reached. */ | 284 | 2.45M | break; | 285 | | | 286 | | /* Append (uc, ccc) to sortbuf. */ | 287 | 17.0M | if (sortbuf_count == sortbuf_allocated) | 288 | 1.12k | { | 289 | 1.12k | sortbuf_allocated = 2 * sortbuf_allocated; | 290 | 1.12k | if (sortbuf_allocated < sortbuf_count) /* integer overflow? */ | 291 | 0 | abort (); | 292 | 1.12k | struct ucs4_with_ccc *new_sortbuf = | 293 | 1.12k | (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc)); | 294 | 1.12k | if (new_sortbuf == NULL) | 295 | 0 | { | 296 | 0 | errno = ENOMEM; | 297 | 0 | goto fail; | 298 | 0 | } | 299 | 1.12k | memcpy (new_sortbuf, sortbuf, | 300 | 1.12k | sortbuf_count * sizeof (struct ucs4_with_ccc)); | 301 | 1.12k | if (sortbuf != sortbuf_preallocated) | 302 | 1.12k | free (sortbuf); | 303 | 1.12k | sortbuf = new_sortbuf; | 304 | 1.12k | } | 305 | 17.0M | sortbuf[sortbuf_count].code = uc; | 306 | 17.0M | sortbuf[sortbuf_count].ccc = ccc; | 307 | 17.0M | sortbuf_count++; | 308 | | | 309 | 17.0M | i++; | 310 | 17.0M | } | 311 | | | 312 | 13.4M | if (!(s < s_end)) | 313 | | /* End of string reached. */ | 314 | 2.45M | break; | 315 | | | 316 | 11.0M | s += count; | 317 | 11.0M | } | 318 | 2.45M | } | 319 | | | 320 | 2.45M | if (length == 0) | 321 | 0 | { | 322 | 0 | if (result == NULL) | 323 | 0 | { | 324 | | /* Return a non-NULL value. NULL means error. */ | 325 | 0 | result = (UNIT *) malloc (1); | 326 | 0 | if (result == NULL) | 327 | 0 | { | 328 | 0 | errno = ENOMEM; | 329 | 0 | goto fail; | 330 | 0 | } | 331 | 0 | } | 332 | 0 | } | 333 | 2.45M | else if (result != resultbuf && length < allocated) | 334 | 2.45M | { | 335 | | /* Shrink the allocated memory if possible. */ | 336 | 2.45M | UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT)); | 337 | 2.45M | if (memory != NULL) | 338 | 2.45M | result = memory; | 339 | 2.45M | } | 340 | | | 341 | 2.45M | if (sortbuf_count > 0) | 342 | 0 | abort (); | 343 | 2.45M | if (sortbuf != sortbuf_preallocated) | 344 | 2.45M | free (sortbuf); | 345 | | | 346 | 2.45M | *lengthp = length; | 347 | 2.45M | return result; | 348 | | | 349 | 0 | fail: | 350 | 0 | { | 351 | 0 | int saved_errno = errno; | 352 | 0 | if (sortbuf != sortbuf_preallocated) | 353 | 0 | free (sortbuf); | 354 | 0 | if (result != resultbuf) | 355 | 0 | free (result); | 356 | 0 | errno = saved_errno; | 357 | 0 | } | 358 | | return NULL; | 359 | 2.45M | } |
Line | Count | Source | 21 | 4.89M | { | 22 | 4.89M | int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer; | 23 | 4.89M | ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer; | 24 | | | 25 | | /* The result being accumulated. */ | 26 | 4.89M | UNIT *result; | 27 | 4.89M | size_t allocated; | 28 | 4.89M | if (resultbuf == NULL) | 29 | 4.89M | { | 30 | 4.89M | result = NULL; | 31 | 4.89M | allocated = 0; | 32 | 4.89M | } | 33 | 0 | else | 34 | 0 | { | 35 | 0 | result = resultbuf; | 36 | 0 | allocated = *lengthp; | 37 | 0 | } | 38 | 4.89M | size_t length = 0; | 39 | | | 40 | | /* The buffer for sorting. */ | 41 | 4.89M | #define SORTBUF_PREALLOCATED 64 | 42 | 4.89M | struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED]; | 43 | 4.89M | struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */ | 44 | 4.89M | sortbuf_preallocated; | 45 | 4.89M | size_t sortbuf_allocated = SORTBUF_PREALLOCATED; | 46 | 4.89M | size_t sortbuf_count = 0; | 47 | | | 48 | 4.89M | { | 49 | 4.89M | const UNIT *s_end = s + n; | 50 | | | 51 | 4.89M | for (;;) | 52 | 12.0M | { | 53 | 12.0M | int count; | 54 | 12.0M | ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH]; | 55 | 12.0M | int decomposed_count; | 56 | | | 57 | 12.0M | if (s < s_end) | 58 | 7.19M | { | 59 | | /* Fetch the next character. */ | 60 | 7.19M | count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s); | 61 | 7.19M | decomposed_count = 1; | 62 | | | 63 | | /* Decompose it, recursively. | 64 | | It would be possible to precompute the recursive decomposition | 65 | | and store it in a table. But this would significantly increase | 66 | | the size of the decomposition tables, because for example for | 67 | | U+1FC1 the recursive canonical decomposition and the recursive | 68 | | compatibility decomposition are different. */ | 69 | 14.4M | for (int curr = 0; curr < decomposed_count; ) | 70 | 7.26M | { | 71 | | /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e. | 72 | | all elements are atomic. */ | 73 | 7.26M | ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH]; | 74 | 7.26M | int curr_decomposed_count; | 75 | | | 76 | 7.26M | curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed); | 77 | 7.26M | if (curr_decomposed_count >= 0) | 78 | 33.5k | { | 79 | | /* Move curr_decomposed[0..curr_decomposed_count-1] over | 80 | | decomposed[curr], making room. It's not worth using | 81 | | memcpy() here, since the counts are so small. */ | 82 | 33.5k | int shift = curr_decomposed_count - 1; | 83 | | | 84 | 33.5k | if (shift < 0) | 85 | 0 | abort (); | 86 | 33.5k | if (shift > 0) | 87 | 33.2k | { | 88 | 33.2k | decomposed_count += shift; | 89 | 33.2k | if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH) | 90 | 0 | abort (); | 91 | 43.9k | for (int j = decomposed_count - 1 - shift; j > curr; j--) | 92 | 10.6k | decomposed[j + shift] = decomposed[j]; | 93 | 33.2k | } | 94 | 100k | for (; shift >= 0; shift--) | 95 | 66.8k | decomposed[curr + shift] = curr_decomposed[shift]; | 96 | 33.5k | } | 97 | 7.23M | else | 98 | 7.23M | { | 99 | | /* decomposed[curr] is atomic. */ | 100 | 7.23M | curr++; | 101 | 7.23M | } | 102 | 7.26M | } | 103 | 7.19M | } | 104 | 4.89M | else | 105 | 4.89M | { | 106 | 4.89M | count = 0; | 107 | 4.89M | decomposed_count = 0; | 108 | 4.89M | } | 109 | | | 110 | 12.0M | int i = 0; | 111 | 12.0M | for (;;) | 112 | 19.3M | { | 113 | 19.3M | ucs4_t uc; | 114 | 19.3M | int ccc; | 115 | | | 116 | 19.3M | if (s < s_end) | 117 | 14.4M | { | 118 | | /* Fetch the next character from the decomposition. */ | 119 | 14.4M | if (i == decomposed_count) | 120 | 7.19M | break; | 121 | 7.23M | uc = decomposed[i]; | 122 | 7.23M | ccc = uc_combining_class (uc); | 123 | 7.23M | } | 124 | 4.89M | else | 125 | 4.89M | { | 126 | | /* End of string reached. */ | 127 | 4.89M | uc = 0; | 128 | 4.89M | ccc = 0; | 129 | 4.89M | } | 130 | | | 131 | 12.1M | if (ccc == 0) | 132 | 12.0M | { | 133 | | /* Apply the canonical ordering algorithm to the accumulated | 134 | | sequence of characters. */ | 135 | 12.0M | if (sortbuf_count > 1) | 136 | 23.0k | gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, | 137 | 23.0k | sortbuf + sortbuf_count); | 138 | | | 139 | 12.0M | if (composer != NULL) | 140 | 12.0M | { | 141 | | /* Attempt to combine decomposed characters, as specified | 142 | | in the Unicode Standard Annex #15 "Unicode Normalization | 143 | | Forms". We need to check | 144 | | 1. whether the first accumulated character is a | 145 | | "starter" (i.e. has ccc = 0). This is usually the | 146 | | case. But when the string starts with a | 147 | | non-starter, the sortbuf also starts with a | 148 | | non-starter. Btw, this check could also be | 149 | | omitted, because the composition table has only | 150 | | entries (code1, code2) for which code1 is a | 151 | | starter; if the first accumulated character is not | 152 | | a starter, no lookup will succeed. | 153 | | 2. If the sortbuf has more than one character, check | 154 | | for each of these characters that are not "blocked" | 155 | | from the starter (i.e. have a ccc that is higher | 156 | | than the ccc of the previous character) whether it | 157 | | can be combined with the first character. | 158 | | 3. If only one character is left in sortbuf, check | 159 | | whether it can be combined with the next character | 160 | | (also a starter). */ | 161 | 12.0M | if (sortbuf_count > 0 && sortbuf[0].ccc == 0) | 162 | 7.12M | { | 163 | 7.22M | for (size_t j = 1; j < sortbuf_count; ) | 164 | 98.0k | { | 165 | 98.0k | if (sortbuf[j].ccc > sortbuf[j - 1].ccc) | 166 | 33.7k | { | 167 | 33.7k | ucs4_t combined = | 168 | 33.7k | composer (sortbuf[0].code, sortbuf[j].code); | 169 | 33.7k | if (combined) | 170 | 19.8k | { | 171 | 19.8k | sortbuf[0].code = combined; | 172 | | /* sortbuf[0].ccc = 0, still valid. */ | 173 | 33.3k | for (size_t k = j + 1; k < sortbuf_count; k++) | 174 | 13.5k | sortbuf[k - 1] = sortbuf[k]; | 175 | 19.8k | sortbuf_count--; | 176 | 19.8k | continue; | 177 | 19.8k | } | 178 | 33.7k | } | 179 | 78.2k | j++; | 180 | 78.2k | } | 181 | 7.12M | if (s < s_end && sortbuf_count == 1) | 182 | 2.29M | { | 183 | 2.29M | ucs4_t combined = | 184 | 2.29M | composer (sortbuf[0].code, uc); | 185 | 2.29M | if (combined) | 186 | 11.7k | { | 187 | 11.7k | uc = combined; | 188 | 11.7k | ccc = 0; | 189 | | /* uc could be further combined with subsequent | 190 | | characters. So don't put it into sortbuf[0] in | 191 | | this round, only in the next round. */ | 192 | 11.7k | sortbuf_count = 0; | 193 | 11.7k | } | 194 | 2.29M | } | 195 | 7.12M | } | 196 | 12.0M | } | 197 | | | 198 | 19.2M | for (size_t j = 0; j < sortbuf_count; j++) | 199 | 7.20M | { | 200 | 7.20M | ucs4_t muc = sortbuf[j].code; | 201 | | | 202 | | /* Append muc to the result accumulator. */ | 203 | 7.20M | if (length < allocated) | 204 | 2.37M | { | 205 | 2.37M | int ret = | 206 | 2.37M | U_UCTOMB (result + length, muc, allocated - length); | 207 | 2.37M | if (ret == -1) | 208 | 0 | { | 209 | 0 | errno = EINVAL; | 210 | 0 | goto fail; | 211 | 0 | } | 212 | 2.37M | if (ret >= 0) | 213 | 2.37M | { | 214 | 2.37M | length += ret; | 215 | 2.37M | goto done_appending; | 216 | 2.37M | } | 217 | 2.37M | } | 218 | 4.82M | { | 219 | 4.82M | size_t old_allocated = allocated; | 220 | 4.82M | size_t new_allocated = 2 * old_allocated; | 221 | 4.82M | if (new_allocated < 64) | 222 | 4.82M | new_allocated = 64; | 223 | 4.82M | if (new_allocated < old_allocated) /* integer overflow? */ | 224 | 0 | abort (); | 225 | 4.82M | { | 226 | 4.82M | UNIT *larger_result; | 227 | 4.82M | if (result == NULL) | 228 | 4.82M | { | 229 | 4.82M | larger_result = | 230 | 4.82M | (UNIT *) malloc (new_allocated * sizeof (UNIT)); | 231 | 4.82M | if (larger_result == NULL) | 232 | 0 | { | 233 | 0 | errno = ENOMEM; | 234 | 0 | goto fail; | 235 | 0 | } | 236 | 4.82M | } | 237 | 2.72k | else if (result == resultbuf) | 238 | 0 | { | 239 | 0 | larger_result = | 240 | 0 | (UNIT *) malloc (new_allocated * sizeof (UNIT)); | 241 | 0 | if (larger_result == NULL) | 242 | 0 | { | 243 | 0 | errno = ENOMEM; | 244 | 0 | goto fail; | 245 | 0 | } | 246 | 0 | U_CPY (larger_result, resultbuf, length); | 247 | 0 | } | 248 | 2.72k | else | 249 | 2.72k | { | 250 | 2.72k | larger_result = | 251 | 2.72k | (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); | 252 | 2.72k | if (larger_result == NULL) | 253 | 0 | { | 254 | 0 | errno = ENOMEM; | 255 | 0 | goto fail; | 256 | 0 | } | 257 | 2.72k | } | 258 | 4.82M | result = larger_result; | 259 | 4.82M | allocated = new_allocated; | 260 | 4.82M | { | 261 | 4.82M | int ret = | 262 | 4.82M | U_UCTOMB (result + length, muc, allocated - length); | 263 | 4.82M | if (ret == -1) | 264 | 0 | { | 265 | 0 | errno = EINVAL; | 266 | 0 | goto fail; | 267 | 0 | } | 268 | 4.82M | if (ret < 0) | 269 | 0 | abort (); | 270 | 4.82M | length += ret; | 271 | 4.82M | goto done_appending; | 272 | 4.82M | } | 273 | 4.82M | } | 274 | 4.82M | } | 275 | 7.20M | done_appending: ; | 276 | 7.20M | } | 277 | | | 278 | | /* sortbuf is now empty. */ | 279 | 12.0M | sortbuf_count = 0; | 280 | 12.0M | } | 281 | | | 282 | 12.1M | if (!(s < s_end)) | 283 | | /* End of string reached. */ | 284 | 4.89M | break; | 285 | | | 286 | | /* Append (uc, ccc) to sortbuf. */ | 287 | 7.23M | if (sortbuf_count == sortbuf_allocated) | 288 | 808 | { | 289 | 808 | sortbuf_allocated = 2 * sortbuf_allocated; | 290 | 808 | if (sortbuf_allocated < sortbuf_count) /* integer overflow? */ | 291 | 0 | abort (); | 292 | 808 | struct ucs4_with_ccc *new_sortbuf = | 293 | 808 | (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc)); | 294 | 808 | if (new_sortbuf == NULL) | 295 | 0 | { | 296 | 0 | errno = ENOMEM; | 297 | 0 | goto fail; | 298 | 0 | } | 299 | 808 | memcpy (new_sortbuf, sortbuf, | 300 | 808 | sortbuf_count * sizeof (struct ucs4_with_ccc)); | 301 | 808 | if (sortbuf != sortbuf_preallocated) | 302 | 808 | free (sortbuf); | 303 | 808 | sortbuf = new_sortbuf; | 304 | 808 | } | 305 | 7.23M | sortbuf[sortbuf_count].code = uc; | 306 | 7.23M | sortbuf[sortbuf_count].ccc = ccc; | 307 | 7.23M | sortbuf_count++; | 308 | | | 309 | 7.23M | i++; | 310 | 7.23M | } | 311 | | | 312 | 12.0M | if (!(s < s_end)) | 313 | | /* End of string reached. */ | 314 | 4.89M | break; | 315 | | | 316 | 7.19M | s += count; | 317 | 7.19M | } | 318 | 4.89M | } | 319 | | | 320 | 4.89M | if (length == 0) | 321 | 69.0k | { | 322 | 69.0k | if (result == NULL) | 323 | 69.0k | { | 324 | | /* Return a non-NULL value. NULL means error. */ | 325 | 69.0k | result = (UNIT *) malloc (1); | 326 | 69.0k | if (result == NULL) | 327 | 0 | { | 328 | 0 | errno = ENOMEM; | 329 | 0 | goto fail; | 330 | 0 | } | 331 | 69.0k | } | 332 | 69.0k | } | 333 | 4.82M | else if (result != resultbuf && length < allocated) | 334 | 4.82M | { | 335 | | /* Shrink the allocated memory if possible. */ | 336 | 4.82M | UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT)); | 337 | 4.82M | if (memory != NULL) | 338 | 4.82M | result = memory; | 339 | 4.82M | } | 340 | | | 341 | 4.89M | if (sortbuf_count > 0) | 342 | 0 | abort (); | 343 | 4.89M | if (sortbuf != sortbuf_preallocated) | 344 | 4.89M | free (sortbuf); | 345 | | | 346 | 4.89M | *lengthp = length; | 347 | 4.89M | return result; | 348 | | | 349 | 0 | fail: | 350 | 0 | { | 351 | 0 | int saved_errno = errno; | 352 | 0 | if (sortbuf != sortbuf_preallocated) | 353 | 0 | free (sortbuf); | 354 | 0 | if (result != resultbuf) | 355 | 0 | free (result); | 356 | 0 | errno = saved_errno; | 357 | 0 | } | 358 | | return NULL; | 359 | 4.89M | } |
|