/src/libunistring/lib/uninorm/u-normalize-internal.h
Line | Count | Source |
1 | | /* Decomposition and composition of Unicode strings. |
2 | | Copyright (C) 2009-2025 Free Software Foundation, Inc. |
3 | | Written by Bruno Haible <bruno@clisp.org>, 2009. |
4 | | |
5 | | This file is free software: you can redistribute it and/or modify |
6 | | it under the terms of the GNU Lesser General Public License as |
7 | | published by the Free Software Foundation; either version 2.1 of the |
8 | | License, or (at your option) any later version. |
9 | | |
10 | | This file is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | GNU Lesser General Public License for more details. |
14 | | |
15 | | You should have received a copy of the GNU Lesser General Public License |
16 | | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
17 | | |
18 | | UNIT * |
19 | | FUNC (uninorm_t nf, const UNIT *s, size_t n, |
20 | | UNIT *resultbuf, size_t *lengthp) |
21 | 12.5M | { |
22 | 12.5M | int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer; |
23 | 12.5M | ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer; |
24 | | |
25 | | /* The result being accumulated. */ |
26 | 12.5M | UNIT *result; |
27 | 12.5M | size_t length; |
28 | 12.5M | size_t allocated; |
29 | | /* The buffer for sorting. */ |
30 | 12.5M | #define SORTBUF_PREALLOCATED 64 |
31 | 12.5M | struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED]; |
32 | 12.5M | struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */ |
33 | 12.5M | size_t sortbuf_allocated; |
34 | 12.5M | size_t sortbuf_count; |
35 | | |
36 | | /* Initialize the accumulator. */ |
37 | 12.5M | if (resultbuf == NULL) |
38 | 12.5M | { |
39 | 12.5M | result = NULL; |
40 | 12.5M | allocated = 0; |
41 | 12.5M | } |
42 | 0 | else |
43 | 0 | { |
44 | 0 | result = resultbuf; |
45 | 0 | allocated = *lengthp; |
46 | 0 | } |
47 | 12.5M | length = 0; |
48 | | |
49 | | /* Initialize the buffer for sorting. */ |
50 | 12.5M | sortbuf = sortbuf_preallocated; |
51 | 12.5M | sortbuf_allocated = SORTBUF_PREALLOCATED; |
52 | 12.5M | sortbuf_count = 0; |
53 | | |
54 | 12.5M | { |
55 | 12.5M | const UNIT *s_end = s + n; |
56 | | |
57 | 12.5M | for (;;) |
58 | 39.3M | { |
59 | 39.3M | int count; |
60 | 39.3M | ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH]; |
61 | 39.3M | int decomposed_count; |
62 | 39.3M | int i; |
63 | | |
64 | 39.3M | if (s < s_end) |
65 | 26.7M | { |
66 | | /* Fetch the next character. */ |
67 | 26.7M | count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s); |
68 | 26.7M | decomposed_count = 1; |
69 | | |
70 | | /* Decompose it, recursively. |
71 | | It would be possible to precompute the recursive decomposition |
72 | | and store it in a table. But this would significantly increase |
73 | | the size of the decomposition tables, because for example for |
74 | | U+1FC1 the recursive canonical decomposition and the recursive |
75 | | compatibility decomposition are different. */ |
76 | 63.0M | for (int curr = 0; curr < decomposed_count; ) |
77 | 36.2M | { |
78 | | /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e. |
79 | | all elements are atomic. */ |
80 | 36.2M | ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH]; |
81 | 36.2M | int curr_decomposed_count; |
82 | | |
83 | 36.2M | curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed); |
84 | 36.2M | if (curr_decomposed_count >= 0) |
85 | 3.13M | { |
86 | | /* Move curr_decomposed[0..curr_decomposed_count-1] over |
87 | | decomposed[curr], making room. It's not worth using |
88 | | memcpy() here, since the counts are so small. */ |
89 | 3.13M | int shift = curr_decomposed_count - 1; |
90 | | |
91 | 3.13M | if (shift < 0) |
92 | 0 | abort (); |
93 | 3.13M | if (shift > 0) |
94 | 2.72M | { |
95 | 2.72M | decomposed_count += shift; |
96 | 2.72M | if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH) |
97 | 0 | abort (); |
98 | 2.75M | for (int j = decomposed_count - 1 - shift; j > curr; j--) |
99 | 25.9k | decomposed[j + shift] = decomposed[j]; |
100 | 2.72M | } |
101 | 12.6M | for (; shift >= 0; shift--) |
102 | 9.48M | decomposed[curr + shift] = curr_decomposed[shift]; |
103 | 3.13M | } |
104 | 33.1M | else |
105 | 33.1M | { |
106 | | /* decomposed[curr] is atomic. */ |
107 | 33.1M | curr++; |
108 | 33.1M | } |
109 | 36.2M | } |
110 | 26.7M | } |
111 | 12.5M | else |
112 | 12.5M | { |
113 | 12.5M | count = 0; |
114 | 12.5M | decomposed_count = 0; |
115 | 12.5M | } |
116 | | |
117 | 39.3M | i = 0; |
118 | 39.3M | for (;;) |
119 | 72.4M | { |
120 | 72.4M | ucs4_t uc; |
121 | 72.4M | int ccc; |
122 | | |
123 | 72.4M | if (s < s_end) |
124 | 59.8M | { |
125 | | /* Fetch the next character from the decomposition. */ |
126 | 59.8M | if (i == decomposed_count) |
127 | 26.7M | break; |
128 | 33.1M | uc = decomposed[i]; |
129 | 33.1M | ccc = uc_combining_class (uc); |
130 | 33.1M | } |
131 | 12.5M | else |
132 | 12.5M | { |
133 | | /* End of string reached. */ |
134 | 12.5M | uc = 0; |
135 | 12.5M | ccc = 0; |
136 | 12.5M | } |
137 | | |
138 | 45.6M | if (ccc == 0) |
139 | 42.6M | { |
140 | | /* Apply the canonical ordering algorithm to the accumulated |
141 | | sequence of characters. */ |
142 | 42.6M | if (sortbuf_count > 1) |
143 | 1.63M | gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, |
144 | 1.63M | sortbuf + sortbuf_count); |
145 | | |
146 | 42.6M | if (composer != NULL) |
147 | 42.6M | { |
148 | | /* Attempt to combine decomposed characters, as specified |
149 | | in the Unicode Standard Annex #15 "Unicode Normalization |
150 | | Forms". We need to check |
151 | | 1. whether the first accumulated character is a |
152 | | "starter" (i.e. has ccc = 0). This is usually the |
153 | | case. But when the string starts with a |
154 | | non-starter, the sortbuf also starts with a |
155 | | non-starter. Btw, this check could also be |
156 | | omitted, because the composition table has only |
157 | | entries (code1, code2) for which code1 is a |
158 | | starter; if the first accumulated character is not |
159 | | a starter, no lookup will succeed. |
160 | | 2. If the sortbuf has more than one character, check |
161 | | for each of these characters that are not "blocked" |
162 | | from the starter (i.e. have a ccc that is higher |
163 | | than the ccc of the previous character) whether it |
164 | | can be combined with the first character. |
165 | | 3. If only one character is left in sortbuf, check |
166 | | whether it can be combined with the next character |
167 | | (also a starter). */ |
168 | 42.6M | if (sortbuf_count > 0 && sortbuf[0].ccc == 0) |
169 | 30.1M | { |
170 | 33.0M | for (size_t j = 1; j < sortbuf_count; ) |
171 | 2.95M | { |
172 | 2.95M | if (sortbuf[j].ccc > sortbuf[j - 1].ccc) |
173 | 1.66M | { |
174 | 1.66M | ucs4_t combined = |
175 | 1.66M | composer (sortbuf[0].code, sortbuf[j].code); |
176 | 1.66M | if (combined) |
177 | 1.61M | { |
178 | 1.61M | sortbuf[0].code = combined; |
179 | | /* sortbuf[0].ccc = 0, still valid. */ |
180 | 3.23M | for (size_t k = j + 1; k < sortbuf_count; k++) |
181 | 1.61M | sortbuf[k - 1] = sortbuf[k]; |
182 | 1.61M | sortbuf_count--; |
183 | 1.61M | continue; |
184 | 1.61M | } |
185 | 1.66M | } |
186 | 1.33M | j++; |
187 | 1.33M | } |
188 | 30.1M | if (s < s_end && sortbuf_count == 1) |
189 | 17.6M | { |
190 | 17.6M | ucs4_t combined = |
191 | 17.6M | composer (sortbuf[0].code, uc); |
192 | 17.6M | if (combined) |
193 | 16.6k | { |
194 | 16.6k | uc = combined; |
195 | 16.6k | ccc = 0; |
196 | | /* uc could be further combined with subsequent |
197 | | characters. So don't put it into sortbuf[0] in |
198 | | this round, only in the next round. */ |
199 | 16.6k | sortbuf_count = 0; |
200 | 16.6k | } |
201 | 17.6M | } |
202 | 30.1M | } |
203 | 42.6M | } |
204 | | |
205 | 74.1M | for (size_t j = 0; j < sortbuf_count; j++) |
206 | 31.4M | { |
207 | 31.4M | ucs4_t muc = sortbuf[j].code; |
208 | | |
209 | | /* Append muc to the result accumulator. */ |
210 | 31.4M | if (length < allocated) |
211 | 19.0M | { |
212 | 19.0M | int ret = |
213 | 19.0M | U_UCTOMB (result + length, muc, allocated - length); |
214 | 19.0M | if (ret == -1) |
215 | 0 | { |
216 | 0 | errno = EINVAL; |
217 | 0 | goto fail; |
218 | 0 | } |
219 | 19.0M | if (ret >= 0) |
220 | 19.0M | { |
221 | 19.0M | length += ret; |
222 | 19.0M | goto done_appending; |
223 | 19.0M | } |
224 | 19.0M | } |
225 | 12.4M | { |
226 | 12.4M | size_t old_allocated = allocated; |
227 | 12.4M | size_t new_allocated = 2 * old_allocated; |
228 | 12.4M | if (new_allocated < 64) |
229 | 12.4M | new_allocated = 64; |
230 | 12.4M | if (new_allocated < old_allocated) /* integer overflow? */ |
231 | 0 | abort (); |
232 | 12.4M | { |
233 | 12.4M | UNIT *larger_result; |
234 | 12.4M | if (result == NULL) |
235 | 12.4M | { |
236 | 12.4M | larger_result = |
237 | 12.4M | (UNIT *) malloc (new_allocated * sizeof (UNIT)); |
238 | 12.4M | if (larger_result == NULL) |
239 | 0 | { |
240 | 0 | errno = ENOMEM; |
241 | 0 | goto fail; |
242 | 0 | } |
243 | 12.4M | } |
244 | 15.5k | else if (result == resultbuf) |
245 | 0 | { |
246 | 0 | larger_result = |
247 | 0 | (UNIT *) malloc (new_allocated * sizeof (UNIT)); |
248 | 0 | if (larger_result == NULL) |
249 | 0 | { |
250 | 0 | errno = ENOMEM; |
251 | 0 | goto fail; |
252 | 0 | } |
253 | 0 | U_CPY (larger_result, resultbuf, length); |
254 | 0 | } |
255 | 15.5k | else |
256 | 15.5k | { |
257 | 15.5k | larger_result = |
258 | 15.5k | (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); |
259 | 15.5k | if (larger_result == NULL) |
260 | 0 | { |
261 | 0 | errno = ENOMEM; |
262 | 0 | goto fail; |
263 | 0 | } |
264 | 15.5k | } |
265 | 12.4M | result = larger_result; |
266 | 12.4M | allocated = new_allocated; |
267 | 12.4M | { |
268 | 12.4M | int ret = |
269 | 12.4M | U_UCTOMB (result + length, muc, allocated - length); |
270 | 12.4M | if (ret == -1) |
271 | 0 | { |
272 | 0 | errno = EINVAL; |
273 | 0 | goto fail; |
274 | 0 | } |
275 | 12.4M | if (ret < 0) |
276 | 0 | abort (); |
277 | 12.4M | length += ret; |
278 | 12.4M | goto done_appending; |
279 | 12.4M | } |
280 | 12.4M | } |
281 | 12.4M | } |
282 | 31.4M | done_appending: ; |
283 | 31.4M | } |
284 | | |
285 | | /* sortbuf is now empty. */ |
286 | 42.6M | sortbuf_count = 0; |
287 | 42.6M | } |
288 | | |
289 | 45.6M | if (!(s < s_end)) |
290 | | /* End of string reached. */ |
291 | 12.5M | break; |
292 | | |
293 | | /* Append (uc, ccc) to sortbuf. */ |
294 | 33.1M | if (sortbuf_count == sortbuf_allocated) |
295 | 1.86k | { |
296 | 1.86k | struct ucs4_with_ccc *new_sortbuf; |
297 | | |
298 | 1.86k | sortbuf_allocated = 2 * sortbuf_allocated; |
299 | 1.86k | if (sortbuf_allocated < sortbuf_count) /* integer overflow? */ |
300 | 0 | abort (); |
301 | 1.86k | new_sortbuf = |
302 | 1.86k | (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc)); |
303 | 1.86k | if (new_sortbuf == NULL) |
304 | 0 | { |
305 | 0 | errno = ENOMEM; |
306 | 0 | goto fail; |
307 | 0 | } |
308 | 1.86k | memcpy (new_sortbuf, sortbuf, |
309 | 1.86k | sortbuf_count * sizeof (struct ucs4_with_ccc)); |
310 | 1.86k | if (sortbuf != sortbuf_preallocated) |
311 | 1.86k | free (sortbuf); |
312 | 1.86k | sortbuf = new_sortbuf; |
313 | 1.86k | } |
314 | 33.1M | sortbuf[sortbuf_count].code = uc; |
315 | 33.1M | sortbuf[sortbuf_count].ccc = ccc; |
316 | 33.1M | sortbuf_count++; |
317 | | |
318 | 33.1M | i++; |
319 | 33.1M | } |
320 | | |
321 | 39.3M | if (!(s < s_end)) |
322 | | /* End of string reached. */ |
323 | 12.5M | break; |
324 | | |
325 | 26.7M | s += count; |
326 | 26.7M | } |
327 | 12.5M | } |
328 | | |
329 | 12.5M | if (length == 0) |
330 | 91.3k | { |
331 | 91.3k | if (result == NULL) |
332 | 91.3k | { |
333 | | /* Return a non-NULL value. NULL means error. */ |
334 | 91.3k | result = (UNIT *) malloc (1); |
335 | 91.3k | if (result == NULL) |
336 | 0 | { |
337 | 0 | errno = ENOMEM; |
338 | 0 | goto fail; |
339 | 0 | } |
340 | 91.3k | } |
341 | 91.3k | } |
342 | 12.4M | else if (result != resultbuf && length < allocated) |
343 | 12.4M | { |
344 | | /* Shrink the allocated memory if possible. */ |
345 | 12.4M | UNIT *memory; |
346 | | |
347 | 12.4M | memory = (UNIT *) realloc (result, length * sizeof (UNIT)); |
348 | 12.4M | if (memory != NULL) |
349 | 12.4M | result = memory; |
350 | 12.4M | } |
351 | | |
352 | 12.5M | if (sortbuf_count > 0) |
353 | 0 | abort (); |
354 | 12.5M | if (sortbuf != sortbuf_preallocated) |
355 | 12.5M | free (sortbuf); |
356 | | |
357 | 12.5M | *lengthp = length; |
358 | 12.5M | return result; |
359 | | |
360 | 0 | fail: |
361 | 0 | { |
362 | 0 | int saved_errno = errno; |
363 | 0 | if (sortbuf != sortbuf_preallocated) |
364 | 0 | free (sortbuf); |
365 | 0 | if (result != resultbuf) |
366 | 0 | free (result); |
367 | 0 | errno = saved_errno; |
368 | 0 | } |
369 | | return NULL; |
370 | 12.5M | } Line | Count | Source | 21 | 4.12M | { | 22 | 4.12M | int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer; | 23 | 4.12M | ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer; | 24 | | | 25 | | /* The result being accumulated. */ | 26 | 4.12M | UNIT *result; | 27 | 4.12M | size_t length; | 28 | 4.12M | size_t allocated; | 29 | | /* The buffer for sorting. */ | 30 | 4.12M | #define SORTBUF_PREALLOCATED 64 | 31 | 4.12M | struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED]; | 32 | 4.12M | struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */ | 33 | 4.12M | size_t sortbuf_allocated; | 34 | 4.12M | size_t sortbuf_count; | 35 | | | 36 | | /* Initialize the accumulator. */ | 37 | 4.12M | if (resultbuf == NULL) | 38 | 4.12M | { | 39 | 4.12M | result = NULL; | 40 | 4.12M | allocated = 0; | 41 | 4.12M | } | 42 | 0 | else | 43 | 0 | { | 44 | 0 | result = resultbuf; | 45 | 0 | allocated = *lengthp; | 46 | 0 | } | 47 | 4.12M | length = 0; | 48 | | | 49 | | /* Initialize the buffer for sorting. */ | 50 | 4.12M | sortbuf = sortbuf_preallocated; | 51 | 4.12M | sortbuf_allocated = SORTBUF_PREALLOCATED; | 52 | 4.12M | sortbuf_count = 0; | 53 | | | 54 | 4.12M | { | 55 | 4.12M | const UNIT *s_end = s + n; | 56 | | | 57 | 4.12M | for (;;) | 58 | 18.8M | { | 59 | 18.8M | int count; | 60 | 18.8M | ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH]; | 61 | 18.8M | int decomposed_count; | 62 | 18.8M | int i; | 63 | | | 64 | 18.8M | if (s < s_end) | 65 | 14.6M | { | 66 | | /* Fetch the next character. */ | 67 | 14.6M | count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s); | 68 | 14.6M | decomposed_count = 1; | 69 | | | 70 | | /* Decompose it, recursively. | 71 | | It would be possible to precompute the recursive decomposition | 72 | | and store it in a table. But this would significantly increase | 73 | | the size of the decomposition tables, because for example for | 74 | | U+1FC1 the recursive canonical decomposition and the recursive | 75 | | compatibility decomposition are different. */ | 76 | 38.8M | for (int curr = 0; curr < decomposed_count; ) | 77 | 24.1M | { | 78 | | /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e. | 79 | | all elements are atomic. */ | 80 | 24.1M | ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH]; | 81 | 24.1M | int curr_decomposed_count; | 82 | | | 83 | 24.1M | curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed); | 84 | 24.1M | if (curr_decomposed_count >= 0) | 85 | 3.09M | { | 86 | | /* Move curr_decomposed[0..curr_decomposed_count-1] over | 87 | | decomposed[curr], making room. It's not worth using | 88 | | memcpy() here, since the counts are so small. */ | 89 | 3.09M | int shift = curr_decomposed_count - 1; | 90 | | | 91 | 3.09M | if (shift < 0) | 92 | 0 | abort (); | 93 | 3.09M | if (shift > 0) | 94 | 2.69M | { | 95 | 2.69M | decomposed_count += shift; | 96 | 2.69M | if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH) | 97 | 0 | abort (); | 98 | 2.70M | for (int j = decomposed_count - 1 - shift; j > curr; j--) | 99 | 16.1k | decomposed[j + shift] = decomposed[j]; | 100 | 2.69M | } | 101 | 12.5M | for (; shift >= 0; shift--) | 102 | 9.42M | decomposed[curr + shift] = curr_decomposed[shift]; | 103 | 3.09M | } | 104 | 21.0M | else | 105 | 21.0M | { | 106 | | /* decomposed[curr] is atomic. */ | 107 | 21.0M | curr++; | 108 | 21.0M | } | 109 | 24.1M | } | 110 | 14.6M | } | 111 | 4.12M | else | 112 | 4.12M | { | 113 | 4.12M | count = 0; | 114 | 4.12M | decomposed_count = 0; | 115 | 4.12M | } | 116 | | | 117 | 18.8M | i = 0; | 118 | 18.8M | for (;;) | 119 | 39.8M | { | 120 | 39.8M | ucs4_t uc; | 121 | 39.8M | int ccc; | 122 | | | 123 | 39.8M | if (s < s_end) | 124 | 35.7M | { | 125 | | /* Fetch the next character from the decomposition. */ | 126 | 35.7M | if (i == decomposed_count) | 127 | 14.6M | break; | 128 | 21.0M | uc = decomposed[i]; | 129 | 21.0M | ccc = uc_combining_class (uc); | 130 | 21.0M | } | 131 | 4.12M | else | 132 | 4.12M | { | 133 | | /* End of string reached. */ | 134 | 4.12M | uc = 0; | 135 | 4.12M | ccc = 0; | 136 | 4.12M | } | 137 | | | 138 | 25.1M | if (ccc == 0) | 139 | 22.2M | { | 140 | | /* Apply the canonical ordering algorithm to the accumulated | 141 | | sequence of characters. */ | 142 | 22.2M | if (sortbuf_count > 1) | 143 | 1.60M | gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, | 144 | 1.60M | sortbuf + sortbuf_count); | 145 | | | 146 | 22.2M | if (composer != NULL) | 147 | 22.2M | { | 148 | | /* Attempt to combine decomposed characters, as specified | 149 | | in the Unicode Standard Annex #15 "Unicode Normalization | 150 | | Forms". We need to check | 151 | | 1. whether the first accumulated character is a | 152 | | "starter" (i.e. has ccc = 0). This is usually the | 153 | | case. But when the string starts with a | 154 | | non-starter, the sortbuf also starts with a | 155 | | non-starter. Btw, this check could also be | 156 | | omitted, because the composition table has only | 157 | | entries (code1, code2) for which code1 is a | 158 | | starter; if the first accumulated character is not | 159 | | a starter, no lookup will succeed. | 160 | | 2. If the sortbuf has more than one character, check | 161 | | for each of these characters that are not "blocked" | 162 | | from the starter (i.e. have a ccc that is higher | 163 | | than the ccc of the previous character) whether it | 164 | | can be combined with the first character. | 165 | | 3. If only one character is left in sortbuf, check | 166 | | whether it can be combined with the next character | 167 | | (also a starter). */ | 168 | 22.2M | if (sortbuf_count > 0 && sortbuf[0].ccc == 0) | 169 | 18.1M | { | 170 | 21.0M | for (size_t j = 1; j < sortbuf_count; ) | 171 | 2.84M | { | 172 | 2.84M | if (sortbuf[j].ccc > sortbuf[j - 1].ccc) | 173 | 1.62M | { | 174 | 1.62M | ucs4_t combined = | 175 | 1.62M | composer (sortbuf[0].code, sortbuf[j].code); | 176 | 1.62M | if (combined) | 177 | 1.59M | { | 178 | 1.59M | sortbuf[0].code = combined; | 179 | | /* sortbuf[0].ccc = 0, still valid. */ | 180 | 3.19M | for (size_t k = j + 1; k < sortbuf_count; k++) | 181 | 1.60M | sortbuf[k - 1] = sortbuf[k]; | 182 | 1.59M | sortbuf_count--; | 183 | 1.59M | continue; | 184 | 1.59M | } | 185 | 1.62M | } | 186 | 1.25M | j++; | 187 | 1.25M | } | 188 | 18.1M | if (s < s_end && sortbuf_count == 1) | 189 | 14.0M | { | 190 | 14.0M | ucs4_t combined = | 191 | 14.0M | composer (sortbuf[0].code, uc); | 192 | 14.0M | if (combined) | 193 | 7.42k | { | 194 | 7.42k | uc = combined; | 195 | 7.42k | ccc = 0; | 196 | | /* uc could be further combined with subsequent | 197 | | characters. So don't put it into sortbuf[0] in | 198 | | this round, only in the next round. */ | 199 | 7.42k | sortbuf_count = 0; | 200 | 7.42k | } | 201 | 14.0M | } | 202 | 18.1M | } | 203 | 22.2M | } | 204 | | | 205 | 41.6M | for (size_t j = 0; j < sortbuf_count; j++) | 206 | 19.4M | { | 207 | 19.4M | ucs4_t muc = sortbuf[j].code; | 208 | | | 209 | | /* Append muc to the result accumulator. */ | 210 | 19.4M | if (length < allocated) | 211 | 15.2M | { | 212 | 15.2M | int ret = | 213 | 15.2M | U_UCTOMB (result + length, muc, allocated - length); | 214 | 15.2M | if (ret == -1) | 215 | 0 | { | 216 | 0 | errno = EINVAL; | 217 | 0 | goto fail; | 218 | 0 | } | 219 | 15.2M | if (ret >= 0) | 220 | 15.2M | { | 221 | 15.2M | length += ret; | 222 | 15.2M | goto done_appending; | 223 | 15.2M | } | 224 | 15.2M | } | 225 | 4.13M | { | 226 | 4.13M | size_t old_allocated = allocated; | 227 | 4.13M | size_t new_allocated = 2 * old_allocated; | 228 | 4.13M | if (new_allocated < 64) | 229 | 4.12M | new_allocated = 64; | 230 | 4.13M | if (new_allocated < old_allocated) /* integer overflow? */ | 231 | 0 | abort (); | 232 | 4.13M | { | 233 | 4.13M | UNIT *larger_result; | 234 | 4.13M | if (result == NULL) | 235 | 4.12M | { | 236 | 4.12M | larger_result = | 237 | 4.12M | (UNIT *) malloc (new_allocated * sizeof (UNIT)); | 238 | 4.12M | if (larger_result == NULL) | 239 | 0 | { | 240 | 0 | errno = ENOMEM; | 241 | 0 | goto fail; | 242 | 0 | } | 243 | 4.12M | } | 244 | 12.5k | else if (result == resultbuf) | 245 | 0 | { | 246 | 0 | larger_result = | 247 | 0 | (UNIT *) malloc (new_allocated * sizeof (UNIT)); | 248 | 0 | if (larger_result == NULL) | 249 | 0 | { | 250 | 0 | errno = ENOMEM; | 251 | 0 | goto fail; | 252 | 0 | } | 253 | 0 | U_CPY (larger_result, resultbuf, length); | 254 | 0 | } | 255 | 12.5k | else | 256 | 12.5k | { | 257 | 12.5k | larger_result = | 258 | 12.5k | (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); | 259 | 12.5k | if (larger_result == NULL) | 260 | 0 | { | 261 | 0 | errno = ENOMEM; | 262 | 0 | goto fail; | 263 | 0 | } | 264 | 12.5k | } | 265 | 4.13M | result = larger_result; | 266 | 4.13M | allocated = new_allocated; | 267 | 4.13M | { | 268 | 4.13M | int ret = | 269 | 4.13M | U_UCTOMB (result + length, muc, allocated - length); | 270 | 4.13M | if (ret == -1) | 271 | 0 | { | 272 | 0 | errno = EINVAL; | 273 | 0 | goto fail; | 274 | 0 | } | 275 | 4.13M | if (ret < 0) | 276 | 0 | abort (); | 277 | 4.13M | length += ret; | 278 | 4.13M | goto done_appending; | 279 | 4.13M | } | 280 | 4.13M | } | 281 | 4.13M | } | 282 | 19.4M | done_appending: ; | 283 | 19.4M | } | 284 | | | 285 | | /* sortbuf is now empty. */ | 286 | 22.2M | sortbuf_count = 0; | 287 | 22.2M | } | 288 | | | 289 | 25.1M | if (!(s < s_end)) | 290 | | /* End of string reached. */ | 291 | 4.12M | break; | 292 | | | 293 | | /* Append (uc, ccc) to sortbuf. */ | 294 | 21.0M | if (sortbuf_count == sortbuf_allocated) | 295 | 1.04k | { | 296 | 1.04k | struct ucs4_with_ccc *new_sortbuf; | 297 | | | 298 | 1.04k | sortbuf_allocated = 2 * sortbuf_allocated; | 299 | 1.04k | if (sortbuf_allocated < sortbuf_count) /* integer overflow? */ | 300 | 0 | abort (); | 301 | 1.04k | new_sortbuf = | 302 | 1.04k | (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc)); | 303 | 1.04k | if (new_sortbuf == NULL) | 304 | 0 | { | 305 | 0 | errno = ENOMEM; | 306 | 0 | goto fail; | 307 | 0 | } | 308 | 1.04k | memcpy (new_sortbuf, sortbuf, | 309 | 1.04k | sortbuf_count * sizeof (struct ucs4_with_ccc)); | 310 | 1.04k | if (sortbuf != sortbuf_preallocated) | 311 | 1.04k | free (sortbuf); | 312 | 1.04k | sortbuf = new_sortbuf; | 313 | 1.04k | } | 314 | 21.0M | sortbuf[sortbuf_count].code = uc; | 315 | 21.0M | sortbuf[sortbuf_count].ccc = ccc; | 316 | 21.0M | sortbuf_count++; | 317 | | | 318 | 21.0M | i++; | 319 | 21.0M | } | 320 | | | 321 | 18.8M | if (!(s < s_end)) | 322 | | /* End of string reached. */ | 323 | 4.12M | break; | 324 | | | 325 | 14.6M | s += count; | 326 | 14.6M | } | 327 | 4.12M | } | 328 | | | 329 | 4.12M | if (length == 0) | 330 | 0 | { | 331 | 0 | if (result == NULL) | 332 | 0 | { | 333 | | /* Return a non-NULL value. NULL means error. */ | 334 | 0 | result = (UNIT *) malloc (1); | 335 | 0 | if (result == NULL) | 336 | 0 | { | 337 | 0 | errno = ENOMEM; | 338 | 0 | goto fail; | 339 | 0 | } | 340 | 0 | } | 341 | 0 | } | 342 | 4.12M | else if (result != resultbuf && length < allocated) | 343 | 4.12M | { | 344 | | /* Shrink the allocated memory if possible. */ | 345 | 4.12M | UNIT *memory; | 346 | | | 347 | 4.12M | memory = (UNIT *) realloc (result, length * sizeof (UNIT)); | 348 | 4.12M | if (memory != NULL) | 349 | 4.12M | result = memory; | 350 | 4.12M | } | 351 | | | 352 | 4.12M | if (sortbuf_count > 0) | 353 | 0 | abort (); | 354 | 4.12M | if (sortbuf != sortbuf_preallocated) | 355 | 4.12M | free (sortbuf); | 356 | | | 357 | 4.12M | *lengthp = length; | 358 | 4.12M | return result; | 359 | | | 360 | 0 | fail: | 361 | 0 | { | 362 | 0 | int saved_errno = errno; | 363 | 0 | if (sortbuf != sortbuf_preallocated) | 364 | 0 | free (sortbuf); | 365 | 0 | if (result != resultbuf) | 366 | 0 | free (result); | 367 | 0 | errno = saved_errno; | 368 | 0 | } | 369 | | return NULL; | 370 | 4.12M | } |
Line | Count | Source | 21 | 8.42M | { | 22 | 8.42M | int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer; | 23 | 8.42M | ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer; | 24 | | | 25 | | /* The result being accumulated. */ | 26 | 8.42M | UNIT *result; | 27 | 8.42M | size_t length; | 28 | 8.42M | size_t allocated; | 29 | | /* The buffer for sorting. */ | 30 | 8.42M | #define SORTBUF_PREALLOCATED 64 | 31 | 8.42M | struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED]; | 32 | 8.42M | struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */ | 33 | 8.42M | size_t sortbuf_allocated; | 34 | 8.42M | size_t sortbuf_count; | 35 | | | 36 | | /* Initialize the accumulator. */ | 37 | 8.42M | if (resultbuf == NULL) | 38 | 8.42M | { | 39 | 8.42M | result = NULL; | 40 | 8.42M | allocated = 0; | 41 | 8.42M | } | 42 | 0 | else | 43 | 0 | { | 44 | 0 | result = resultbuf; | 45 | 0 | allocated = *lengthp; | 46 | 0 | } | 47 | 8.42M | length = 0; | 48 | | | 49 | | /* Initialize the buffer for sorting. */ | 50 | 8.42M | sortbuf = sortbuf_preallocated; | 51 | 8.42M | sortbuf_allocated = SORTBUF_PREALLOCATED; | 52 | 8.42M | sortbuf_count = 0; | 53 | | | 54 | 8.42M | { | 55 | 8.42M | const UNIT *s_end = s + n; | 56 | | | 57 | 8.42M | for (;;) | 58 | 20.4M | { | 59 | 20.4M | int count; | 60 | 20.4M | ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH]; | 61 | 20.4M | int decomposed_count; | 62 | 20.4M | int i; | 63 | | | 64 | 20.4M | if (s < s_end) | 65 | 12.0M | { | 66 | | /* Fetch the next character. */ | 67 | 12.0M | count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s); | 68 | 12.0M | decomposed_count = 1; | 69 | | | 70 | | /* Decompose it, recursively. | 71 | | It would be possible to precompute the recursive decomposition | 72 | | and store it in a table. But this would significantly increase | 73 | | the size of the decomposition tables, because for example for | 74 | | U+1FC1 the recursive canonical decomposition and the recursive | 75 | | compatibility decomposition are different. */ | 76 | 24.2M | for (int curr = 0; curr < decomposed_count; ) | 77 | 12.1M | { | 78 | | /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e. | 79 | | all elements are atomic. */ | 80 | 12.1M | ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH]; | 81 | 12.1M | int curr_decomposed_count; | 82 | | | 83 | 12.1M | curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed); | 84 | 12.1M | if (curr_decomposed_count >= 0) | 85 | 34.7k | { | 86 | | /* Move curr_decomposed[0..curr_decomposed_count-1] over | 87 | | decomposed[curr], making room. It's not worth using | 88 | | memcpy() here, since the counts are so small. */ | 89 | 34.7k | int shift = curr_decomposed_count - 1; | 90 | | | 91 | 34.7k | if (shift < 0) | 92 | 0 | abort (); | 93 | 34.7k | if (shift > 0) | 94 | 34.3k | { | 95 | 34.3k | decomposed_count += shift; | 96 | 34.3k | if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH) | 97 | 0 | abort (); | 98 | 44.1k | for (int j = decomposed_count - 1 - shift; j > curr; j--) | 99 | 9.76k | decomposed[j + shift] = decomposed[j]; | 100 | 34.3k | } | 101 | 103k | for (; shift >= 0; shift--) | 102 | 69.0k | decomposed[curr + shift] = curr_decomposed[shift]; | 103 | 34.7k | } | 104 | 12.1M | else | 105 | 12.1M | { | 106 | | /* decomposed[curr] is atomic. */ | 107 | 12.1M | curr++; | 108 | 12.1M | } | 109 | 12.1M | } | 110 | 12.0M | } | 111 | 8.42M | else | 112 | 8.42M | { | 113 | 8.42M | count = 0; | 114 | 8.42M | decomposed_count = 0; | 115 | 8.42M | } | 116 | | | 117 | 20.4M | i = 0; | 118 | 20.4M | for (;;) | 119 | 32.6M | { | 120 | 32.6M | ucs4_t uc; | 121 | 32.6M | int ccc; | 122 | | | 123 | 32.6M | if (s < s_end) | 124 | 24.1M | { | 125 | | /* Fetch the next character from the decomposition. */ | 126 | 24.1M | if (i == decomposed_count) | 127 | 12.0M | break; | 128 | 12.1M | uc = decomposed[i]; | 129 | 12.1M | ccc = uc_combining_class (uc); | 130 | 12.1M | } | 131 | 8.42M | else | 132 | 8.42M | { | 133 | | /* End of string reached. */ | 134 | 8.42M | uc = 0; | 135 | 8.42M | ccc = 0; | 136 | 8.42M | } | 137 | | | 138 | 20.5M | if (ccc == 0) | 139 | 20.4M | { | 140 | | /* Apply the canonical ordering algorithm to the accumulated | 141 | | sequence of characters. */ | 142 | 20.4M | if (sortbuf_count > 1) | 143 | 26.0k | gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, | 144 | 26.0k | sortbuf + sortbuf_count); | 145 | | | 146 | 20.4M | if (composer != NULL) | 147 | 20.4M | { | 148 | | /* Attempt to combine decomposed characters, as specified | 149 | | in the Unicode Standard Annex #15 "Unicode Normalization | 150 | | Forms". We need to check | 151 | | 1. whether the first accumulated character is a | 152 | | "starter" (i.e. has ccc = 0). This is usually the | 153 | | case. But when the string starts with a | 154 | | non-starter, the sortbuf also starts with a | 155 | | non-starter. Btw, this check could also be | 156 | | omitted, because the composition table has only | 157 | | entries (code1, code2) for which code1 is a | 158 | | starter; if the first accumulated character is not | 159 | | a starter, no lookup will succeed. | 160 | | 2. If the sortbuf has more than one character, check | 161 | | for each of these characters that are not "blocked" | 162 | | from the starter (i.e. have a ccc that is higher | 163 | | than the ccc of the previous character) whether it | 164 | | can be combined with the first character. | 165 | | 3. If only one character is left in sortbuf, check | 166 | | whether it can be combined with the next character | 167 | | (also a starter). */ | 168 | 20.4M | if (sortbuf_count > 0 && sortbuf[0].ccc == 0) | 169 | 11.9M | { | 170 | 12.0M | for (size_t j = 1; j < sortbuf_count; ) | 171 | 100k | { | 172 | 100k | if (sortbuf[j].ccc > sortbuf[j - 1].ccc) | 173 | 37.5k | { | 174 | 37.5k | ucs4_t combined = | 175 | 37.5k | composer (sortbuf[0].code, sortbuf[j].code); | 176 | 37.5k | if (combined) | 177 | 23.0k | { | 178 | 23.0k | sortbuf[0].code = combined; | 179 | | /* sortbuf[0].ccc = 0, still valid. */ | 180 | 36.7k | for (size_t k = j + 1; k < sortbuf_count; k++) | 181 | 13.6k | sortbuf[k - 1] = sortbuf[k]; | 182 | 23.0k | sortbuf_count--; | 183 | 23.0k | continue; | 184 | 23.0k | } | 185 | 37.5k | } | 186 | 77.8k | j++; | 187 | 77.8k | } | 188 | 11.9M | if (s < s_end && sortbuf_count == 1) | 189 | 3.66M | { | 190 | 3.66M | ucs4_t combined = | 191 | 3.66M | composer (sortbuf[0].code, uc); | 192 | 3.66M | if (combined) | 193 | 9.25k | { | 194 | 9.25k | uc = combined; | 195 | 9.25k | ccc = 0; | 196 | | /* uc could be further combined with subsequent | 197 | | characters. So don't put it into sortbuf[0] in | 198 | | this round, only in the next round. */ | 199 | 9.25k | sortbuf_count = 0; | 200 | 9.25k | } | 201 | 3.66M | } | 202 | 11.9M | } | 203 | 20.4M | } | 204 | | | 205 | 32.4M | for (size_t j = 0; j < sortbuf_count; j++) | 206 | 12.0M | { | 207 | 12.0M | ucs4_t muc = sortbuf[j].code; | 208 | | | 209 | | /* Append muc to the result accumulator. */ | 210 | 12.0M | if (length < allocated) | 211 | 3.74M | { | 212 | 3.74M | int ret = | 213 | 3.74M | U_UCTOMB (result + length, muc, allocated - length); | 214 | 3.74M | if (ret == -1) | 215 | 0 | { | 216 | 0 | errno = EINVAL; | 217 | 0 | goto fail; | 218 | 0 | } | 219 | 3.74M | if (ret >= 0) | 220 | 3.74M | { | 221 | 3.74M | length += ret; | 222 | 3.74M | goto done_appending; | 223 | 3.74M | } | 224 | 3.74M | } | 225 | 8.33M | { | 226 | 8.33M | size_t old_allocated = allocated; | 227 | 8.33M | size_t new_allocated = 2 * old_allocated; | 228 | 8.33M | if (new_allocated < 64) | 229 | 8.32M | new_allocated = 64; | 230 | 8.33M | if (new_allocated < old_allocated) /* integer overflow? */ | 231 | 0 | abort (); | 232 | 8.33M | { | 233 | 8.33M | UNIT *larger_result; | 234 | 8.33M | if (result == NULL) | 235 | 8.32M | { | 236 | 8.32M | larger_result = | 237 | 8.32M | (UNIT *) malloc (new_allocated * sizeof (UNIT)); | 238 | 8.32M | if (larger_result == NULL) | 239 | 0 | { | 240 | 0 | errno = ENOMEM; | 241 | 0 | goto fail; | 242 | 0 | } | 243 | 8.32M | } | 244 | 3.01k | else if (result == resultbuf) | 245 | 0 | { | 246 | 0 | larger_result = | 247 | 0 | (UNIT *) malloc (new_allocated * sizeof (UNIT)); | 248 | 0 | if (larger_result == NULL) | 249 | 0 | { | 250 | 0 | errno = ENOMEM; | 251 | 0 | goto fail; | 252 | 0 | } | 253 | 0 | U_CPY (larger_result, resultbuf, length); | 254 | 0 | } | 255 | 3.01k | else | 256 | 3.01k | { | 257 | 3.01k | larger_result = | 258 | 3.01k | (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); | 259 | 3.01k | if (larger_result == NULL) | 260 | 0 | { | 261 | 0 | errno = ENOMEM; | 262 | 0 | goto fail; | 263 | 0 | } | 264 | 3.01k | } | 265 | 8.33M | result = larger_result; | 266 | 8.33M | allocated = new_allocated; | 267 | 8.33M | { | 268 | 8.33M | int ret = | 269 | 8.33M | U_UCTOMB (result + length, muc, allocated - length); | 270 | 8.33M | if (ret == -1) | 271 | 0 | { | 272 | 0 | errno = EINVAL; | 273 | 0 | goto fail; | 274 | 0 | } | 275 | 8.33M | if (ret < 0) | 276 | 0 | abort (); | 277 | 8.33M | length += ret; | 278 | 8.33M | goto done_appending; | 279 | 8.33M | } | 280 | 8.33M | } | 281 | 8.33M | } | 282 | 12.0M | done_appending: ; | 283 | 12.0M | } | 284 | | | 285 | | /* sortbuf is now empty. */ | 286 | 20.4M | sortbuf_count = 0; | 287 | 20.4M | } | 288 | | | 289 | 20.5M | if (!(s < s_end)) | 290 | | /* End of string reached. */ | 291 | 8.42M | break; | 292 | | | 293 | | /* Append (uc, ccc) to sortbuf. */ | 294 | 12.1M | if (sortbuf_count == sortbuf_allocated) | 295 | 826 | { | 296 | 826 | struct ucs4_with_ccc *new_sortbuf; | 297 | | | 298 | 826 | sortbuf_allocated = 2 * sortbuf_allocated; | 299 | 826 | if (sortbuf_allocated < sortbuf_count) /* integer overflow? */ | 300 | 0 | abort (); | 301 | 826 | new_sortbuf = | 302 | 826 | (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc)); | 303 | 826 | if (new_sortbuf == NULL) | 304 | 0 | { | 305 | 0 | errno = ENOMEM; | 306 | 0 | goto fail; | 307 | 0 | } | 308 | 826 | memcpy (new_sortbuf, sortbuf, | 309 | 826 | sortbuf_count * sizeof (struct ucs4_with_ccc)); | 310 | 826 | if (sortbuf != sortbuf_preallocated) | 311 | 826 | free (sortbuf); | 312 | 826 | sortbuf = new_sortbuf; | 313 | 826 | } | 314 | 12.1M | sortbuf[sortbuf_count].code = uc; | 315 | 12.1M | sortbuf[sortbuf_count].ccc = ccc; | 316 | 12.1M | sortbuf_count++; | 317 | | | 318 | 12.1M | i++; | 319 | 12.1M | } | 320 | | | 321 | 20.4M | if (!(s < s_end)) | 322 | | /* End of string reached. */ | 323 | 8.42M | break; | 324 | | | 325 | 12.0M | s += count; | 326 | 12.0M | } | 327 | 8.42M | } | 328 | | | 329 | 8.42M | if (length == 0) | 330 | 91.3k | { | 331 | 91.3k | if (result == NULL) | 332 | 91.3k | { | 333 | | /* Return a non-NULL value. NULL means error. */ | 334 | 91.3k | result = (UNIT *) malloc (1); | 335 | 91.3k | if (result == NULL) | 336 | 0 | { | 337 | 0 | errno = ENOMEM; | 338 | 0 | goto fail; | 339 | 0 | } | 340 | 91.3k | } | 341 | 91.3k | } | 342 | 8.32M | else if (result != resultbuf && length < allocated) | 343 | 8.32M | { | 344 | | /* Shrink the allocated memory if possible. */ | 345 | 8.32M | UNIT *memory; | 346 | | | 347 | 8.32M | memory = (UNIT *) realloc (result, length * sizeof (UNIT)); | 348 | 8.32M | if (memory != NULL) | 349 | 8.32M | result = memory; | 350 | 8.32M | } | 351 | | | 352 | 8.42M | if (sortbuf_count > 0) | 353 | 0 | abort (); | 354 | 8.42M | if (sortbuf != sortbuf_preallocated) | 355 | 8.42M | free (sortbuf); | 356 | | | 357 | 8.42M | *lengthp = length; | 358 | 8.42M | return result; | 359 | | | 360 | 0 | fail: | 361 | 0 | { | 362 | 0 | int saved_errno = errno; | 363 | 0 | if (sortbuf != sortbuf_preallocated) | 364 | 0 | free (sortbuf); | 365 | 0 | if (result != resultbuf) | 366 | 0 | free (result); | 367 | 0 | errno = saved_errno; | 368 | 0 | } | 369 | | return NULL; | 370 | 8.42M | } |
|