/src/gnutls/lib/unistring/uninorm/u-normalize-internal.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* Decomposition and composition of Unicode strings. |
2 | | Copyright (C) 2009-2023 Free Software Foundation, Inc. |
3 | | Written by Bruno Haible <bruno@clisp.org>, 2009. |
4 | | |
5 | | This file is free software: you can redistribute it and/or modify |
6 | | it under the terms of the GNU Lesser General Public License as |
7 | | published by the Free Software Foundation; either version 2.1 of the |
8 | | License, or (at your option) any later version. |
9 | | |
10 | | This file is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | GNU Lesser General Public License for more details. |
14 | | |
15 | | You should have received a copy of the GNU Lesser General Public License |
16 | | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
17 | | |
18 | | UNIT * |
19 | | FUNC (uninorm_t nf, const UNIT *s, size_t n, |
20 | | UNIT *resultbuf, size_t *lengthp) |
21 | 0 | { |
22 | 0 | int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer; |
23 | 0 | ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer; |
24 | | |
25 | | /* The result being accumulated. */ |
26 | 0 | UNIT *result; |
27 | 0 | size_t length; |
28 | 0 | size_t allocated; |
29 | | /* The buffer for sorting. */ |
30 | 0 | #define SORTBUF_PREALLOCATED 64 |
31 | 0 | struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED]; |
32 | 0 | struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */ |
33 | 0 | size_t sortbuf_allocated; |
34 | 0 | size_t sortbuf_count; |
35 | | |
36 | | /* Initialize the accumulator. */ |
37 | 0 | if (resultbuf == NULL) |
38 | 0 | { |
39 | 0 | result = NULL; |
40 | 0 | allocated = 0; |
41 | 0 | } |
42 | 0 | else |
43 | 0 | { |
44 | 0 | result = resultbuf; |
45 | 0 | allocated = *lengthp; |
46 | 0 | } |
47 | 0 | length = 0; |
48 | | |
49 | | /* Initialize the buffer for sorting. */ |
50 | 0 | sortbuf = sortbuf_preallocated; |
51 | 0 | sortbuf_allocated = SORTBUF_PREALLOCATED; |
52 | 0 | sortbuf_count = 0; |
53 | |
|
54 | 0 | { |
55 | 0 | const UNIT *s_end = s + n; |
56 | |
|
57 | 0 | for (;;) |
58 | 0 | { |
59 | 0 | int count; |
60 | 0 | ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH]; |
61 | 0 | int decomposed_count; |
62 | 0 | int i; |
63 | |
|
64 | 0 | if (s < s_end) |
65 | 0 | { |
66 | | /* Fetch the next character. */ |
67 | 0 | count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s); |
68 | 0 | decomposed_count = 1; |
69 | | |
70 | | /* Decompose it, recursively. |
71 | | It would be possible to precompute the recursive decomposition |
72 | | and store it in a table. But this would significantly increase |
73 | | the size of the decomposition tables, because for example for |
74 | | U+1FC1 the recursive canonical decomposition and the recursive |
75 | | compatibility decomposition are different. */ |
76 | 0 | { |
77 | 0 | int curr; |
78 | |
|
79 | 0 | for (curr = 0; curr < decomposed_count; ) |
80 | 0 | { |
81 | | /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e. |
82 | | all elements are atomic. */ |
83 | 0 | ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH]; |
84 | 0 | int curr_decomposed_count; |
85 | |
|
86 | 0 | curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed); |
87 | 0 | if (curr_decomposed_count >= 0) |
88 | 0 | { |
89 | | /* Move curr_decomposed[0..curr_decomposed_count-1] over |
90 | | decomposed[curr], making room. It's not worth using |
91 | | memcpy() here, since the counts are so small. */ |
92 | 0 | int shift = curr_decomposed_count - 1; |
93 | |
|
94 | 0 | if (shift < 0) |
95 | 0 | abort (); |
96 | 0 | if (shift > 0) |
97 | 0 | { |
98 | 0 | int j; |
99 | |
|
100 | 0 | decomposed_count += shift; |
101 | 0 | if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH) |
102 | 0 | abort (); |
103 | 0 | for (j = decomposed_count - 1 - shift; j > curr; j--) |
104 | 0 | decomposed[j + shift] = decomposed[j]; |
105 | 0 | } |
106 | 0 | for (; shift >= 0; shift--) |
107 | 0 | decomposed[curr + shift] = curr_decomposed[shift]; |
108 | 0 | } |
109 | 0 | else |
110 | 0 | { |
111 | | /* decomposed[curr] is atomic. */ |
112 | 0 | curr++; |
113 | 0 | } |
114 | 0 | } |
115 | 0 | } |
116 | 0 | } |
117 | 0 | else |
118 | 0 | { |
119 | 0 | count = 0; |
120 | 0 | decomposed_count = 0; |
121 | 0 | } |
122 | | |
123 | 0 | i = 0; |
124 | 0 | for (;;) |
125 | 0 | { |
126 | 0 | ucs4_t uc; |
127 | 0 | int ccc; |
128 | |
|
129 | 0 | if (s < s_end) |
130 | 0 | { |
131 | | /* Fetch the next character from the decomposition. */ |
132 | 0 | if (i == decomposed_count) |
133 | 0 | break; |
134 | 0 | uc = decomposed[i]; |
135 | 0 | ccc = uc_combining_class (uc); |
136 | 0 | } |
137 | 0 | else |
138 | 0 | { |
139 | | /* End of string reached. */ |
140 | 0 | uc = 0; |
141 | 0 | ccc = 0; |
142 | 0 | } |
143 | | |
144 | 0 | if (ccc == 0) |
145 | 0 | { |
146 | 0 | size_t j; |
147 | | |
148 | | /* Apply the canonical ordering algorithm to the accumulated |
149 | | sequence of characters. */ |
150 | 0 | if (sortbuf_count > 1) |
151 | 0 | gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, |
152 | 0 | sortbuf + sortbuf_count); |
153 | |
|
154 | 0 | if (composer != NULL) |
155 | 0 | { |
156 | | /* Attempt to combine decomposed characters, as specified |
157 | | in the Unicode Standard Annex #15 "Unicode Normalization |
158 | | Forms". We need to check |
159 | | 1. whether the first accumulated character is a |
160 | | "starter" (i.e. has ccc = 0). This is usually the |
161 | | case. But when the string starts with a |
162 | | non-starter, the sortbuf also starts with a |
163 | | non-starter. Btw, this check could also be |
164 | | omitted, because the composition table has only |
165 | | entries (code1, code2) for which code1 is a |
166 | | starter; if the first accumulated character is not |
167 | | a starter, no lookup will succeed. |
168 | | 2. If the sortbuf has more than one character, check |
169 | | for each of these characters that are not "blocked" |
170 | | from the starter (i.e. have a ccc that is higher |
171 | | than the ccc of the previous character) whether it |
172 | | can be combined with the first character. |
173 | | 3. If only one character is left in sortbuf, check |
174 | | whether it can be combined with the next character |
175 | | (also a starter). */ |
176 | 0 | if (sortbuf_count > 0 && sortbuf[0].ccc == 0) |
177 | 0 | { |
178 | 0 | for (j = 1; j < sortbuf_count; ) |
179 | 0 | { |
180 | 0 | if (sortbuf[j].ccc > sortbuf[j - 1].ccc) |
181 | 0 | { |
182 | 0 | ucs4_t combined = |
183 | 0 | composer (sortbuf[0].code, sortbuf[j].code); |
184 | 0 | if (combined) |
185 | 0 | { |
186 | 0 | size_t k; |
187 | |
|
188 | 0 | sortbuf[0].code = combined; |
189 | | /* sortbuf[0].ccc = 0, still valid. */ |
190 | 0 | for (k = j + 1; k < sortbuf_count; k++) |
191 | 0 | sortbuf[k - 1] = sortbuf[k]; |
192 | 0 | sortbuf_count--; |
193 | 0 | continue; |
194 | 0 | } |
195 | 0 | } |
196 | 0 | j++; |
197 | 0 | } |
198 | 0 | if (s < s_end && sortbuf_count == 1) |
199 | 0 | { |
200 | 0 | ucs4_t combined = |
201 | 0 | composer (sortbuf[0].code, uc); |
202 | 0 | if (combined) |
203 | 0 | { |
204 | 0 | uc = combined; |
205 | 0 | ccc = 0; |
206 | | /* uc could be further combined with subsequent |
207 | | characters. So don't put it into sortbuf[0] in |
208 | | this round, only in the next round. */ |
209 | 0 | sortbuf_count = 0; |
210 | 0 | } |
211 | 0 | } |
212 | 0 | } |
213 | 0 | } |
214 | |
|
215 | 0 | for (j = 0; j < sortbuf_count; j++) |
216 | 0 | { |
217 | 0 | ucs4_t muc = sortbuf[j].code; |
218 | | |
219 | | /* Append muc to the result accumulator. */ |
220 | 0 | if (length < allocated) |
221 | 0 | { |
222 | 0 | int ret = |
223 | 0 | U_UCTOMB (result + length, muc, allocated - length); |
224 | 0 | if (ret == -1) |
225 | 0 | { |
226 | 0 | errno = EINVAL; |
227 | 0 | goto fail; |
228 | 0 | } |
229 | 0 | if (ret >= 0) |
230 | 0 | { |
231 | 0 | length += ret; |
232 | 0 | goto done_appending; |
233 | 0 | } |
234 | 0 | } |
235 | 0 | { |
236 | 0 | size_t old_allocated = allocated; |
237 | 0 | size_t new_allocated = 2 * old_allocated; |
238 | 0 | if (new_allocated < 64) |
239 | 0 | new_allocated = 64; |
240 | 0 | if (new_allocated < old_allocated) /* integer overflow? */ |
241 | 0 | abort (); |
242 | 0 | { |
243 | 0 | UNIT *larger_result; |
244 | 0 | if (result == NULL) |
245 | 0 | { |
246 | 0 | larger_result = |
247 | 0 | (UNIT *) malloc (new_allocated * sizeof (UNIT)); |
248 | 0 | if (larger_result == NULL) |
249 | 0 | { |
250 | 0 | errno = ENOMEM; |
251 | 0 | goto fail; |
252 | 0 | } |
253 | 0 | } |
254 | 0 | else if (result == resultbuf) |
255 | 0 | { |
256 | 0 | larger_result = |
257 | 0 | (UNIT *) malloc (new_allocated * sizeof (UNIT)); |
258 | 0 | if (larger_result == NULL) |
259 | 0 | { |
260 | 0 | errno = ENOMEM; |
261 | 0 | goto fail; |
262 | 0 | } |
263 | 0 | U_CPY (larger_result, resultbuf, length); |
264 | 0 | } |
265 | 0 | else |
266 | 0 | { |
267 | 0 | larger_result = |
268 | 0 | (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); |
269 | 0 | if (larger_result == NULL) |
270 | 0 | { |
271 | 0 | errno = ENOMEM; |
272 | 0 | goto fail; |
273 | 0 | } |
274 | 0 | } |
275 | 0 | result = larger_result; |
276 | 0 | allocated = new_allocated; |
277 | 0 | { |
278 | 0 | int ret = |
279 | 0 | U_UCTOMB (result + length, muc, allocated - length); |
280 | 0 | if (ret == -1) |
281 | 0 | { |
282 | 0 | errno = EINVAL; |
283 | 0 | goto fail; |
284 | 0 | } |
285 | 0 | if (ret < 0) |
286 | 0 | abort (); |
287 | 0 | length += ret; |
288 | 0 | goto done_appending; |
289 | 0 | } |
290 | 0 | } |
291 | 0 | } |
292 | 0 | done_appending: ; |
293 | 0 | } |
294 | | |
295 | | /* sortbuf is now empty. */ |
296 | 0 | sortbuf_count = 0; |
297 | 0 | } |
298 | | |
299 | 0 | if (!(s < s_end)) |
300 | | /* End of string reached. */ |
301 | 0 | break; |
302 | | |
303 | | /* Append (uc, ccc) to sortbuf. */ |
304 | 0 | if (sortbuf_count == sortbuf_allocated) |
305 | 0 | { |
306 | 0 | struct ucs4_with_ccc *new_sortbuf; |
307 | |
|
308 | 0 | sortbuf_allocated = 2 * sortbuf_allocated; |
309 | 0 | if (sortbuf_allocated < sortbuf_count) /* integer overflow? */ |
310 | 0 | abort (); |
311 | 0 | new_sortbuf = |
312 | 0 | (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc)); |
313 | 0 | if (new_sortbuf == NULL) |
314 | 0 | { |
315 | 0 | errno = ENOMEM; |
316 | 0 | goto fail; |
317 | 0 | } |
318 | 0 | memcpy (new_sortbuf, sortbuf, |
319 | 0 | sortbuf_count * sizeof (struct ucs4_with_ccc)); |
320 | 0 | if (sortbuf != sortbuf_preallocated) |
321 | 0 | free (sortbuf); |
322 | 0 | sortbuf = new_sortbuf; |
323 | 0 | } |
324 | 0 | sortbuf[sortbuf_count].code = uc; |
325 | 0 | sortbuf[sortbuf_count].ccc = ccc; |
326 | 0 | sortbuf_count++; |
327 | |
|
328 | 0 | i++; |
329 | 0 | } |
330 | | |
331 | 0 | if (!(s < s_end)) |
332 | | /* End of string reached. */ |
333 | 0 | break; |
334 | | |
335 | 0 | s += count; |
336 | 0 | } |
337 | 0 | } |
338 | | |
339 | 0 | if (length == 0) |
340 | 0 | { |
341 | 0 | if (result == NULL) |
342 | 0 | { |
343 | | /* Return a non-NULL value. NULL means error. */ |
344 | 0 | result = (UNIT *) malloc (1); |
345 | 0 | if (result == NULL) |
346 | 0 | { |
347 | 0 | errno = ENOMEM; |
348 | 0 | goto fail; |
349 | 0 | } |
350 | 0 | } |
351 | 0 | } |
352 | 0 | else if (result != resultbuf && length < allocated) |
353 | 0 | { |
354 | | /* Shrink the allocated memory if possible. */ |
355 | 0 | UNIT *memory; |
356 | |
|
357 | 0 | memory = (UNIT *) realloc (result, length * sizeof (UNIT)); |
358 | 0 | if (memory != NULL) |
359 | 0 | result = memory; |
360 | 0 | } |
361 | | |
362 | 0 | if (sortbuf_count > 0) |
363 | 0 | abort (); |
364 | 0 | if (sortbuf != sortbuf_preallocated) |
365 | 0 | free (sortbuf); |
366 | |
|
367 | 0 | *lengthp = length; |
368 | 0 | return result; |
369 | | |
370 | 0 | fail: |
371 | 0 | { |
372 | 0 | int saved_errno = errno; |
373 | 0 | if (sortbuf != sortbuf_preallocated) |
374 | 0 | free (sortbuf); |
375 | 0 | if (result != resultbuf) |
376 | 0 | free (result); |
377 | 0 | errno = saved_errno; |
378 | 0 | } |
379 | 0 | return NULL; |
380 | 0 | } Unexecuted instantiation: u16_normalize Unexecuted instantiation: u32_normalize |