/src/libunistring/lib/unicase/u-casemap.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent). |
2 | | Copyright (C) 2009-2022 Free Software Foundation, Inc. |
3 | | Written by Bruno Haible <bruno@clisp.org>, 2009. |
4 | | |
5 | | This file is free software. |
6 | | It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". |
7 | | You can redistribute it and/or modify it under either |
8 | | - the terms of the GNU Lesser General Public License as published |
9 | | by the Free Software Foundation; either version 3, or (at your |
10 | | option) any later version, or |
11 | | - the terms of the GNU General Public License as published by the |
12 | | Free Software Foundation; either version 2, or (at your option) |
13 | | any later version, or |
14 | | - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". |
15 | | |
16 | | This file is distributed in the hope that it will be useful, |
17 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
19 | | Lesser General Public License and the GNU General Public License |
20 | | for more details. |
21 | | |
22 | | You should have received a copy of the GNU Lesser General Public |
23 | | License and of the GNU General Public License along with this |
24 | | program. If not, see <https://www.gnu.org/licenses/>. */ |
25 | | |
26 | | UNIT * |
27 | | FUNC (const UNIT *s, size_t n, |
28 | | casing_prefix_context_t prefix_context, |
29 | | casing_suffix_context_t suffix_context, |
30 | | const char *iso639_language, |
31 | | ucs4_t (*single_character_map) (ucs4_t), |
32 | | size_t offset_in_rule, /* byte offset, within 'struct special_casing_rule', |
| | of the array of up to 3 'unsigned short' mapped characters that |
| | is read when a rule applies */ |
33 | | uninorm_t nf, |
34 | | UNIT *resultbuf, size_t *lengthp) |
35 | 0 | { |
36 | | /* Overview: |
| | Walks over the characters of s[0..n-1]. For each character below |
| | 0x10000, the special-casing rules found by gl_unicase_special_lookup |
| | are tried in order; the first rule whose language and context |
| | condition both match supplies 0 to 3 replacement characters. |
| | Otherwise the character is replaced by single_character_map (uc). |
| | The replacements are appended to the accumulator declared below. |
| | If nf != NULL, the accumulated string is finally passed through |
| | U_NORMALIZE and that function's return value is returned. |
| | |
| | prefix_context provides the initial values of the two "last |
| | character" helpers; suffix_context is consulted whenever a look-ahead |
| | loop reaches the end of s. |
| | |
| | Without normalization, the return value is either resultbuf or |
| | malloc'ed memory, and the number of units is stored in *lengthp. |
| | On failure, NULL is returned: errno is set to EINVAL when U_UCTOMB |
| | rejects a character and to ENOMEM when an allocation fails (for a |
| | U_NORMALIZE failure errno is not touched here). The accumulator is |
| | freed on failure unless it is the caller's resultbuf, and errno is |
| | preserved across that free. |
| | |
| | The result being accumulated: result[0..length-1] holds the output |
| | so far, allocated is the capacity of result in units. result is |
| | NULL, resultbuf, or a malloc'ed block. */ |
37 | 0 | UNIT *result; |
38 | 0 | size_t length; |
39 | 0 | size_t allocated; |
40 | | |
41 | | /* Initialize the accumulator. |
| | The caller's buffer is used directly only when no normalization is |
| | requested. With nf != NULL, resultbuf is handed to U_NORMALIZE at |
| | the end instead, so the intermediate result starts out empty and |
| | lives on the heap. */ |
42 | 0 | if (nf != NULL || resultbuf == NULL) |
43 | 0 | { |
44 | 0 | result = NULL; |
45 | 0 | allocated = 0; |
46 | 0 | } |
47 | 0 | else |
48 | 0 | { |
49 | 0 | result = resultbuf; |
50 | 0 | allocated = *lengthp; |
51 | 0 | } |
52 | 0 | length = 0; |
53 | |
|
54 | 0 | { |
55 | 0 | const UNIT *s_end = s + n; |
56 | | |
57 | | /* Helper for evaluating the FINAL_SIGMA condition: |
58 | | Last character that was not case-ignorable. |
| | Seeded from prefix_context and updated after every character. */ |
59 | 0 | ucs4_t last_char_except_ignorable = |
60 | 0 | prefix_context.last_char_except_ignorable; |
61 | | |
62 | | /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions: |
63 | | Last character that was of combining class 230 ("Above") or 0. |
| | Seeded from prefix_context and updated after every character. */ |
64 | 0 | ucs4_t last_char_normal_or_above = |
65 | 0 | prefix_context.last_char_normal_or_above; |
66 | |
|
67 | 0 | while (s < s_end) |
68 | 0 | { |
69 | 0 | ucs4_t uc; |
70 | 0 | int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); |
71 | |
|
72 | 0 | ucs4_t mapped_uc[3]; |
73 | 0 | unsigned int mapped_count; |
74 | |
|
75 | 0 | if (uc < 0x10000) |
76 | 0 | { |
77 | | /* Look first in the special-casing table. |
| | The lookup key is 3 bytes: the high byte and the low byte of uc, |
| | followed by a counter (code[2]) that is incremented in order to |
| | visit further rules for the same uc. */ |
78 | 0 | char code[3]; |
79 | |
|
80 | 0 | code[0] = (uc >> 8) & 0xff; |
81 | 0 | code[1] = uc & 0xff; |
82 | |
|
83 | 0 | for (code[2] = 0; ; code[2]++) |
84 | 0 | { |
85 | 0 | const struct special_casing_rule *rule = |
86 | 0 | gl_unicase_special_lookup (code, 3); |
87 | |
|
88 | 0 | if (rule == NULL) |
89 | 0 | break; |
90 | | |
91 | | /* Test if the condition applies. */ |
92 | | /* Does the language apply? |
| | A rule with an empty language applies to every language; |
| | otherwise the first two characters of iso639_language |
| | must equal those of rule->language. */ |
93 | 0 | if (rule->language[0] == '\0' |
94 | 0 | || (iso639_language != NULL |
95 | 0 | && iso639_language[0] == rule->language[0] |
96 | 0 | && iso639_language[1] == rule->language[1])) |
97 | 0 | { |
98 | | /* Does the context apply? |
| | A negative rule->context stands for the negation of the |
| | condition -rule->context: the condition is evaluated on |
| | the absolute value and inverted after the switch. */ |
99 | 0 | int context = rule->context; |
100 | 0 | bool applies; |
101 | |
|
102 | 0 | if (context < 0) |
103 | 0 | context = - context; |
104 | 0 | switch (context) |
105 | 0 | { |
106 | 0 | case SCC_ALWAYS: |
107 | 0 | applies = true; |
108 | 0 | break; |
109 | | |
110 | 0 | case SCC_FINAL_SIGMA: |
111 | | /* "Before" condition: preceded by a sequence |
112 | | consisting of a cased letter and a case-ignorable |
113 | | sequence. |
114 | | "After" condition: not followed by a sequence |
115 | | consisting of a case-ignorable sequence and then a |
116 | | cased letter. */ |
117 | | /* Test the "before" condition. */ |
118 | 0 | applies = uc_is_cased (last_char_except_ignorable); |
119 | | /* Test the "after" condition, by scanning forward over |
| | case-ignorable characters. If the end of s is |
| | reached, suffix_context decides. */ |
120 | 0 | if (applies) |
121 | 0 | { |
122 | 0 | const UNIT *s2 = s + count; |
123 | 0 | for (;;) |
124 | 0 | { |
125 | 0 | if (s2 < s_end) |
126 | 0 | { |
127 | 0 | ucs4_t uc2; |
128 | 0 | int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); |
129 | | /* Our uc_is_case_ignorable function is |
130 | | known to return false for all cased |
131 | | characters. So we can call |
132 | | uc_is_case_ignorable first. */ |
133 | 0 | if (!uc_is_case_ignorable (uc2)) |
134 | 0 | { |
135 | 0 | applies = ! uc_is_cased (uc2); |
136 | 0 | break; |
137 | 0 | } |
138 | 0 | s2 += count2; |
139 | 0 | } |
140 | 0 | else |
141 | 0 | { |
142 | 0 | applies = ! uc_is_cased (suffix_context.first_char_except_ignorable); |
143 | 0 | break; |
144 | 0 | } |
145 | 0 | } |
146 | 0 | } |
147 | 0 | break; |
148 | | |
149 | 0 | case SCC_AFTER_SOFT_DOTTED: |
150 | | /* "Before" condition: There is a Soft_Dotted character |
151 | | before it, with no intervening character of |
152 | | combining class 0 or 230 (Above). */ |
153 | | /* Test the "before" condition. */ |
154 | 0 | applies = uc_is_property_soft_dotted (last_char_normal_or_above); |
155 | 0 | break; |
156 | | |
157 | 0 | case SCC_MORE_ABOVE: |
158 | | /* "After" condition: followed by a character of |
159 | | combining class 230 (Above) with no intervening |
160 | | character of combining class 0 or 230 (Above). */ |
161 | | /* Test the "after" condition. The scan stops at the |
| | first character of class UC_CCC_A (condition holds) |
| | or UC_CCC_NR (condition fails); at the end of s the |
| | SCC_MORE_ABOVE_MASK bit of suffix_context decides. */ |
162 | 0 | { |
163 | 0 | const UNIT *s2 = s + count; |
164 | 0 | applies = false; |
165 | 0 | for (;;) |
166 | 0 | { |
167 | 0 | if (s2 < s_end) |
168 | 0 | { |
169 | 0 | ucs4_t uc2; |
170 | 0 | int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); |
171 | 0 | int ccc = uc_combining_class (uc2); |
172 | 0 | if (ccc == UC_CCC_A) |
173 | 0 | { |
174 | 0 | applies = true; |
175 | 0 | break; |
176 | 0 | } |
177 | 0 | if (ccc == UC_CCC_NR) |
178 | 0 | break; |
179 | 0 | s2 += count2; |
180 | 0 | } |
181 | 0 | else |
182 | 0 | { |
183 | 0 | applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0); |
184 | 0 | break; |
185 | 0 | } |
186 | 0 | } |
187 | 0 | } |
188 | 0 | break; |
189 | | |
190 | 0 | case SCC_BEFORE_DOT: |
191 | | /* "After" condition: followed by COMBINING DOT ABOVE |
192 | | (U+0307). Any sequence of characters with a |
193 | | combining class that is neither 0 nor 230 may |
194 | | intervene between the current character and the |
195 | | combining dot above. */ |
196 | | /* Test the "after" condition. U+0307 is checked before |
| | the combining class, so that it is recognized before |
| | the class test could end the scan; at the end of s |
| | the SCC_BEFORE_DOT_MASK bit of suffix_context |
| | decides. */ |
197 | 0 | { |
198 | 0 | const UNIT *s2 = s + count; |
199 | 0 | applies = false; |
200 | 0 | for (;;) |
201 | 0 | { |
202 | 0 | if (s2 < s_end) |
203 | 0 | { |
204 | 0 | ucs4_t uc2; |
205 | 0 | int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); |
206 | 0 | if (uc2 == 0x0307) /* COMBINING DOT ABOVE */ |
207 | 0 | { |
208 | 0 | applies = true; |
209 | 0 | break; |
210 | 0 | } |
211 | 0 | { |
212 | 0 | int ccc = uc_combining_class (uc2); |
213 | 0 | if (ccc == UC_CCC_A || ccc == UC_CCC_NR) |
214 | 0 | break; |
215 | 0 | } |
216 | 0 | s2 += count2; |
217 | 0 | } |
218 | 0 | else |
219 | 0 | { |
220 | 0 | applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0); |
221 | 0 | break; |
222 | 0 | } |
223 | 0 | } |
224 | 0 | } |
225 | 0 | break; |
226 | | |
227 | 0 | case SCC_AFTER_I: |
228 | | /* "Before" condition: There is an uppercase I before |
229 | | it, and there is no intervening character of |
230 | | combining class 0 or 230 (Above). */ |
231 | | /* Test the "before" condition. */ |
232 | 0 | applies = (last_char_normal_or_above == 'I'); |
233 | 0 | break; |
234 | | |
235 | 0 | default: |
236 | 0 | abort (); |
237 | 0 | } |
238 | 0 | if (rule->context < 0) |
239 | 0 | applies = !applies; |
240 | |
|
241 | 0 | if (applies) |
242 | 0 | { |
243 | | /* The rule applies. |
244 | | Look up the mapping (0 to 3 characters). |
| | The mapping is an array of 'unsigned short' located |
| | offset_in_rule bytes into the rule; a 0 element ends |
| | it when it has fewer than 3 characters. */ |
245 | 0 | const unsigned short *mapped_in_rule = |
246 | 0 | (const unsigned short *)((const char *)rule + offset_in_rule); |
247 | |
|
248 | 0 | if (mapped_in_rule[0] == 0) |
249 | 0 | mapped_count = 0; |
250 | 0 | else |
251 | 0 | { |
252 | 0 | mapped_uc[0] = mapped_in_rule[0]; |
253 | 0 | if (mapped_in_rule[1] == 0) |
254 | 0 | mapped_count = 1; |
255 | 0 | else |
256 | 0 | { |
257 | 0 | mapped_uc[1] = mapped_in_rule[1]; |
258 | 0 | if (mapped_in_rule[2] == 0) |
259 | 0 | mapped_count = 2; |
260 | 0 | else |
261 | 0 | { |
262 | 0 | mapped_uc[2] = mapped_in_rule[2]; |
263 | 0 | mapped_count = 3; |
264 | 0 | } |
265 | 0 | } |
266 | 0 | } |
267 | 0 | goto found_mapping; |
268 | 0 | } |
269 | 0 | } |
270 | | |
271 | | /* Optimization: Save a hash table lookup in the next round. */ |
272 | 0 | if (!rule->has_next) |
273 | 0 | break; |
274 | 0 | } |
275 | 0 | } |
276 | | |
277 | | /* No special-cased mapping. So use the locale and context independent |
278 | | mapping. This is also the only path for uc >= 0x10000, which are |
| | never looked up in the special-casing table. */ |
279 | 0 | mapped_uc[0] = single_character_map (uc); |
280 | 0 | mapped_count = 1; |
281 | |
|
282 | 0 | found_mapping: |
283 | | /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */ |
284 | 0 | { |
285 | 0 | unsigned int i; |
286 | |
|
287 | 0 | for (i = 0; i < mapped_count; i++) |
288 | 0 | { |
289 | 0 | ucs4_t muc = mapped_uc[i]; |
290 | | |
291 | | /* Append muc to the result accumulator. |
| | First try to store it in the room that is left. A return |
| | value of -1 from U_UCTOMB is reported as EINVAL; any other |
| | negative value falls through to the code below, which |
| | enlarges the buffer to twice its size (at least 64 units) |
| | and stores muc again. A caller-supplied resultbuf is never |
| | passed to realloc: its contents are copied into freshly |
| | malloc'ed memory instead. */ |
292 | 0 | if (length < allocated) |
293 | 0 | { |
294 | 0 | int ret = U_UCTOMB (result + length, muc, allocated - length); |
295 | 0 | if (ret == -1) |
296 | 0 | { |
297 | 0 | errno = EINVAL; |
298 | 0 | goto fail; |
299 | 0 | } |
300 | 0 | if (ret >= 0) |
301 | 0 | { |
302 | 0 | length += ret; |
303 | 0 | goto done_appending; |
304 | 0 | } |
305 | 0 | } |
306 | 0 | { |
307 | 0 | size_t old_allocated = allocated; |
308 | 0 | size_t new_allocated = 2 * old_allocated; |
309 | 0 | if (new_allocated < 64) |
310 | 0 | new_allocated = 64; |
311 | 0 | if (new_allocated < old_allocated) /* integer overflow in the doubling? */ |
312 | 0 | abort (); |
313 | 0 | { |
314 | 0 | UNIT *larger_result; |
315 | 0 | if (result == NULL) |
316 | 0 | { |
317 | 0 | larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); |
318 | 0 | if (larger_result == NULL) |
319 | 0 | { |
320 | 0 | errno = ENOMEM; |
321 | 0 | goto fail; |
322 | 0 | } |
323 | 0 | } |
324 | 0 | else if (result == resultbuf) |
325 | 0 | { |
326 | 0 | larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); |
327 | 0 | if (larger_result == NULL) |
328 | 0 | { |
329 | 0 | errno = ENOMEM; |
330 | 0 | goto fail; |
331 | 0 | } |
332 | 0 | U_CPY (larger_result, resultbuf, length); |
333 | 0 | } |
334 | 0 | else |
335 | 0 | { |
336 | 0 | larger_result = |
337 | 0 | (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); |
338 | 0 | if (larger_result == NULL) |
339 | 0 | { |
340 | 0 | errno = ENOMEM; |
341 | 0 | goto fail; |
342 | 0 | } |
343 | 0 | } |
344 | 0 | result = larger_result; |
345 | 0 | allocated = new_allocated; |
346 | 0 | { |
347 | 0 | int ret = U_UCTOMB (result + length, muc, allocated - length); |
348 | 0 | if (ret == -1) |
349 | 0 | { |
350 | 0 | errno = EINVAL; |
351 | 0 | goto fail; |
352 | 0 | } |
353 | 0 | if (ret < 0) |
354 | 0 | abort (); |
355 | 0 | length += ret; |
356 | 0 | goto done_appending; |
357 | 0 | } |
358 | 0 | } |
359 | 0 | } |
360 | 0 | done_appending: ; |
361 | 0 | } |
362 | 0 | } |
363 | | |
364 | 0 | if (!uc_is_case_ignorable (uc)) |
365 | 0 | last_char_except_ignorable = uc; |
366 | |
|
367 | 0 | { |
368 | 0 | int ccc = uc_combining_class (uc); |
369 | 0 | if (ccc == UC_CCC_A || ccc == UC_CCC_NR) |
370 | 0 | last_char_normal_or_above = uc; |
371 | 0 | } |
372 | |
|
373 | 0 | s += count; |
374 | 0 | } |
375 | 0 | } |
376 | | |
377 | 0 | if (nf != NULL) |
378 | 0 | { |
379 | | /* Finally, normalize the result. |
| | U_NORMALIZE is given the caller's resultbuf and lengthp; the |
| | intermediate accumulator is freed once it has succeeded. */ |
380 | 0 | UNIT *normalized_result; |
381 | |
|
382 | 0 | normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp); |
383 | 0 | if (normalized_result == NULL) |
384 | 0 | goto fail; |
385 | | |
386 | 0 | free (result); |
387 | 0 | return normalized_result; |
388 | 0 | } |
389 | | |
390 | 0 | if (length == 0) |
391 | 0 | { |
392 | 0 | if (result == NULL) |
393 | 0 | { |
394 | | /* Return a non-NULL value. NULL means error. |
| | So an empty result without a caller buffer still gets a |
| | (1 byte) allocation. */ |
395 | 0 | result = (UNIT *) malloc (1); |
396 | 0 | if (result == NULL) |
397 | 0 | { |
398 | 0 | errno = ENOMEM; |
399 | 0 | goto fail; |
400 | 0 | } |
401 | 0 | } |
402 | 0 | } |
403 | 0 | else if (result != resultbuf && length < allocated) |
404 | 0 | { |
405 | | /* Shrink the allocated memory if possible. |
| | If realloc fails, the larger block is kept and returned. */ |
406 | 0 | UNIT *memory; |
407 | |
|
408 | 0 | memory = (UNIT *) realloc (result, length * sizeof (UNIT)); |
409 | 0 | if (memory != NULL) |
410 | 0 | result = memory; |
411 | 0 | } |
412 | | |
413 | 0 | *lengthp = length; |
414 | 0 | return result; |
415 | | |
416 | 0 | fail: |
417 | 0 | if (result != resultbuf) |
418 | 0 | { |
419 | 0 | int saved_errno = errno; |
420 | 0 | free (result); |
421 | 0 | errno = saved_errno; |
422 | 0 | } |
423 | 0 | return NULL; |
424 | 0 | } |
424 | 0 | } |