/src/libunistring/lib/unicase/u-casemap.h
Line | Count | Source |
1 | | /* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent). |
2 | | Copyright (C) 2009-2026 Free Software Foundation, Inc. |
3 | | Written by Bruno Haible <bruno@clisp.org>, 2009. |
4 | | |
5 | | This file is free software. |
6 | | It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". |
7 | | You can redistribute it and/or modify it under either |
8 | | - the terms of the GNU Lesser General Public License as published |
9 | | by the Free Software Foundation, either version 3, or (at your |
10 | | option) any later version, or |
11 | | - the terms of the GNU General Public License as published by the |
12 | | Free Software Foundation; either version 2, or (at your option) |
13 | | any later version, or |
14 | | - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". |
15 | | |
16 | | This file is distributed in the hope that it will be useful, |
17 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
19 | | Lesser General Public License and the GNU General Public License |
20 | | for more details. |
21 | | |
22 | | You should have received a copy of the GNU Lesser General Public |
23 | | License and of the GNU General Public License along with this |
24 | | program. If not, see <https://www.gnu.org/licenses/>. */ |
25 | | /* FUNC: case-map the n units starting at s and return the resulting string. |
 | | UNIT, FUNC and the U_* helpers are not defined in this file; presumably |
 | | the including file defines them once per encoding -- TODO confirm. |
 | | |
 | | For every character of s, the special-casing rules are consulted first |
 | | (only for code points below 0x10000); when no rule applies, the character |
 | | is mapped through single_character_map. If nf is non-NULL, the mapped |
 | | string is finally passed through U_NORMALIZE. |
 | | |
 | | prefix_context: seeds the two look-behind helpers (last character that |
 | | was not case-ignorable; last character of combining class 0 or 230). |
 | | suffix_context: consulted when a look-ahead scan reaches the end of s. |
 | | iso639_language: may be NULL; only its first two chars are compared |
 | | against a rule's language. |
 | | offset_in_rule: byte offset, inside a rule, of the mapping to use. |
 | | resultbuf, lengthp: with nf == NULL and resultbuf != NULL, resultbuf |
 | | (capacity *lengthp units) is the initial output buffer; with nf != NULL |
 | | both are handed to U_NORMALIZE instead. |
 | | |
 | | With nf == NULL, returns the accumulated string (resultbuf itself or |
 | | memory obtained from malloc/realloc) and stores its length in *lengthp. |
 | | With nf != NULL, returns whatever U_NORMALIZE returns. |
 | | Returns NULL on failure: errno is set to EINVAL when U_UCTOMB returns -1 |
 | | for a mapped character and to ENOMEM when an allocation fails; a NULL |
 | | return from U_NORMALIZE is propagated as NULL too. */ |
26 | | UNIT * |
27 | | FUNC (const UNIT *s, size_t n, |
28 | | casing_prefix_context_t prefix_context, |
29 | | casing_suffix_context_t suffix_context, |
30 | | const char *iso639_language, |
31 | | ucs4_t (*single_character_map) (ucs4_t), |
32 | | size_t offset_in_rule, /* offset in 'struct special_casing_rule' */ |
33 | | uninorm_t nf, |
34 | | UNIT *resultbuf, size_t *lengthp) |
35 | 2.45M | { |
36 | | /* The result being accumulated. resultbuf serves as the initial |
 | | accumulator only when no normalization is requested and the caller |
 | | passed a buffer; otherwise the accumulator starts empty and is |
 | | allocated on the first append. */ |
37 | 2.45M | UNIT *result; |
38 | 2.45M | size_t allocated; |
39 | 2.45M | if (nf != NULL || resultbuf == NULL) |
40 | 2.45M | { |
41 | 2.45M | result = NULL; |
42 | 2.45M | allocated = 0; |
43 | 2.45M | } |
44 | 0 | else |
45 | 0 | { |
46 | 0 | result = resultbuf; |
47 | 0 | allocated = *lengthp; |
48 | 0 | } |
49 | 2.45M | size_t length = 0; |
50 | | |
51 | 2.45M | { |
52 | 2.45M | const UNIT *s_end = s + n; |
53 | | |
54 | | /* Helper for evaluating the FINAL_SIGMA condition: |
55 | | Last character that was not case-ignorable. */ |
56 | 2.45M | ucs4_t last_char_except_ignorable = |
57 | 2.45M | prefix_context.last_char_except_ignorable; |
58 | | |
59 | | /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions: |
60 | | Last character that was of combining class 230 ("Above") or 0. */ |
61 | 2.45M | ucs4_t last_char_normal_or_above = |
62 | 2.45M | prefix_context.last_char_normal_or_above; |
63 | | |
64 | 13.4M | while (s < s_end) |
65 | 11.0M | { |
66 | 11.0M | ucs4_t uc; |
67 | 11.0M | int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); |
68 | | /* count is the number of units by which s advances past uc at the end of this iteration. */ |
69 | 11.0M | ucs4_t mapped_uc[3]; |
70 | 11.0M | unsigned int mapped_count; |
71 | | /* Only code points below 0x10000 are looked up in the special-casing table; the key built below carries two code point bytes. */ |
72 | 11.0M | if (uc < 0x10000) |
73 | 10.9M | { |
74 | | /* Look first in the special-casing table. */ |
75 | 10.9M | char code[3]; |
76 | | |
77 | 10.9M | code[0] = (uc >> 8) & 0xff; |
78 | 10.9M | code[1] = uc & 0xff; |
79 | | /* code[2] numbers the successive lookups for this uc: 0, 1, 2, ... */ |
80 | 10.9M | for (code[2] = 0; ; code[2]++) |
81 | 11.0M | { |
82 | 11.0M | const struct special_casing_rule *rule = |
83 | 11.0M | gl_unicase_special_lookup (code, 3); |
84 | | |
85 | 11.0M | if (rule == NULL) |
86 | 10.0M | break; |
87 | | |
88 | | /* Test if the condition applies. */ |
89 | | /* Does the language apply? */ |
90 | 1.01M | if (rule->language[0] == '\0' |
91 | 975k | || (iso639_language != NULL |
92 | 0 | && iso639_language[0] == rule->language[0] |
93 | 0 | && iso639_language[1] == rule->language[1])) |
94 | 35.6k | { |
95 | | /* Does the context apply? */ |
96 | 35.6k | int context = rule->context; |
97 | 35.6k | if (context < 0) |
98 | 0 | context = - context; |
99 | | /* A negative rule->context stands for the negated condition: its absolute value is evaluated here and the outcome is inverted after the switch. */ |
100 | 35.6k | bool applies; |
101 | 35.6k | switch (context) |
102 | 35.6k | { |
103 | 7.29k | case SCC_ALWAYS: |
104 | 7.29k | applies = true; |
105 | 7.29k | break; |
106 | | |
107 | 28.3k | case SCC_FINAL_SIGMA: |
108 | | /* "Before" condition: preceded by a sequence |
109 | | consisting of a cased letter and a case-ignorable |
110 | | sequence. |
111 | | "After" condition: not followed by a sequence |
112 | | consisting of a case-ignorable sequence and then a |
113 | | cased letter. */ |
114 | | /* Test the "before" condition. */ |
115 | 28.3k | applies = uc_is_cased (last_char_except_ignorable); |
116 | | /* Test the "after" condition. */ |
117 | 28.3k | if (applies) |
118 | 5.74k | { |
119 | 5.74k | const UNIT *s2 = s + count; |
120 | 5.74k | for (;;) |
121 | 9.00k | { |
122 | 9.00k | if (s2 < s_end) |
123 | 9.00k | { |
124 | 9.00k | ucs4_t uc2; |
125 | 9.00k | int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); |
126 | | /* Our uc_is_case_ignorable function is |
127 | | known to return false for all cased |
128 | | characters. So we can call |
129 | | uc_is_case_ignorable first. */ |
130 | 9.00k | if (!uc_is_case_ignorable (uc2)) |
131 | 5.74k | { |
132 | 5.74k | applies = ! uc_is_cased (uc2); |
133 | 5.74k | break; |
134 | 5.74k | } |
135 | 3.26k | s2 += count2; |
136 | 3.26k | } |
137 | 0 | else |
138 | 0 | { |
139 | 0 | applies = ! uc_is_cased (suffix_context.first_char_except_ignorable); |
140 | 0 | break; |
141 | 0 | } |
142 | 9.00k | } |
143 | 5.74k | } |
144 | 28.3k | break; |
145 | | |
146 | 3.26k | case SCC_AFTER_SOFT_DOTTED: |
147 | | /* "Before" condition: There is a Soft_Dotted character |
148 | | before it, with no intervening character of |
149 | | combining class 0 or 230 (Above). */ |
150 | | /* Test the "before" condition. */ |
151 | 0 | applies = uc_is_property_soft_dotted (last_char_normal_or_above); |
152 | 0 | break; |
153 | | |
154 | 0 | case SCC_MORE_ABOVE: |
155 | | /* "After" condition: followed by a character of |
156 | | combining class 230 (Above) with no intervening |
157 | | character of combining class 0 or 230 (Above). */ |
158 | | /* Test the "after" condition. */ |
159 | 0 | { |
160 | 0 | const UNIT *s2 = s + count; |
161 | 0 | applies = false; |
162 | 0 | for (;;) |
163 | 0 | { |
164 | 0 | if (s2 < s_end) |
165 | 0 | { |
166 | 0 | ucs4_t uc2; |
167 | 0 | int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); |
168 | 0 | int ccc = uc_combining_class (uc2); |
169 | 0 | if (ccc == UC_CCC_A) |
170 | 0 | { |
171 | 0 | applies = true; |
172 | 0 | break; |
173 | 0 | } |
174 | 0 | if (ccc == UC_CCC_NR) |
175 | 0 | break; |
176 | 0 | s2 += count2; |
177 | 0 | } |
178 | 0 | else |
179 | 0 | { |
180 | 0 | applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0); |
181 | 0 | break; |
182 | 0 | } |
183 | 0 | } |
184 | 0 | } |
185 | 0 | break; |
186 | | |
187 | 0 | case SCC_BEFORE_DOT: |
188 | | /* "After" condition: followed by COMBINING DOT ABOVE |
189 | | (U+0307). Any sequence of characters with a |
190 | | combining class that is neither 0 nor 230 may |
191 | | intervene between the current character and the |
192 | | combining dot above. */ |
193 | | /* Test the "after" condition. */ |
194 | 0 | { |
195 | 0 | const UNIT *s2 = s + count; |
196 | 0 | applies = false; |
197 | 0 | for (;;) |
198 | 0 | { |
199 | 0 | if (s2 < s_end) |
200 | 0 | { |
201 | 0 | ucs4_t uc2; |
202 | 0 | int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); |
203 | 0 | if (uc2 == 0x0307) /* COMBINING DOT ABOVE */ |
204 | 0 | { |
205 | 0 | applies = true; |
206 | 0 | break; |
207 | 0 | } |
208 | 0 | { |
209 | 0 | int ccc = uc_combining_class (uc2); |
210 | 0 | if (ccc == UC_CCC_A || ccc == UC_CCC_NR) |
211 | 0 | break; |
212 | 0 | } |
213 | 0 | s2 += count2; |
214 | 0 | } |
215 | 0 | else |
216 | 0 | { |
217 | 0 | applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0); |
218 | 0 | break; |
219 | 0 | } |
220 | 0 | } |
221 | 0 | } |
222 | 0 | break; |
223 | | |
224 | 0 | case SCC_AFTER_I: |
225 | | /* "Before" condition: There is an uppercase I before |
226 | | it, and there is no intervening character of |
227 | | combining class 0 or 230 (Above). */ |
228 | | /* Test the "before" condition. */ |
229 | 0 | applies = (last_char_normal_or_above == 'I'); |
230 | 0 | break; |
231 | | |
232 | 0 | default: |
233 | 0 | abort (); |
234 | 35.6k | } |
235 | 35.6k | if (rule->context < 0) |
236 | 0 | applies = !applies; |
237 | | |
238 | 35.6k | if (applies) |
239 | 11.0k | { |
240 | | /* The rule applies. |
241 | | Look up the mapping (0 to 3 characters). |
 | | The mapping is read as an array of unsigned short located |
 | | offset_in_rule bytes into the rule; a 0 element ends it |
 | | before the third entry. */ |
242 | 11.0k | const unsigned short *mapped_in_rule = |
243 | 11.0k | (const unsigned short *)((const char *)rule + offset_in_rule); |
244 | | |
245 | 11.0k | if (mapped_in_rule[0] == 0) |
246 | 0 | mapped_count = 0; |
247 | 11.0k | else |
248 | 11.0k | { |
249 | 11.0k | mapped_uc[0] = mapped_in_rule[0]; |
250 | 11.0k | if (mapped_in_rule[1] == 0) |
251 | 9.59k | mapped_count = 1; |
252 | 1.41k | else |
253 | 1.41k | { |
254 | 1.41k | mapped_uc[1] = mapped_in_rule[1]; |
255 | 1.41k | if (mapped_in_rule[2] == 0) |
256 | 1.41k | mapped_count = 2; |
257 | 0 | else |
258 | 0 | { |
259 | 0 | mapped_uc[2] = mapped_in_rule[2]; |
260 | 0 | mapped_count = 3; |
261 | 0 | } |
262 | 1.41k | } |
263 | 11.0k | } |
264 | 11.0k | goto found_mapping; |
265 | 11.0k | } |
266 | 35.6k | } |
267 | | |
268 | | /* Optimization: Save a hash table lookup in the next round. */ |
269 | 999k | if (!rule->has_next) |
270 | 973k | break; |
271 | 999k | } |
272 | 10.9M | } |
273 | | |
274 | | /* No special-cased mapping. So use the locale and context independent |
275 | | mapping. This point is reached for uc >= 0x10000, when the lookup |
 | | returned NULL, and when the last looked-up rule (has_next false) |
 | | did not apply. */ |
276 | 10.9M | mapped_uc[0] = single_character_map (uc); |
277 | 10.9M | mapped_count = 1; |
278 | | |
279 | 11.0M | found_mapping: |
280 | | /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */ |
281 | 22.0M | for (unsigned int i = 0; i < mapped_count; i++) |
282 | 11.0M | { |
283 | 11.0M | ucs4_t muc = mapped_uc[i]; |
284 | | |
285 | | /* Append muc to the result accumulator. |
 | | First try the space that is already available. A return of -1 |
 | | from U_UCTOMB is reported as EINVAL; any other negative return |
 | | falls through to the growth path below (presumably it means |
 | | "buffer too small" -- confirm against U_UCTOMB's contract). */ |
286 | 11.0M | if (length < allocated) |
287 | 8.55M | { |
288 | 8.55M | int ret = U_UCTOMB (result + length, muc, allocated - length); |
289 | 8.55M | if (ret == -1) |
290 | 0 | { |
291 | 0 | errno = EINVAL; |
292 | 0 | goto fail; |
293 | 0 | } |
294 | 8.55M | if (ret >= 0) |
295 | 8.54M | { |
296 | 8.54M | length += ret; |
297 | 8.54M | goto done_appending; |
298 | 8.54M | } |
299 | 8.55M | } |
300 | 2.45M | { /* Grow the accumulator: double its capacity, with a minimum of 64 units. */ |
301 | 2.45M | size_t old_allocated = allocated; |
302 | 2.45M | size_t new_allocated = 2 * old_allocated; |
303 | 2.45M | if (new_allocated < 64) |
304 | 2.45M | new_allocated = 64; |
305 | 2.45M | if (new_allocated < old_allocated) /* integer overflow? */ |
306 | 0 | abort (); |
307 | 2.45M | { |
308 | 2.45M | UNIT *larger_result; |
309 | 2.45M | if (result == NULL) |
310 | 2.45M | { |
311 | 2.45M | larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); |
312 | 2.45M | if (larger_result == NULL) |
313 | 0 | { |
314 | 0 | errno = ENOMEM; |
315 | 0 | goto fail; |
316 | 0 | } |
317 | 2.45M | } |
318 | 7.72k | else if (result == resultbuf) /* Caller's buffer: not realloc'ed; allocate fresh storage and copy the length units accumulated so far. */ |
319 | 0 | { |
320 | 0 | larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); |
321 | 0 | if (larger_result == NULL) |
322 | 0 | { |
323 | 0 | errno = ENOMEM; |
324 | 0 | goto fail; |
325 | 0 | } |
326 | 0 | U_CPY (larger_result, resultbuf, length); |
327 | 0 | } |
328 | 7.72k | else |
329 | 7.72k | { |
330 | 7.72k | larger_result = |
331 | 7.72k | (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); |
332 | 7.72k | if (larger_result == NULL) |
333 | 0 | { |
334 | 0 | errno = ENOMEM; |
335 | 0 | goto fail; |
336 | 0 | } |
337 | 7.72k | } |
338 | 2.45M | result = larger_result; |
339 | 2.45M | allocated = new_allocated; |
340 | 2.45M | { |
341 | 2.45M | int ret = U_UCTOMB (result + length, muc, allocated - length); |
342 | 2.45M | if (ret == -1) |
343 | 0 | { |
344 | 0 | errno = EINVAL; |
345 | 0 | goto fail; |
346 | 0 | } |
347 | 2.45M | if (ret < 0) |
348 | 0 | abort (); /* Any other negative return after growing is treated as an internal error. */ |
349 | 2.45M | length += ret; |
350 | 2.45M | goto done_appending; |
351 | 2.45M | } |
352 | 2.45M | } |
353 | 2.45M | } |
354 | 11.0M | done_appending: ; |
355 | 11.0M | } |
356 | | /* Update the look-behind helpers from the source character uc, not from the mapped output. */ |
357 | 11.0M | if (!uc_is_case_ignorable (uc)) |
358 | 8.80M | last_char_except_ignorable = uc; |
359 | | |
360 | 11.0M | { |
361 | 11.0M | int ccc = uc_combining_class (uc); |
362 | 11.0M | if (ccc == UC_CCC_A || ccc == UC_CCC_NR) |
363 | 10.9M | last_char_normal_or_above = uc; |
364 | 11.0M | } |
365 | | |
366 | 11.0M | s += count; |
367 | 11.0M | } |
368 | 2.45M | } |
369 | | |
370 | 2.45M | if (nf != NULL) |
371 | 2.45M | { |
372 | | /* Finally, normalize the result. */ |
373 | 2.45M | UNIT *normalized_result = |
374 | 2.45M | U_NORMALIZE (nf, result, length, resultbuf, lengthp); |
375 | 2.45M | if (normalized_result == NULL) |
376 | 0 | goto fail; |
377 | | /* On this path result is never resultbuf (it started as NULL), so it is heap memory or still NULL and can be freed. */ |
378 | 2.45M | free (result); |
379 | 2.45M | return normalized_result; |
380 | 2.45M | } |
381 | | /* No normalization requested: hand back the accumulator itself. */ |
382 | 0 | if (length == 0) |
383 | 0 | { |
384 | 0 | if (result == NULL) |
385 | 0 | { |
386 | | /* Return a non-NULL value. NULL means error. */ |
387 | 0 | result = (UNIT *) malloc (1); |
388 | 0 | if (result == NULL) |
389 | 0 | { |
390 | 0 | errno = ENOMEM; |
391 | 0 | goto fail; |
392 | 0 | } |
393 | 0 | } |
394 | 0 | } |
395 | 0 | else if (result != resultbuf && length < allocated) |
396 | 0 | { |
397 | | /* Shrink the allocated memory if possible. */ |
398 | 0 | UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT)); |
399 | 0 | if (memory != NULL) |
400 | 0 | result = memory; |
401 | 0 | } |
402 | | |
403 | 0 | *lengthp = length; |
404 | 0 | return result; |
405 | | /* Error exit: release the accumulator unless it is the caller's buffer, keeping errno intact across free(). */ |
406 | 0 | fail: |
407 | 0 | if (result != resultbuf) |
408 | 0 | { |
409 | 0 | int saved_errno = errno; |
410 | 0 | free (result); |
411 | 0 | errno = saved_errno; |
412 | 0 | } |
413 | | return NULL; |
414 | 0 | } |