/src/libunistring/lib/unicase/u-casemap.h
| Line | Count | Source (jump to first uncovered line) | 
| 1 |  | /* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent). | 
| 2 |  |    Copyright (C) 2009-2023 Free Software Foundation, Inc. | 
| 3 |  |    Written by Bruno Haible <bruno@clisp.org>, 2009. | 
| 4 |  |  | 
| 5 |  |    This file is free software. | 
| 6 |  |    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". | 
| 7 |  |    You can redistribute it and/or modify it under either | 
| 8 |  |      - the terms of the GNU Lesser General Public License as published | 
| 9 |  |        by the Free Software Foundation, either version 3, or (at your | 
| 10 |  |        option) any later version, or | 
| 11 |  |      - the terms of the GNU General Public License as published by the | 
| 12 |  |        Free Software Foundation; either version 2, or (at your option) | 
| 13 |  |        any later version, or | 
| 14 |  |      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". | 
| 15 |  |  | 
| 16 |  |    This file is distributed in the hope that it will be useful, | 
| 17 |  |    but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 18 |  |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
| 19 |  |    Lesser General Public License and the GNU General Public License | 
| 20 |  |    for more details. | 
| 21 |  |  | 
| 22 |  |    You should have received a copy of the GNU Lesser General Public | 
| 23 |  |    License and of the GNU General Public License along with this | 
| 24 |  |    program.  If not, see <https://www.gnu.org/licenses/>.  */ | 
| 25 |  | /* Case-maps s[0..n-1] and, if nf != NULL, normalizes the result; returns NULL on failure.  */ | 
| 26 |  | UNIT * | 
| 27 |  | FUNC (const UNIT *s, size_t n, /* input string and its length, counted in UNITs */ | 
| 28 |  |       casing_prefix_context_t prefix_context, /* supplies the initial look-behind characters */ | 
| 29 |  |       casing_suffix_context_t suffix_context, /* consulted when a look-ahead reaches the end of s */ | 
| 30 |  |       const char *iso639_language, /* may be NULL; only its first two chars are compared */ | 
| 31 |  |       ucs4_t (*single_character_map) (ucs4_t), /* fallback when no special-casing rule applies */ | 
| 32 |  |       size_t offset_in_rule, /* byte offset of the wanted mapping within 'struct special_casing_rule' */ | 
| 33 |  |       uninorm_t nf, /* normalization to apply at the end, or NULL for none */ | 
| 34 |  |       UNIT *resultbuf, size_t *lengthp) /* optional caller buffer and its size; without nf, *lengthp receives the result length */ | 
| 35 | 3.86k | { | 
| 36 |  |   /* The result being accumulated: 'length' units used out of 'allocated'.  */ | 
| 37 | 3.86k |   UNIT *result; | 
| 38 | 3.86k |   size_t length; | 
| 39 | 3.86k |   size_t allocated; | 
| 40 |  |  | 
| 41 |  |   /* Initialize the accumulator.  The caller's buffer is used directly only when no normalization follows; otherwise it is passed on to U_NORMALIZE.  */ | 
| 42 | 3.86k |   if (nf != NULL || resultbuf == NULL) | 
| 43 | 3.86k |     { | 
| 44 | 3.86k |       result = NULL; | 
| 45 | 3.86k |       allocated = 0; | 
| 46 | 3.86k |     } | 
| 47 | 0 |   else | 
| 48 | 0 |     { | 
| 49 | 0 |       result = resultbuf; | 
| 50 | 0 |       allocated = *lengthp; | 
| 51 | 0 |     } | 
| 52 | 3.86k |   length = 0; | 
| 53 |  |  | 
| 54 | 3.86k |   { | 
| 55 | 3.86k |     const UNIT *s_end = s + n; | 
| 56 |  |  | 
| 57 |  |     /* Helper for evaluating the FINAL_SIGMA condition: | 
| 58 |  |        Last character that was not case-ignorable.  */ | 
| 59 | 3.86k |     ucs4_t last_char_except_ignorable = | 
| 60 | 3.86k |       prefix_context.last_char_except_ignorable; | 
| 61 |  |  | 
| 62 |  |     /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions: | 
| 63 |  |        Last character that was of combining class 230 ("Above") or 0.  */ | 
| 64 | 3.86k |     ucs4_t last_char_normal_or_above = | 
| 65 | 3.86k |       prefix_context.last_char_normal_or_above; | 
| 66 |  |  | 
| 67 | 3.52M |     while (s < s_end) | 
| 68 | 3.52M |       { | 
| 69 | 3.52M |         ucs4_t uc; | 
| 70 | 3.52M |         int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); | 
| 71 |  | /* The mapping of uc is stored in mapped_uc[0..mapped_count-1]; at most 3 characters.  */ | 
| 72 | 3.52M |         ucs4_t mapped_uc[3]; | 
| 73 | 3.52M |         unsigned int mapped_count; | 
| 74 |  |  | 
| 75 | 3.52M |         if (uc < 0x10000) | 
| 76 | 3.51M |           { | 
| 77 |  |             /* Look first in the special-casing table.  */ | 
| 78 | 3.51M |             char code[3]; | 
| 79 |  | /* Lookup key: high byte of uc, low byte of uc, then a counter that walks the candidate rules.  */ | 
| 80 | 3.51M |             code[0] = (uc >> 8) & 0xff; | 
| 81 | 3.51M |             code[1] = uc & 0xff; | 
| 82 |  |  | 
| 83 | 3.51M |             for (code[2] = 0; ; code[2]++) | 
| 84 | 3.51M |               { | 
| 85 | 3.51M |                 const struct special_casing_rule *rule = | 
| 86 | 3.51M |                   gl_unicase_special_lookup (code, 3); | 
| 87 |  |  | 
| 88 | 3.51M |                 if (rule == NULL) | 
| 89 | 2.95M |                   break; | 
| 90 |  |  | 
| 91 |  |                 /* Test if the condition applies.  */ | 
| 92 |  |                 /* Does the language apply?  An empty rule language matches every language.  */ | 
| 93 | 563k |                 if (rule->language[0] == '\0' | 
| 94 | 563k |                     || (iso639_language != NULL | 
| 95 | 466k |                         && iso639_language[0] == rule->language[0] | 
| 96 | 466k |                         && iso639_language[1] == rule->language[1])) | 
| 97 | 96.4k |                   { | 
| 98 |  |                     /* Does the context apply?  */ | 
| 99 | 96.4k |                     int context = rule->context; | 
| 100 | 96.4k |                     bool applies; | 
| 101 |  | /* A negative context denotes the negated condition: evaluate the positive one, then invert below.  */ | 
| 102 | 96.4k |                     if (context < 0) | 
| 103 | 0 |                       context = - context; | 
| 104 | 96.4k |                     switch (context) | 
| 105 | 96.4k |                       { | 
| 106 | 92.8k |                       case SCC_ALWAYS: | 
| 107 | 92.8k |                         applies = true; | 
| 108 | 92.8k |                         break; | 
| 109 |  |  | 
| 110 | 3.58k |                       case SCC_FINAL_SIGMA: | 
| 111 |  |                         /* "Before" condition: preceded by a sequence | 
| 112 |  |                            consisting of a cased letter and a case-ignorable | 
| 113 |  |                            sequence. | 
| 114 |  |                            "After" condition: not followed by a sequence | 
| 115 |  |                            consisting of a case-ignorable sequence and then a | 
| 116 |  |                            cased letter.  */ | 
| 117 |  |                         /* Test the "before" condition.  */ | 
| 118 | 3.58k |                         applies = uc_is_cased (last_char_except_ignorable); | 
| 119 |  |                         /* Test the "after" condition.  */ | 
| 120 | 3.58k |                         if (applies) | 
| 121 | 2.46k |                           { | 
| 122 | 2.46k |                             const UNIT *s2 = s + count; | 
| 123 | 2.46k |                             for (;;) | 
| 124 | 4.66k |                               { | 
| 125 | 4.66k |                                 if (s2 < s_end) | 
| 126 | 4.66k |                                   { | 
| 127 | 4.66k |                                     ucs4_t uc2; | 
| 128 | 4.66k |                                     int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); | 
| 129 |  |                                     /* Our uc_is_case_ignorable function is | 
| 130 |  |                                        known to return false for all cased | 
| 131 |  |                                        characters.  So we can call | 
| 132 |  |                                        uc_is_case_ignorable first.  */ | 
| 133 | 4.66k |                                     if (!uc_is_case_ignorable (uc2)) | 
| 134 | 2.46k |                                       { | 
| 135 | 2.46k |                                         applies = ! uc_is_cased (uc2); | 
| 136 | 2.46k |                                         break; | 
| 137 | 2.46k |                                       } | 
| 138 | 2.20k |                                     s2 += count2; | 
| 139 | 2.20k |                                   } | 
| 140 | 0 |                                 else /* end of s reached: decide from the suffix context */ | 
| 141 | 0 |                                   { | 
| 142 | 0 |                                     applies = ! uc_is_cased (suffix_context.first_char_except_ignorable); | 
| 143 | 0 |                                     break; | 
| 144 | 0 |                                   } | 
| 145 | 4.66k |                               } | 
| 146 | 2.46k |                           } | 
| 147 | 3.58k |                         break; | 
| 148 |  |  | 
| 149 | 2.20k |                       case SCC_AFTER_SOFT_DOTTED: | 
| 150 |  |                         /* "Before" condition: There is a Soft_Dotted character | 
| 151 |  |                            before it, with no intervening character of | 
| 152 |  |                            combining class 0 or 230 (Above).  */ | 
| 153 |  |                         /* Test the "before" condition.  */ | 
| 154 | 0 |                         applies = uc_is_property_soft_dotted (last_char_normal_or_above); | 
| 155 | 0 |                         break; | 
| 156 |  |  | 
| 157 | 0 |                       case SCC_MORE_ABOVE: | 
| 158 |  |                         /* "After" condition: followed by a character of | 
| 159 |  |                            combining class 230 (Above) with no intervening | 
| 160 |  |                            character of combining class 0 or 230 (Above).  */ | 
| 161 |  |                         /* Test the "after" condition.  */ | 
| 162 | 0 |                         { | 
| 163 | 0 |                           const UNIT *s2 = s + count; | 
| 164 | 0 |                           applies = false; | 
| 165 | 0 |                           for (;;) | 
| 166 | 0 |                             { | 
| 167 | 0 |                               if (s2 < s_end) | 
| 168 | 0 |                                 { | 
| 169 | 0 |                                   ucs4_t uc2; | 
| 170 | 0 |                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); | 
| 171 | 0 |                                   int ccc = uc_combining_class (uc2); | 
| 172 | 0 |                                   if (ccc == UC_CCC_A) | 
| 173 | 0 |                                     { | 
| 174 | 0 |                                       applies = true; | 
| 175 | 0 |                                       break; | 
| 176 | 0 |                                     } | 
| 177 | 0 |                                   if (ccc == UC_CCC_NR) | 
| 178 | 0 |                                     break; | 
| 179 | 0 |                                   s2 += count2; | 
| 180 | 0 |                                 } | 
| 181 | 0 |                               else /* end of s reached: decide from the suffix context */ | 
| 182 | 0 |                                 { | 
| 183 | 0 |                                   applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0); | 
| 184 | 0 |                                   break; | 
| 185 | 0 |                                 } | 
| 186 | 0 |                             } | 
| 187 | 0 |                         } | 
| 188 | 0 |                         break; | 
| 189 |  |  | 
| 190 | 0 |                       case SCC_BEFORE_DOT: | 
| 191 |  |                         /* "After" condition: followed by COMBINING DOT ABOVE | 
| 192 |  |                            (U+0307). Any sequence of characters with a | 
| 193 |  |                            combining class that is neither 0 nor 230 may | 
| 194 |  |                            intervene between the current character and the | 
| 195 |  |                            combining dot above.  */ | 
| 196 |  |                         /* Test the "after" condition.  */ | 
| 197 | 0 |                         { | 
| 198 | 0 |                           const UNIT *s2 = s + count; | 
| 199 | 0 |                           applies = false; | 
| 200 | 0 |                           for (;;) | 
| 201 | 0 |                             { | 
| 202 | 0 |                               if (s2 < s_end) | 
| 203 | 0 |                                 { | 
| 204 | 0 |                                   ucs4_t uc2; | 
| 205 | 0 |                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); | 
| 206 | 0 |                                   if (uc2 == 0x0307) /* COMBINING DOT ABOVE */ | 
| 207 | 0 |                                     { | 
| 208 | 0 |                                       applies = true; | 
| 209 | 0 |                                       break; | 
| 210 | 0 |                                     } | 
| 211 | 0 |                                   { | 
| 212 | 0 |                                     int ccc = uc_combining_class (uc2); | 
| 213 | 0 |                                     if (ccc == UC_CCC_A || ccc == UC_CCC_NR) | 
| 214 | 0 |                                       break; | 
| 215 | 0 |                                   } | 
| 216 | 0 |                                   s2 += count2; | 
| 217 | 0 |                                 } | 
| 218 | 0 |                               else /* end of s reached: decide from the suffix context */ | 
| 219 | 0 |                                 { | 
| 220 | 0 |                                   applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0); | 
| 221 | 0 |                                   break; | 
| 222 | 0 |                                 } | 
| 223 | 0 |                             } | 
| 224 | 0 |                         } | 
| 225 | 0 |                         break; | 
| 226 |  |  | 
| 227 | 0 |                       case SCC_AFTER_I: | 
| 228 |  |                         /* "Before" condition: There is an uppercase I before | 
| 229 |  |                            it, and there is no intervening character of | 
| 230 |  |                            combining class 0 or 230 (Above).  */ | 
| 231 |  |                         /* Test the "before" condition.  */ | 
| 232 | 0 |                         applies = (last_char_normal_or_above == 'I'); | 
| 233 | 0 |                         break; | 
| 234 |  |  | 
| 235 | 0 |                       default: | 
| 236 | 0 |                         abort (); | 
| 237 | 96.4k |                       } | 
| 238 | 96.4k |                     if (rule->context < 0) /* negated condition, see above */ | 
| 239 | 0 |                       applies = !applies; | 
| 240 |  |  | 
| 241 | 96.4k |                     if (applies) | 
| 242 | 94.6k |                       { | 
| 243 |  |                         /* The rule applies. | 
| 244 |  |                            Look up the mapping (0 to 3 characters).  */ | 
| 245 | 94.6k |                         const unsigned short *mapped_in_rule = | 
| 246 | 94.6k |                           (const unsigned short *)((const char *)rule + offset_in_rule); | 
| 247 |  | /* A 0 entry ends the mapping early; otherwise all 3 entries are used.  */ | 
| 248 | 94.6k |                         if (mapped_in_rule[0] == 0) | 
| 249 | 0 |                           mapped_count = 0; | 
| 250 | 94.6k |                         else | 
| 251 | 94.6k |                           { | 
| 252 | 94.6k |                             mapped_uc[0] = mapped_in_rule[0]; | 
| 253 | 94.6k |                             if (mapped_in_rule[1] == 0) | 
| 254 | 94.4k |                               mapped_count = 1; | 
| 255 | 222 |                             else | 
| 256 | 222 |                               { | 
| 257 | 222 |                                 mapped_uc[1] = mapped_in_rule[1]; | 
| 258 | 222 |                                 if (mapped_in_rule[2] == 0) | 
| 259 | 222 |                                   mapped_count = 2; | 
| 260 | 0 |                                 else | 
| 261 | 0 |                                   { | 
| 262 | 0 |                                     mapped_uc[2] = mapped_in_rule[2]; | 
| 263 | 0 |                                     mapped_count = 3; | 
| 264 | 0 |                                   } | 
| 265 | 222 |                               } | 
| 266 | 94.6k |                           } | 
| 267 | 94.6k |                         goto found_mapping; | 
| 268 | 94.6k |                       } | 
| 269 | 96.4k |                   } | 
| 270 |  |  | 
| 271 |  |                 /* Optimization: Save a hash table lookup in the next round.  */ | 
| 272 | 468k |                 if (!rule->has_next) | 
| 273 | 460k |                   break; | 
| 274 | 468k |               } | 
| 275 | 3.51M |           } | 
| 276 |  |  | 
| 277 |  |         /* No special-cased mapping.  So use the locale and context independent | 
| 278 |  |            mapping.  */ | 
| 279 | 3.42M |         mapped_uc[0] = single_character_map (uc); | 
| 280 | 3.42M |         mapped_count = 1; | 
| 281 |  |  | 
| 282 | 3.52M |        found_mapping: | 
| 283 |  |         /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */ | 
| 284 | 3.52M |         { | 
| 285 | 3.52M |           unsigned int i; | 
| 286 |  |  | 
| 287 | 7.04M |           for (i = 0; i < mapped_count; i++) | 
| 288 | 3.52M |             { | 
| 289 | 3.52M |               ucs4_t muc = mapped_uc[i]; | 
| 290 |  |  | 
| 291 |  |               /* Append muc to the result accumulator, first trying the room that is already there.  */ | 
| 292 | 3.52M |               if (length < allocated) | 
| 293 | 3.51M |                 { | 
| 294 | 3.51M |                   int ret = U_UCTOMB (result + length, muc, allocated - length); | 
| 295 | 3.51M |                   if (ret == -1) | 
| 296 | 0 |                     { | 
| 297 | 0 |                       errno = EINVAL; | 
| 298 | 0 |                       goto fail; | 
| 299 | 0 |                     } | 
| 300 | 3.51M |                   if (ret >= 0) | 
| 301 | 3.51M |                     { | 
| 302 | 3.51M |                       length += ret; | 
| 303 | 3.51M |                       goto done_appending; | 
| 304 | 3.51M |                     } | 
| 305 | 3.51M |                 } | 
| 306 | 7.42k |               { /* No room left, or the attempt above did not fit: enlarge the buffer and retry.  */ | 
| 307 | 7.42k |                 size_t old_allocated = allocated; | 
| 308 | 7.42k |                 size_t new_allocated = 2 * old_allocated; | 
| 309 | 7.42k |                 if (new_allocated < 64) | 
| 310 | 3.86k |                   new_allocated = 64; | 
| 311 | 7.42k |                 if (new_allocated < old_allocated) /* integer overflow? */ | 
| 312 | 0 |                   abort (); | 
| 313 | 7.42k |                 { | 
| 314 | 7.42k |                   UNIT *larger_result; | 
| 315 | 7.42k |                   if (result == NULL) | 
| 316 | 3.86k |                     { | 
| 317 | 3.86k |                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); | 
| 318 | 3.86k |                       if (larger_result == NULL) | 
| 319 | 0 |                         { | 
| 320 | 0 |                           errno = ENOMEM; | 
| 321 | 0 |                           goto fail; | 
| 322 | 0 |                         } | 
| 323 | 3.86k |                     } | 
| 324 | 3.55k |                   else if (result == resultbuf) /* the caller's buffer is not realloc'ed: allocate and copy */ | 
| 325 | 0 |                     { | 
| 326 | 0 |                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); | 
| 327 | 0 |                       if (larger_result == NULL) | 
| 328 | 0 |                         { | 
| 329 | 0 |                           errno = ENOMEM; | 
| 330 | 0 |                           goto fail; | 
| 331 | 0 |                         } | 
| 332 | 0 |                       U_CPY (larger_result, resultbuf, length); | 
| 333 | 0 |                     } | 
| 334 | 3.55k |                   else | 
| 335 | 3.55k |                     { | 
| 336 | 3.55k |                       larger_result = | 
| 337 | 3.55k |                         (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); | 
| 338 | 3.55k |                       if (larger_result == NULL) | 
| 339 | 0 |                         { | 
| 340 | 0 |                           errno = ENOMEM; | 
| 341 | 0 |                           goto fail; | 
| 342 | 0 |                         } | 
| 343 | 3.55k |                     } | 
| 344 | 7.42k |                   result = larger_result; | 
| 345 | 7.42k |                   allocated = new_allocated; | 
| 346 | 7.42k |                   { | 
| 347 | 7.42k |                     int ret = U_UCTOMB (result + length, muc, allocated - length); | 
| 348 | 7.42k |                     if (ret == -1) | 
| 349 | 0 |                       { | 
| 350 | 0 |                         errno = EINVAL; | 
| 351 | 0 |                         goto fail; | 
| 352 | 0 |                       } | 
| 353 | 7.42k |                     if (ret < 0) /* any other failure after enlarging is treated as a bug */ | 
| 354 | 0 |                       abort (); | 
| 355 | 7.42k |                     length += ret; | 
| 356 | 7.42k |                     goto done_appending; | 
| 357 | 7.42k |                   } | 
| 358 | 7.42k |                 } | 
| 359 | 7.42k |               } | 
| 360 | 3.52M |              done_appending: ; | 
| 361 | 3.52M |             } | 
| 362 | 3.52M |         } | 
| 363 |  | /* Update the look-behind state used by the context conditions of later characters.  */ | 
| 364 | 3.52M |         if (!uc_is_case_ignorable (uc)) | 
| 365 | 2.78M |           last_char_except_ignorable = uc; | 
| 366 |  |  | 
| 367 | 3.52M |         { | 
| 368 | 3.52M |           int ccc = uc_combining_class (uc); | 
| 369 | 3.52M |           if (ccc == UC_CCC_A || ccc == UC_CCC_NR) | 
| 370 | 3.50M |             last_char_normal_or_above = uc; | 
| 371 | 3.52M |         } | 
| 372 |  |  | 
| 373 | 3.52M |         s += count; | 
| 374 | 3.52M |       } | 
| 375 | 3.86k |   } | 
| 376 |  |  | 
| 377 | 3.86k |   if (nf != NULL) | 
| 378 | 3.86k |     { | 
| 379 |  |       /* Finally, normalize the result.  The output goes to resultbuf/lengthp, not to the accumulator.  */ | 
| 380 | 3.86k |       UNIT *normalized_result; | 
| 381 |  |  | 
| 382 | 3.86k |       normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp); | 
| 383 | 3.86k |       if (normalized_result == NULL) | 
| 384 | 0 |         goto fail; | 
| 385 |  |  | 
| 386 | 3.86k |       free (result); /* the accumulator was only an intermediate here */ | 
| 387 | 3.86k |       return normalized_result; | 
| 388 | 3.86k |     } | 
| 389 |  | /* No normalization requested: the accumulator itself is the return value.  */ | 
| 390 | 0 |   if (length == 0) | 
| 391 | 0 |     { | 
| 392 | 0 |       if (result == NULL) | 
| 393 | 0 |         { | 
| 394 |  |           /* Return a non-NULL value.  NULL means error.  */ | 
| 395 | 0 |           result = (UNIT *) malloc (1); | 
| 396 | 0 |           if (result == NULL) | 
| 397 | 0 |             { | 
| 398 | 0 |               errno = ENOMEM; | 
| 399 | 0 |               goto fail; | 
| 400 | 0 |             } | 
| 401 | 0 |         } | 
| 402 | 0 |     } | 
| 403 | 0 |   else if (result != resultbuf && length < allocated) | 
| 404 | 0 |     { | 
| 405 |  |       /* Shrink the allocated memory if possible.  A failed realloc is harmless: the larger block is kept.  */ | 
| 406 | 0 |       UNIT *memory; | 
| 407 |  | 

| 408 | 0 |       memory = (UNIT *) realloc (result, length * sizeof (UNIT)); | 
| 409 | 0 |       if (memory != NULL) | 
| 410 | 0 |         result = memory; | 
| 411 | 0 |     } | 
| 412 |  |  | 
| 413 | 0 |   *lengthp = length; | 
| 414 | 0 |   return result; | 
| 415 |  | /* Error exit: free the accumulator unless it is the caller's buffer, keeping errno intact.  */ | 
| 416 | 0 |  fail: | 
| 417 | 0 |   if (result != resultbuf) | 
| 418 | 0 |     { | 
| 419 | 0 |       int saved_errno = errno; | 
| 420 | 0 |       free (result); | 
| 421 | 0 |       errno = saved_errno; | 
| 422 | 0 |     } | 
| 423 | 0 |   return NULL; | 
| 424 | 0 | } |