Coverage Report

Created: 2025-07-11 06:23

/src/libunistring/lib/unicase/u-casemap.h
Line
Count
Source (jump to first uncovered line)
1
/* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software.
6
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7
   You can redistribute it and/or modify it under either
8
     - the terms of the GNU Lesser General Public License as published
9
       by the Free Software Foundation, either version 3, or (at your
10
       option) any later version, or
11
     - the terms of the GNU General Public License as published by the
12
       Free Software Foundation; either version 2, or (at your option)
13
       any later version, or
14
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16
   This file is distributed in the hope that it will be useful,
17
   but WITHOUT ANY WARRANTY; without even the implied warranty of
18
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19
   Lesser General Public License and the GNU General Public License
20
   for more details.
21
22
   You should have received a copy of the GNU Lesser General Public
23
   License and of the GNU General Public License along with this
24
   program.  If not, see <https://www.gnu.org/licenses/>.  */
25
26
UNIT *
27
FUNC (const UNIT *s, size_t n,
28
      casing_prefix_context_t prefix_context,
29
      casing_suffix_context_t suffix_context,
30
      const char *iso639_language,
31
      ucs4_t (*single_character_map) (ucs4_t),
32
      size_t offset_in_rule, /* offset in 'struct special_casing_rule' */
33
      uninorm_t nf,
34
      UNIT *resultbuf, size_t *lengthp)
35
2.98k
{
36
  /* The result being accumulated.  */
37
2.98k
  UNIT *result;
38
2.98k
  size_t length;
39
2.98k
  size_t allocated;
40
41
  /* Initialize the accumulator.  */
42
2.98k
  if (nf != NULL || resultbuf == NULL)
43
2.98k
    {
44
2.98k
      result = NULL;
45
2.98k
      allocated = 0;
46
2.98k
    }
47
0
  else
48
0
    {
49
0
      result = resultbuf;
50
0
      allocated = *lengthp;
51
0
    }
52
2.98k
  length = 0;
53
54
2.98k
  {
55
2.98k
    const UNIT *s_end = s + n;
56
57
    /* Helper for evaluating the FINAL_SIGMA condition:
58
       Last character that was not case-ignorable.  */
59
2.98k
    ucs4_t last_char_except_ignorable =
60
2.98k
      prefix_context.last_char_except_ignorable;
61
62
    /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
63
       Last character that was of combining class 230 ("Above") or 0.  */
64
2.98k
    ucs4_t last_char_normal_or_above =
65
2.98k
      prefix_context.last_char_normal_or_above;
66
67
2.61M
    while (s < s_end)
68
2.60M
      {
69
2.60M
        ucs4_t uc;
70
2.60M
        int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
71
72
2.60M
        ucs4_t mapped_uc[3];
73
2.60M
        unsigned int mapped_count;
74
75
2.60M
        if (uc < 0x10000)
76
2.60M
          {
77
            /* Look first in the special-casing table.  */
78
2.60M
            char code[3];
79
80
2.60M
            code[0] = (uc >> 8) & 0xff;
81
2.60M
            code[1] = uc & 0xff;
82
83
2.60M
            for (code[2] = 0; ; code[2]++)
84
2.62M
              {
85
2.62M
                const struct special_casing_rule *rule =
86
2.62M
                  gl_unicase_special_lookup (code, 3);
87
88
2.62M
                if (rule == NULL)
89
2.14M
                  break;
90
91
                /* Test if the condition applies.  */
92
                /* Does the language apply?  */
93
479k
                if (rule->language[0] == '\0'
94
479k
                    || (iso639_language != NULL
95
474k
                        && iso639_language[0] == rule->language[0]
96
474k
                        && iso639_language[1] == rule->language[1]))
97
4.38k
                  {
98
                    /* Does the context apply?  */
99
4.38k
                    int context = rule->context;
100
4.38k
                    bool applies;
101
102
4.38k
                    if (context < 0)
103
0
                      context = - context;
104
4.38k
                    switch (context)
105
4.38k
                      {
106
2.72k
                      case SCC_ALWAYS:
107
2.72k
                        applies = true;
108
2.72k
                        break;
109
110
1.65k
                      case SCC_FINAL_SIGMA:
111
                        /* "Before" condition: preceded by a sequence
112
                           consisting of a cased letter and a case-ignorable
113
                           sequence.
114
                           "After" condition: not followed by a sequence
115
                           consisting of a case-ignorable sequence and then a
116
                           cased letter.  */
117
                        /* Test the "before" condition.  */
118
1.65k
                        applies = uc_is_cased (last_char_except_ignorable);
119
                        /* Test the "after" condition.  */
120
1.65k
                        if (applies)
121
1.12k
                          {
122
1.12k
                            const UNIT *s2 = s + count;
123
1.12k
                            for (;;)
124
1.82k
                              {
125
1.82k
                                if (s2 < s_end)
126
1.82k
                                  {
127
1.82k
                                    ucs4_t uc2;
128
1.82k
                                    int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
129
                                    /* Our uc_is_case_ignorable function is
130
                                       known to return false for all cased
131
                                       characters.  So we can call
132
                                       uc_is_case_ignorable first.  */
133
1.82k
                                    if (!uc_is_case_ignorable (uc2))
134
1.12k
                                      {
135
1.12k
                                        applies = ! uc_is_cased (uc2);
136
1.12k
                                        break;
137
1.12k
                                      }
138
700
                                    s2 += count2;
139
700
                                  }
140
0
                                else
141
0
                                  {
142
0
                                    applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
143
0
                                    break;
144
0
                                  }
145
1.82k
                              }
146
1.12k
                          }
147
1.65k
                        break;
148
149
700
                      case SCC_AFTER_SOFT_DOTTED:
150
                        /* "Before" condition: There is a Soft_Dotted character
151
                           before it, with no intervening character of
152
                           combining class 0 or 230 (Above).  */
153
                        /* Test the "before" condition.  */
154
0
                        applies = uc_is_property_soft_dotted (last_char_normal_or_above);
155
0
                        break;
156
157
0
                      case SCC_MORE_ABOVE:
158
                        /* "After" condition: followed by a character of
159
                           combining class 230 (Above) with no intervening
160
                           character of combining class 0 or 230 (Above).  */
161
                        /* Test the "after" condition.  */
162
0
                        {
163
0
                          const UNIT *s2 = s + count;
164
0
                          applies = false;
165
0
                          for (;;)
166
0
                            {
167
0
                              if (s2 < s_end)
168
0
                                {
169
0
                                  ucs4_t uc2;
170
0
                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
171
0
                                  int ccc = uc_combining_class (uc2);
172
0
                                  if (ccc == UC_CCC_A)
173
0
                                    {
174
0
                                      applies = true;
175
0
                                      break;
176
0
                                    }
177
0
                                  if (ccc == UC_CCC_NR)
178
0
                                    break;
179
0
                                  s2 += count2;
180
0
                                }
181
0
                              else
182
0
                                {
183
0
                                  applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
184
0
                                  break;
185
0
                                }
186
0
                            }
187
0
                        }
188
0
                        break;
189
190
0
                      case SCC_BEFORE_DOT:
191
                        /* "After" condition: followed by COMBINING DOT ABOVE
192
                           (U+0307). Any sequence of characters with a
193
                           combining class that is neither 0 nor 230 may
194
                           intervene between the current character and the
195
                           combining dot above.  */
196
                        /* Test the "after" condition.  */
197
0
                        {
198
0
                          const UNIT *s2 = s + count;
199
0
                          applies = false;
200
0
                          for (;;)
201
0
                            {
202
0
                              if (s2 < s_end)
203
0
                                {
204
0
                                  ucs4_t uc2;
205
0
                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
206
0
                                  if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
207
0
                                    {
208
0
                                      applies = true;
209
0
                                      break;
210
0
                                    }
211
0
                                  {
212
0
                                    int ccc = uc_combining_class (uc2);
213
0
                                    if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
214
0
                                      break;
215
0
                                  }
216
0
                                  s2 += count2;
217
0
                                }
218
0
                              else
219
0
                                {
220
0
                                  applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
221
0
                                  break;
222
0
                                }
223
0
                            }
224
0
                        }
225
0
                        break;
226
227
0
                      case SCC_AFTER_I:
228
                        /* "Before" condition: There is an uppercase I before
229
                           it, and there is no intervening character of
230
                           combining class 0 or 230 (Above).  */
231
                        /* Test the "before" condition.  */
232
0
                        applies = (last_char_normal_or_above == 'I');
233
0
                        break;
234
235
0
                      default:
236
0
                        abort ();
237
4.38k
                      }
238
4.38k
                    if (rule->context < 0)
239
0
                      applies = !applies;
240
241
4.38k
                    if (applies)
242
3.32k
                      {
243
                        /* The rule applies.
244
                           Look up the mapping (0 to 3 characters).  */
245
3.32k
                        const unsigned short *mapped_in_rule =
246
3.32k
                          (const unsigned short *)((const char *)rule + offset_in_rule);
247
248
3.32k
                        if (mapped_in_rule[0] == 0)
249
0
                          mapped_count = 0;
250
3.32k
                        else
251
3.32k
                          {
252
3.32k
                            mapped_uc[0] = mapped_in_rule[0];
253
3.32k
                            if (mapped_in_rule[1] == 0)
254
3.02k
                              mapped_count = 1;
255
306
                            else
256
306
                              {
257
306
                                mapped_uc[1] = mapped_in_rule[1];
258
306
                                if (mapped_in_rule[2] == 0)
259
306
                                  mapped_count = 2;
260
0
                                else
261
0
                                  {
262
0
                                    mapped_uc[2] = mapped_in_rule[2];
263
0
                                    mapped_count = 3;
264
0
                                  }
265
306
                              }
266
3.32k
                          }
267
3.32k
                        goto found_mapping;
268
3.32k
                      }
269
4.38k
                  }
270
271
                /* Optimization: Save a hash table lookup in the next round.  */
272
476k
                if (!rule->has_next)
273
457k
                  break;
274
476k
              }
275
2.60M
          }
276
277
        /* No special-cased mapping.  So use the locale and context independent
278
           mapping.  */
279
2.60M
        mapped_uc[0] = single_character_map (uc);
280
2.60M
        mapped_count = 1;
281
282
2.60M
       found_mapping:
283
        /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
284
2.60M
        {
285
2.60M
          unsigned int i;
286
287
5.21M
          for (i = 0; i < mapped_count; i++)
288
2.60M
            {
289
2.60M
              ucs4_t muc = mapped_uc[i];
290
291
              /* Append muc to the result accumulator.  */
292
2.60M
              if (length < allocated)
293
2.60M
                {
294
2.60M
                  int ret = U_UCTOMB (result + length, muc, allocated - length);
295
2.60M
                  if (ret == -1)
296
0
                    {
297
0
                      errno = EINVAL;
298
0
                      goto fail;
299
0
                    }
300
2.60M
                  if (ret >= 0)
301
2.60M
                    {
302
2.60M
                      length += ret;
303
2.60M
                      goto done_appending;
304
2.60M
                    }
305
2.60M
                }
306
6.09k
              {
307
6.09k
                size_t old_allocated = allocated;
308
6.09k
                size_t new_allocated = 2 * old_allocated;
309
6.09k
                if (new_allocated < 64)
310
2.98k
                  new_allocated = 64;
311
6.09k
                if (new_allocated < old_allocated) /* integer overflow? */
312
0
                  abort ();
313
6.09k
                {
314
6.09k
                  UNIT *larger_result;
315
6.09k
                  if (result == NULL)
316
2.98k
                    {
317
2.98k
                      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
318
2.98k
                      if (larger_result == NULL)
319
0
                        {
320
0
                          errno = ENOMEM;
321
0
                          goto fail;
322
0
                        }
323
2.98k
                    }
324
3.10k
                  else if (result == resultbuf)
325
0
                    {
326
0
                      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
327
0
                      if (larger_result == NULL)
328
0
                        {
329
0
                          errno = ENOMEM;
330
0
                          goto fail;
331
0
                        }
332
0
                      U_CPY (larger_result, resultbuf, length);
333
0
                    }
334
3.10k
                  else
335
3.10k
                    {
336
3.10k
                      larger_result =
337
3.10k
                        (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
338
3.10k
                      if (larger_result == NULL)
339
0
                        {
340
0
                          errno = ENOMEM;
341
0
                          goto fail;
342
0
                        }
343
3.10k
                    }
344
6.09k
                  result = larger_result;
345
6.09k
                  allocated = new_allocated;
346
6.09k
                  {
347
6.09k
                    int ret = U_UCTOMB (result + length, muc, allocated - length);
348
6.09k
                    if (ret == -1)
349
0
                      {
350
0
                        errno = EINVAL;
351
0
                        goto fail;
352
0
                      }
353
6.09k
                    if (ret < 0)
354
0
                      abort ();
355
6.09k
                    length += ret;
356
6.09k
                    goto done_appending;
357
6.09k
                  }
358
6.09k
                }
359
6.09k
              }
360
2.60M
             done_appending: ;
361
2.60M
            }
362
2.60M
        }
363
364
2.60M
        if (!uc_is_case_ignorable (uc))
365
2.17M
          last_char_except_ignorable = uc;
366
367
2.60M
        {
368
2.60M
          int ccc = uc_combining_class (uc);
369
2.60M
          if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
370
2.58M
            last_char_normal_or_above = uc;
371
2.60M
        }
372
373
2.60M
        s += count;
374
2.60M
      }
375
2.98k
  }
376
377
2.98k
  if (nf != NULL)
378
2.98k
    {
379
      /* Finally, normalize the result.  */
380
2.98k
      UNIT *normalized_result;
381
382
2.98k
      normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
383
2.98k
      if (normalized_result == NULL)
384
0
        goto fail;
385
386
2.98k
      free (result);
387
2.98k
      return normalized_result;
388
2.98k
    }
389
390
0
  if (length == 0)
391
0
    {
392
0
      if (result == NULL)
393
0
        {
394
          /* Return a non-NULL value.  NULL means error.  */
395
0
          result = (UNIT *) malloc (1);
396
0
          if (result == NULL)
397
0
            {
398
0
              errno = ENOMEM;
399
0
              goto fail;
400
0
            }
401
0
        }
402
0
    }
403
0
  else if (result != resultbuf && length < allocated)
404
0
    {
405
      /* Shrink the allocated memory if possible.  */
406
0
      UNIT *memory;
407
408
0
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
409
0
      if (memory != NULL)
410
0
        result = memory;
411
0
    }
412
413
0
  *lengthp = length;
414
0
  return result;
415
416
0
 fail:
417
0
  if (result != resultbuf)
418
0
    {
419
0
      int saved_errno = errno;
420
0
      free (result);
421
0
      errno = saved_errno;
422
0
    }
423
0
  return NULL;
424
0
}