Coverage Report

Created: 2026-02-05 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/unicase/u-casemap.h
Line
Count
Source
1
/* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
2
   Copyright (C) 2009-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software.
6
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7
   You can redistribute it and/or modify it under either
8
     - the terms of the GNU Lesser General Public License as published
9
       by the Free Software Foundation, either version 3, or (at your
10
       option) any later version, or
11
     - the terms of the GNU General Public License as published by the
12
       Free Software Foundation; either version 2, or (at your option)
13
       any later version, or
14
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16
   This file is distributed in the hope that it will be useful,
17
   but WITHOUT ANY WARRANTY; without even the implied warranty of
18
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19
   Lesser General Public License and the GNU General Public License
20
   for more details.
21
22
   You should have received a copy of the GNU Lesser General Public
23
   License and of the GNU General Public License along with this
24
   program.  If not, see <https://www.gnu.org/licenses/>.  */
25
26
UNIT *
27
FUNC (const UNIT *s, size_t n,
28
      casing_prefix_context_t prefix_context,
29
      casing_suffix_context_t suffix_context,
30
      const char *iso639_language,
31
      ucs4_t (*single_character_map) (ucs4_t),
32
      size_t offset_in_rule, /* offset in 'struct special_casing_rule' */
33
      uninorm_t nf,
34
      UNIT *resultbuf, size_t *lengthp)
35
2.45M
{
36
  /* The result being accumulated.  */
37
2.45M
  UNIT *result;
38
2.45M
  size_t allocated;
39
2.45M
  if (nf != NULL || resultbuf == NULL)
40
2.45M
    {
41
2.45M
      result = NULL;
42
2.45M
      allocated = 0;
43
2.45M
    }
44
0
  else
45
0
    {
46
0
      result = resultbuf;
47
0
      allocated = *lengthp;
48
0
    }
49
2.45M
  size_t length = 0;
50
51
2.45M
  {
52
2.45M
    const UNIT *s_end = s + n;
53
54
    /* Helper for evaluating the FINAL_SIGMA condition:
55
       Last character that was not case-ignorable.  */
56
2.45M
    ucs4_t last_char_except_ignorable =
57
2.45M
      prefix_context.last_char_except_ignorable;
58
59
    /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
60
       Last character that was of combining class 230 ("Above") or 0.  */
61
2.45M
    ucs4_t last_char_normal_or_above =
62
2.45M
      prefix_context.last_char_normal_or_above;
63
64
13.4M
    while (s < s_end)
65
11.0M
      {
66
11.0M
        ucs4_t uc;
67
11.0M
        int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
68
69
11.0M
        ucs4_t mapped_uc[3];
70
11.0M
        unsigned int mapped_count;
71
72
11.0M
        if (uc < 0x10000)
73
10.9M
          {
74
            /* Look first in the special-casing table.  */
75
10.9M
            char code[3];
76
77
10.9M
            code[0] = (uc >> 8) & 0xff;
78
10.9M
            code[1] = uc & 0xff;
79
80
10.9M
            for (code[2] = 0; ; code[2]++)
81
11.0M
              {
82
11.0M
                const struct special_casing_rule *rule =
83
11.0M
                  gl_unicase_special_lookup (code, 3);
84
85
11.0M
                if (rule == NULL)
86
10.0M
                  break;
87
88
                /* Test if the condition applies.  */
89
                /* Does the language apply?  */
90
1.01M
                if (rule->language[0] == '\0'
91
975k
                    || (iso639_language != NULL
92
0
                        && iso639_language[0] == rule->language[0]
93
0
                        && iso639_language[1] == rule->language[1]))
94
35.6k
                  {
95
                    /* Does the context apply?  */
96
35.6k
                    int context = rule->context;
97
35.6k
                    if (context < 0)
98
0
                      context = - context;
99
100
35.6k
                    bool applies;
101
35.6k
                    switch (context)
102
35.6k
                      {
103
7.29k
                      case SCC_ALWAYS:
104
7.29k
                        applies = true;
105
7.29k
                        break;
106
107
28.3k
                      case SCC_FINAL_SIGMA:
108
                        /* "Before" condition: preceded by a sequence
109
                           consisting of a cased letter and a case-ignorable
110
                           sequence.
111
                           "After" condition: not followed by a sequence
112
                           consisting of a case-ignorable sequence and then a
113
                           cased letter.  */
114
                        /* Test the "before" condition.  */
115
28.3k
                        applies = uc_is_cased (last_char_except_ignorable);
116
                        /* Test the "after" condition.  */
117
28.3k
                        if (applies)
118
5.74k
                          {
119
5.74k
                            const UNIT *s2 = s + count;
120
5.74k
                            for (;;)
121
9.00k
                              {
122
9.00k
                                if (s2 < s_end)
123
9.00k
                                  {
124
9.00k
                                    ucs4_t uc2;
125
9.00k
                                    int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
126
                                    /* Our uc_is_case_ignorable function is
127
                                       known to return false for all cased
128
                                       characters.  So we can call
129
                                       uc_is_case_ignorable first.  */
130
9.00k
                                    if (!uc_is_case_ignorable (uc2))
131
5.74k
                                      {
132
5.74k
                                        applies = ! uc_is_cased (uc2);
133
5.74k
                                        break;
134
5.74k
                                      }
135
3.26k
                                    s2 += count2;
136
3.26k
                                  }
137
0
                                else
138
0
                                  {
139
0
                                    applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
140
0
                                    break;
141
0
                                  }
142
9.00k
                              }
143
5.74k
                          }
144
28.3k
                        break;
145
146
3.26k
                      case SCC_AFTER_SOFT_DOTTED:
147
                        /* "Before" condition: There is a Soft_Dotted character
148
                           before it, with no intervening character of
149
                           combining class 0 or 230 (Above).  */
150
                        /* Test the "before" condition.  */
151
0
                        applies = uc_is_property_soft_dotted (last_char_normal_or_above);
152
0
                        break;
153
154
0
                      case SCC_MORE_ABOVE:
155
                        /* "After" condition: followed by a character of
156
                           combining class 230 (Above) with no intervening
157
                           character of combining class 0 or 230 (Above).  */
158
                        /* Test the "after" condition.  */
159
0
                        {
160
0
                          const UNIT *s2 = s + count;
161
0
                          applies = false;
162
0
                          for (;;)
163
0
                            {
164
0
                              if (s2 < s_end)
165
0
                                {
166
0
                                  ucs4_t uc2;
167
0
                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
168
0
                                  int ccc = uc_combining_class (uc2);
169
0
                                  if (ccc == UC_CCC_A)
170
0
                                    {
171
0
                                      applies = true;
172
0
                                      break;
173
0
                                    }
174
0
                                  if (ccc == UC_CCC_NR)
175
0
                                    break;
176
0
                                  s2 += count2;
177
0
                                }
178
0
                              else
179
0
                                {
180
0
                                  applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
181
0
                                  break;
182
0
                                }
183
0
                            }
184
0
                        }
185
0
                        break;
186
187
0
                      case SCC_BEFORE_DOT:
188
                        /* "After" condition: followed by COMBINING DOT ABOVE
189
                           (U+0307). Any sequence of characters with a
190
                           combining class that is neither 0 nor 230 may
191
                           intervene between the current character and the
192
                           combining dot above.  */
193
                        /* Test the "after" condition.  */
194
0
                        {
195
0
                          const UNIT *s2 = s + count;
196
0
                          applies = false;
197
0
                          for (;;)
198
0
                            {
199
0
                              if (s2 < s_end)
200
0
                                {
201
0
                                  ucs4_t uc2;
202
0
                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
203
0
                                  if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
204
0
                                    {
205
0
                                      applies = true;
206
0
                                      break;
207
0
                                    }
208
0
                                  {
209
0
                                    int ccc = uc_combining_class (uc2);
210
0
                                    if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
211
0
                                      break;
212
0
                                  }
213
0
                                  s2 += count2;
214
0
                                }
215
0
                              else
216
0
                                {
217
0
                                  applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
218
0
                                  break;
219
0
                                }
220
0
                            }
221
0
                        }
222
0
                        break;
223
224
0
                      case SCC_AFTER_I:
225
                        /* "Before" condition: There is an uppercase I before
226
                           it, and there is no intervening character of
227
                           combining class 0 or 230 (Above).  */
228
                        /* Test the "before" condition.  */
229
0
                        applies = (last_char_normal_or_above == 'I');
230
0
                        break;
231
232
0
                      default:
233
0
                        abort ();
234
35.6k
                      }
235
35.6k
                    if (rule->context < 0)
236
0
                      applies = !applies;
237
238
35.6k
                    if (applies)
239
11.0k
                      {
240
                        /* The rule applies.
241
                           Look up the mapping (0 to 3 characters).  */
242
11.0k
                        const unsigned short *mapped_in_rule =
243
11.0k
                          (const unsigned short *)((const char *)rule + offset_in_rule);
244
245
11.0k
                        if (mapped_in_rule[0] == 0)
246
0
                          mapped_count = 0;
247
11.0k
                        else
248
11.0k
                          {
249
11.0k
                            mapped_uc[0] = mapped_in_rule[0];
250
11.0k
                            if (mapped_in_rule[1] == 0)
251
9.59k
                              mapped_count = 1;
252
1.41k
                            else
253
1.41k
                              {
254
1.41k
                                mapped_uc[1] = mapped_in_rule[1];
255
1.41k
                                if (mapped_in_rule[2] == 0)
256
1.41k
                                  mapped_count = 2;
257
0
                                else
258
0
                                  {
259
0
                                    mapped_uc[2] = mapped_in_rule[2];
260
0
                                    mapped_count = 3;
261
0
                                  }
262
1.41k
                              }
263
11.0k
                          }
264
11.0k
                        goto found_mapping;
265
11.0k
                      }
266
35.6k
                  }
267
268
                /* Optimization: Save a hash table lookup in the next round.  */
269
999k
                if (!rule->has_next)
270
973k
                  break;
271
999k
              }
272
10.9M
          }
273
274
        /* No special-cased mapping.  So use the locale and context independent
275
           mapping.  */
276
10.9M
        mapped_uc[0] = single_character_map (uc);
277
10.9M
        mapped_count = 1;
278
279
11.0M
       found_mapping:
280
        /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
281
22.0M
        for (unsigned int i = 0; i < mapped_count; i++)
282
11.0M
          {
283
11.0M
            ucs4_t muc = mapped_uc[i];
284
285
            /* Append muc to the result accumulator.  */
286
11.0M
            if (length < allocated)
287
8.55M
              {
288
8.55M
                int ret = U_UCTOMB (result + length, muc, allocated - length);
289
8.55M
                if (ret == -1)
290
0
                  {
291
0
                    errno = EINVAL;
292
0
                    goto fail;
293
0
                  }
294
8.55M
                if (ret >= 0)
295
8.54M
                  {
296
8.54M
                    length += ret;
297
8.54M
                    goto done_appending;
298
8.54M
                  }
299
8.55M
              }
300
2.45M
            {
301
2.45M
              size_t old_allocated = allocated;
302
2.45M
              size_t new_allocated = 2 * old_allocated;
303
2.45M
              if (new_allocated < 64)
304
2.45M
                new_allocated = 64;
305
2.45M
              if (new_allocated < old_allocated) /* integer overflow? */
306
0
                abort ();
307
2.45M
              {
308
2.45M
                UNIT *larger_result;
309
2.45M
                if (result == NULL)
310
2.45M
                  {
311
2.45M
                    larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
312
2.45M
                    if (larger_result == NULL)
313
0
                      {
314
0
                        errno = ENOMEM;
315
0
                        goto fail;
316
0
                      }
317
2.45M
                  }
318
7.72k
                else if (result == resultbuf)
319
0
                  {
320
0
                    larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
321
0
                    if (larger_result == NULL)
322
0
                      {
323
0
                        errno = ENOMEM;
324
0
                        goto fail;
325
0
                      }
326
0
                    U_CPY (larger_result, resultbuf, length);
327
0
                  }
328
7.72k
                else
329
7.72k
                  {
330
7.72k
                    larger_result =
331
7.72k
                      (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
332
7.72k
                    if (larger_result == NULL)
333
0
                      {
334
0
                        errno = ENOMEM;
335
0
                        goto fail;
336
0
                      }
337
7.72k
                  }
338
2.45M
                result = larger_result;
339
2.45M
                allocated = new_allocated;
340
2.45M
                {
341
2.45M
                  int ret = U_UCTOMB (result + length, muc, allocated - length);
342
2.45M
                  if (ret == -1)
343
0
                    {
344
0
                      errno = EINVAL;
345
0
                      goto fail;
346
0
                    }
347
2.45M
                  if (ret < 0)
348
0
                    abort ();
349
2.45M
                  length += ret;
350
2.45M
                  goto done_appending;
351
2.45M
                }
352
2.45M
              }
353
2.45M
            }
354
11.0M
           done_appending: ;
355
11.0M
          }
356
357
11.0M
        if (!uc_is_case_ignorable (uc))
358
8.80M
          last_char_except_ignorable = uc;
359
360
11.0M
        {
361
11.0M
          int ccc = uc_combining_class (uc);
362
11.0M
          if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
363
10.9M
            last_char_normal_or_above = uc;
364
11.0M
        }
365
366
11.0M
        s += count;
367
11.0M
      }
368
2.45M
  }
369
370
2.45M
  if (nf != NULL)
371
2.45M
    {
372
      /* Finally, normalize the result.  */
373
2.45M
      UNIT *normalized_result =
374
2.45M
        U_NORMALIZE (nf, result, length, resultbuf, lengthp);
375
2.45M
      if (normalized_result == NULL)
376
0
        goto fail;
377
378
2.45M
      free (result);
379
2.45M
      return normalized_result;
380
2.45M
    }
381
382
0
  if (length == 0)
383
0
    {
384
0
      if (result == NULL)
385
0
        {
386
          /* Return a non-NULL value.  NULL means error.  */
387
0
          result = (UNIT *) malloc (1);
388
0
          if (result == NULL)
389
0
            {
390
0
              errno = ENOMEM;
391
0
              goto fail;
392
0
            }
393
0
        }
394
0
    }
395
0
  else if (result != resultbuf && length < allocated)
396
0
    {
397
      /* Shrink the allocated memory if possible.  */
398
0
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
399
0
      if (memory != NULL)
400
0
        result = memory;
401
0
    }
402
403
0
  *lengthp = length;
404
0
  return result;
405
406
0
 fail:
407
0
  if (result != resultbuf)
408
0
    {
409
0
      int saved_errno = errno;
410
0
      free (result);
411
0
      errno = saved_errno;
412
0
    }
413
  return NULL;
414
0
}