Coverage Report

Created: 2025-12-05 06:16

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/unicase/u-casemap.h
Line
Count
Source
1
/* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software.
6
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7
   You can redistribute it and/or modify it under either
8
     - the terms of the GNU Lesser General Public License as published
9
       by the Free Software Foundation, either version 3, or (at your
10
       option) any later version, or
11
     - the terms of the GNU General Public License as published by the
12
       Free Software Foundation; either version 2, or (at your option)
13
       any later version, or
14
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16
   This file is distributed in the hope that it will be useful,
17
   but WITHOUT ANY WARRANTY; without even the implied warranty of
18
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19
   Lesser General Public License and the GNU General Public License
20
   for more details.
21
22
   You should have received a copy of the GNU Lesser General Public
23
   License and of the GNU General Public License along with this
24
   program.  If not, see <https://www.gnu.org/licenses/>.  */
25
26
/* Case-map the N units starting at S and return the mapped string.

   Each character is decoded with U_MBTOUC_UNSAFE.  For characters below
   0x10000 the special-casing table is searched first; a rule is taken when
   its language is empty or its first two bytes equal those of
   ISO639_LANGUAGE, and its context condition (possibly negated) holds.
   The mapping is then read from the rule at byte offset OFFSET_IN_RULE.
   Otherwise the character is mapped through SINGLE_CHARACTER_MAP.
   PREFIX_CONTEXT and SUFFIX_CONTEXT provide the state used when a
   condition needs to look before the start or past the end of S.

   If NF is NULL, the result is accumulated in RESULTBUF (capacity *LENGTHP)
   when that is non-NULL, switching to malloc'ed memory when more room is
   needed, and *LENGTHP is set to the result length.  If NF is non-NULL, the
   mapped string is built in a temporary buffer and the return value is
   that of U_NORMALIZE (NF, ..., RESULTBUF, LENGTHP).

   Returns NULL on failure; errno is set to EINVAL or ENOMEM for the
   failures detected in this function.  */
UNIT *
27
FUNC (const UNIT *s, size_t n,
28
      casing_prefix_context_t prefix_context,
29
      casing_suffix_context_t suffix_context,
30
      const char *iso639_language,
31
      ucs4_t (*single_character_map) (ucs4_t),
32
      size_t offset_in_rule, /* offset in 'struct special_casing_rule' */
33
      uninorm_t nf,
34
      UNIT *resultbuf, size_t *lengthp)
35
4.12M
{
36
  /* The result being accumulated.  */
37
4.12M
  UNIT *result;
38
4.12M
  size_t length;
39
4.12M
  size_t allocated;
40
41
  /* Initialize the accumulator.  */
  /* When normalizing, RESULTBUF is passed on to U_NORMALIZE below, so the
     intermediate mapped string starts with no buffer and is heap-allocated
     on demand.  */
42
4.12M
  if (nf != NULL || resultbuf == NULL)
43
4.12M
    {
44
4.12M
      result = NULL;
45
4.12M
      allocated = 0;
46
4.12M
    }
47
0
  else
48
0
    {
49
0
      result = resultbuf;
50
0
      allocated = *lengthp;
51
0
    }
52
4.12M
  length = 0;
53
54
4.12M
  {
55
4.12M
    const UNIT *s_end = s + n;
56
57
    /* Helper for evaluating the FINAL_SIGMA condition:
58
       Last character that was not case-ignorable.  */
59
4.12M
    ucs4_t last_char_except_ignorable =
60
4.12M
      prefix_context.last_char_except_ignorable;
61
62
    /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
63
       Last character that was of combining class 230 ("Above") or 0.  */
64
4.12M
    ucs4_t last_char_normal_or_above =
65
4.12M
      prefix_context.last_char_normal_or_above;
66
67
18.8M
    while (s < s_end)
68
14.6M
      {
69
14.6M
        ucs4_t uc;
70
14.6M
        int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
71
72
14.6M
        ucs4_t mapped_uc[3];
73
14.6M
        unsigned int mapped_count;
74
75
14.6M
        if (uc < 0x10000)
76
14.6M
          {
77
            /* Look first in the special-casing table.  */
78
14.6M
            char code[3];
79
            /* The lookup key is the high byte and the low byte of uc,
               followed by a counter that is incremented for each further
               lookup for the same character.  */
80
14.6M
            code[0] = (uc >> 8) & 0xff;
81
14.6M
            code[1] = uc & 0xff;
82
83
14.6M
            for (code[2] = 0; ; code[2]++)
84
14.7M
              {
85
14.7M
                const struct special_casing_rule *rule =
86
14.7M
                  gl_unicase_special_lookup (code, 3);
87
88
14.7M
                if (rule == NULL)
89
13.4M
                  break;
90
91
                /* Test if the condition applies.  */
92
                /* Does the language apply?  */
93
1.34M
                if (rule->language[0] == '\0'
94
1.29M
                    || (iso639_language != NULL
95
0
                        && iso639_language[0] == rule->language[0]
96
0
                        && iso639_language[1] == rule->language[1]))
97
44.8k
                  {
98
                    /* Does the context apply?  */
99
44.8k
                    int context = rule->context;
100
44.8k
                    bool applies;
101
                    /* A negative context stands for the negated condition:
                       evaluate the positive condition here and invert the
                       outcome after the switch.  */
102
44.8k
                    if (context < 0)
103
0
                      context = - context;
104
44.8k
                    switch (context)
105
44.8k
                      {
106
9.36k
                      case SCC_ALWAYS:
107
9.36k
                        applies = true;
108
9.36k
                        break;
109
110
35.5k
                      case SCC_FINAL_SIGMA:
111
                        /* "Before" condition: preceded by a sequence
112
                           consisting of a cased letter and a case-ignorable
113
                           sequence.
114
                           "After" condition: not followed by a sequence
115
                           consisting of a case-ignorable sequence and then a
116
                           cased letter.  */
117
                        /* Test the "before" condition.  */
118
35.5k
                        applies = uc_is_cased (last_char_except_ignorable);
119
                        /* Test the "after" condition.  */
120
35.5k
                        if (applies)
121
6.26k
                          {
122
6.26k
                            const UNIT *s2 = s + count;
123
6.26k
                            for (;;)
124
9.30k
                              {
125
9.30k
                                if (s2 < s_end)
126
9.30k
                                  {
127
9.30k
                                    ucs4_t uc2;
128
9.30k
                                    int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
129
                                    /* Our uc_is_case_ignorable function is
130
                                       known to return false for all cased
131
                                       characters.  So we can call
132
                                       uc_is_case_ignorable first.  */
133
9.30k
                                    if (!uc_is_case_ignorable (uc2))
134
6.26k
                                      {
135
6.26k
                                        applies = ! uc_is_cased (uc2);
136
6.26k
                                        break;
137
6.26k
                                      }
138
3.04k
                                    s2 += count2;
139
3.04k
                                  }
140
0
                                else
141
0
                                  {
                                    /* End of the string reached: decide
                                       from the caller-supplied suffix
                                       context.  */
142
0
                                    applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
143
0
                                    break;
144
0
                                  }
145
9.30k
                              }
146
6.26k
                          }
147
35.5k
                        break;
148
149
3.04k
                      case SCC_AFTER_SOFT_DOTTED:
150
                        /* "Before" condition: There is a Soft_Dotted character
151
                           before it, with no intervening character of
152
                           combining class 0 or 230 (Above).  */
153
                        /* Test the "before" condition.  */
154
0
                        applies = uc_is_property_soft_dotted (last_char_normal_or_above);
155
0
                        break;
156
157
0
                      case SCC_MORE_ABOVE:
158
                        /* "After" condition: followed by a character of
159
                           combining class 230 (Above) with no intervening
160
                           character of combining class 0 or 230 (Above).  */
161
                        /* Test the "after" condition.  */
162
0
                        {
163
0
                          const UNIT *s2 = s + count;
164
0
                          applies = false;
165
0
                          for (;;)
166
0
                            {
167
0
                              if (s2 < s_end)
168
0
                                {
169
0
                                  ucs4_t uc2;
170
0
                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
171
0
                                  int ccc = uc_combining_class (uc2);
172
0
                                  if (ccc == UC_CCC_A)
173
0
                                    {
174
0
                                      applies = true;
175
0
                                      break;
176
0
                                    }
177
0
                                  if (ccc == UC_CCC_NR)
178
0
                                    break;
179
0
                                  s2 += count2;
180
0
                                }
181
0
                              else
182
0
                                {
                                  /* End of the string reached: decide from
                                     the caller-supplied suffix context.  */
183
0
                                  applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
184
0
                                  break;
185
0
                                }
186
0
                            }
187
0
                        }
188
0
                        break;
189
190
0
                      case SCC_BEFORE_DOT:
191
                        /* "After" condition: followed by COMBINING DOT ABOVE
192
                           (U+0307). Any sequence of characters with a
193
                           combining class that is neither 0 nor 230 may
194
                           intervene between the current character and the
195
                           combining dot above.  */
196
                        /* Test the "after" condition.  */
197
0
                        {
198
0
                          const UNIT *s2 = s + count;
199
0
                          applies = false;
200
0
                          for (;;)
201
0
                            {
202
0
                              if (s2 < s_end)
203
0
                                {
204
0
                                  ucs4_t uc2;
205
0
                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
206
0
                                  if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
207
0
                                    {
208
0
                                      applies = true;
209
0
                                      break;
210
0
                                    }
211
0
                                  {
212
0
                                    int ccc = uc_combining_class (uc2);
213
0
                                    if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
214
0
                                      break;
215
0
                                  }
216
0
                                  s2 += count2;
217
0
                                }
218
0
                              else
219
0
                                {
                                  /* End of the string reached: decide from
                                     the caller-supplied suffix context.  */
220
0
                                  applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
221
0
                                  break;
222
0
                                }
223
0
                            }
224
0
                        }
225
0
                        break;
226
227
0
                      case SCC_AFTER_I:
228
                        /* "Before" condition: There is an uppercase I before
229
                           it, and there is no intervening character of
230
                           combining class 0 or 230 (Above).  */
231
                        /* Test the "before" condition.  */
232
0
                        applies = (last_char_normal_or_above == 'I');
233
0
                        break;
234
235
0
                      default:
236
0
                        abort ();
237
44.8k
                      }
238
44.8k
                    if (rule->context < 0)
239
0
                      applies = !applies;
240
241
44.8k
                    if (applies)
242
13.3k
                      {
243
                        /* The rule applies.
244
                           Look up the mapping (0 to 3 characters).  */
245
13.3k
                        const unsigned short *mapped_in_rule =
246
13.3k
                          (const unsigned short *)((const char *)rule + offset_in_rule);
247
                        /* A zero entry ends the mapping; a zero in the first
                           position means the character maps to nothing.  */
248
13.3k
                        if (mapped_in_rule[0] == 0)
249
0
                          mapped_count = 0;
250
13.3k
                        else
251
13.3k
                          {
252
13.3k
                            mapped_uc[0] = mapped_in_rule[0];
253
13.3k
                            if (mapped_in_rule[1] == 0)
254
11.7k
                              mapped_count = 1;
255
1.59k
                            else
256
1.59k
                              {
257
1.59k
                                mapped_uc[1] = mapped_in_rule[1];
258
1.59k
                                if (mapped_in_rule[2] == 0)
259
1.59k
                                  mapped_count = 2;
260
0
                                else
261
0
                                  {
262
0
                                    mapped_uc[2] = mapped_in_rule[2];
263
0
                                    mapped_count = 3;
264
0
                                  }
265
1.59k
                              }
266
13.3k
                          }
267
13.3k
                        goto found_mapping;
268
13.3k
                      }
269
44.8k
                  }
270
271
                /* Optimization: Save a hash table lookup in the next round.  */
272
1.33M
                if (!rule->has_next)
273
1.21M
                  break;
274
1.33M
              }
275
14.6M
          }
276
277
        /* No special-cased mapping.  So use the locale and context independent
278
           mapping.  */
279
14.6M
        mapped_uc[0] = single_character_map (uc);
280
14.6M
        mapped_count = 1;
281
282
14.6M
       found_mapping:
283
        /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
284
29.3M
        for (unsigned int i = 0; i < mapped_count; i++)
285
14.6M
          {
286
14.6M
            ucs4_t muc = mapped_uc[i];
287
288
            /* Append muc to the result accumulator.  */
289
14.6M
            if (length < allocated)
290
10.5M
              {
291
10.5M
                int ret = U_UCTOMB (result + length, muc, allocated - length);
292
10.5M
                if (ret == -1)
293
0
                  {
294
0
                    errno = EINVAL;
295
0
                    goto fail;
296
0
                  }
297
10.5M
                if (ret >= 0)
298
10.5M
                  {
299
10.5M
                    length += ret;
300
10.5M
                    goto done_appending;
301
10.5M
                  }
                /* Any other negative return falls through to the growing
                   code below (presumably U_UCTOMB reports insufficient room
                   this way -- confirm against its documentation).  */
302
10.5M
              }
            /* Reached when the accumulator is full or the attempt above did
               not store muc: enlarge the accumulator (at least 64 units,
               otherwise doubled) and retry.  */
303
4.12M
            {
304
4.12M
              size_t old_allocated = allocated;
305
4.12M
              size_t new_allocated = 2 * old_allocated;
306
4.12M
              if (new_allocated < 64)
307
4.12M
                new_allocated = 64;
308
4.12M
              if (new_allocated < old_allocated) /* integer overflow? */
309
0
                abort ();
310
4.12M
              {
311
4.12M
                UNIT *larger_result;
312
4.12M
                if (result == NULL)
313
4.12M
                  {
314
4.12M
                    larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
315
4.12M
                    if (larger_result == NULL)
316
0
                      {
317
0
                        errno = ENOMEM;
318
0
                        goto fail;
319
0
                      }
320
4.12M
                  }
321
8.24k
                else if (result == resultbuf)
322
0
                  {
                    /* The accumulator is still the caller's buffer, which
                       is not passed to realloc: allocate fresh memory and
                       copy what has been accumulated so far.  */
323
0
                    larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
324
0
                    if (larger_result == NULL)
325
0
                      {
326
0
                        errno = ENOMEM;
327
0
                        goto fail;
328
0
                      }
329
0
                    U_CPY (larger_result, resultbuf, length);
330
0
                  }
331
8.24k
                else
332
8.24k
                  {
333
8.24k
                    larger_result =
334
8.24k
                      (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
335
8.24k
                    if (larger_result == NULL)
336
0
                      {
337
0
                        errno = ENOMEM;
338
0
                        goto fail;
339
0
                      }
340
8.24k
                  }
341
4.12M
                result = larger_result;
342
4.12M
                allocated = new_allocated;
343
4.12M
                {
344
4.12M
                  int ret = U_UCTOMB (result + length, muc, allocated - length);
345
4.12M
                  if (ret == -1)
346
0
                    {
347
0
                      errno = EINVAL;
348
0
                      goto fail;
349
0
                    }
                  /* After growing, any other negative return is treated as
                     a program error.  */
350
4.12M
                  if (ret < 0)
351
0
                    abort ();
352
4.12M
                  length += ret;
353
4.12M
                  goto done_appending;
354
4.12M
                }
355
4.12M
              }
356
4.12M
            }
357
14.6M
           done_appending: ;
358
14.6M
          }
359
        /* Update the context helpers with the original (unmapped)
           character uc before advancing.  */
360
14.6M
        if (!uc_is_case_ignorable (uc))
361
11.8M
          last_char_except_ignorable = uc;
362
363
14.6M
        {
364
14.6M
          int ccc = uc_combining_class (uc);
365
14.6M
          if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
366
14.6M
            last_char_normal_or_above = uc;
367
14.6M
        }
368
369
14.6M
        s += count;
370
14.6M
      }
371
4.12M
  }
372
373
4.12M
  if (nf != NULL)
374
4.12M
    {
375
      /* Finally, normalize the result.  */
376
4.12M
      UNIT *normalized_result;
377
378
4.12M
      normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
379
4.12M
      if (normalized_result == NULL)
380
0
        goto fail;
381
      /* The intermediate mapped string is no longer needed.  */
382
4.12M
      free (result);
383
4.12M
      return normalized_result;
384
4.12M
    }
385
386
0
  if (length == 0)
387
0
    {
388
0
      if (result == NULL)
389
0
        {
390
          /* Return a non-NULL value.  NULL means error.  */
391
0
          result = (UNIT *) malloc (1);
392
0
          if (result == NULL)
393
0
            {
394
0
              errno = ENOMEM;
395
0
              goto fail;
396
0
            }
397
0
        }
398
0
    }
399
0
  else if (result != resultbuf && length < allocated)
400
0
    {
401
      /* Shrink the allocated memory if possible.  */
402
0
      UNIT *memory;
403
404
0
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
      /* If realloc fails, the larger block is kept and returned.  */
405
0
      if (memory != NULL)
406
0
        result = memory;
407
0
    }
408
409
0
  *lengthp = length;
410
0
  return result;
411
412
0
 fail:
  /* Free the accumulator unless it is the caller's buffer, keeping the
     errno value that describes the failure intact across free().  */
413
0
  if (result != resultbuf)
414
0
    {
415
0
      int saved_errno = errno;
416
0
      free (result);
417
0
      errno = saved_errno;
418
0
    }
419
  return NULL;
420
0
}