Coverage Report

Created: 2023-03-26 07:33

/src/libunistring/lib/unicase/u-casemap.h
Line
Count
Source (jump to first uncovered line)
1
/* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
2
   Copyright (C) 2009-2022 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software.
6
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7
   You can redistribute it and/or modify it under either
8
     - the terms of the GNU Lesser General Public License as published
9
       by the Free Software Foundation; either version 3, or (at your
10
       option) any later version, or
11
     - the terms of the GNU General Public License as published by the
12
       Free Software Foundation; either version 2, or (at your option)
13
       any later version, or
14
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16
   This file is distributed in the hope that it will be useful,
17
   but WITHOUT ANY WARRANTY; without even the implied warranty of
18
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19
   Lesser General Public License and the GNU General Public License
20
   for more details.
21
22
   You should have received a copy of the GNU Lesser General Public
23
   License and of the GNU General Public License along with this
24
   program.  If not, see <https://www.gnu.org/licenses/>.  */
25
26
UNIT *
27
FUNC (const UNIT *s, size_t n,
28
      casing_prefix_context_t prefix_context,
29
      casing_suffix_context_t suffix_context,
30
      const char *iso639_language,
31
      ucs4_t (*single_character_map) (ucs4_t),
32
      size_t offset_in_rule, /* offset in 'struct special_casing_rule' */
33
      uninorm_t nf,
34
      UNIT *resultbuf, size_t *lengthp)
35
0
{
36
  /* The result being accumulated.  */
37
0
  UNIT *result;
38
0
  size_t length;
39
0
  size_t allocated;
40
41
  /* Initialize the accumulator.  */
42
0
  if (nf != NULL || resultbuf == NULL)
43
0
    {
44
0
      result = NULL;
45
0
      allocated = 0;
46
0
    }
47
0
  else
48
0
    {
49
0
      result = resultbuf;
50
0
      allocated = *lengthp;
51
0
    }
52
0
  length = 0;
53
54
0
  {
55
0
    const UNIT *s_end = s + n;
56
57
    /* Helper for evaluating the FINAL_SIGMA condition:
58
       Last character that was not case-ignorable.  */
59
0
    ucs4_t last_char_except_ignorable =
60
0
      prefix_context.last_char_except_ignorable;
61
62
    /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
63
       Last character that was of combining class 230 ("Above") or 0.  */
64
0
    ucs4_t last_char_normal_or_above =
65
0
      prefix_context.last_char_normal_or_above;
66
67
0
    while (s < s_end)
68
0
      {
69
0
        ucs4_t uc;
70
0
        int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
71
72
0
        ucs4_t mapped_uc[3];
73
0
        unsigned int mapped_count;
74
75
0
        if (uc < 0x10000)
76
0
          {
77
            /* Look first in the special-casing table.  */
78
0
            char code[3];
79
80
0
            code[0] = (uc >> 8) & 0xff;
81
0
            code[1] = uc & 0xff;
82
83
0
            for (code[2] = 0; ; code[2]++)
84
0
              {
85
0
                const struct special_casing_rule *rule =
86
0
                  gl_unicase_special_lookup (code, 3);
87
88
0
                if (rule == NULL)
89
0
                  break;
90
91
                /* Test if the condition applies.  */
92
                /* Does the language apply?  */
93
0
                if (rule->language[0] == '\0'
94
0
                    || (iso639_language != NULL
95
0
                        && iso639_language[0] == rule->language[0]
96
0
                        && iso639_language[1] == rule->language[1]))
97
0
                  {
98
                    /* Does the context apply?  */
99
0
                    int context = rule->context;
100
0
                    bool applies;
101
102
0
                    if (context < 0)
103
0
                      context = - context;
104
0
                    switch (context)
105
0
                      {
106
0
                      case SCC_ALWAYS:
107
0
                        applies = true;
108
0
                        break;
109
110
0
                      case SCC_FINAL_SIGMA:
111
                        /* "Before" condition: preceded by a sequence
112
                           consisting of a cased letter and a case-ignorable
113
                           sequence.
114
                           "After" condition: not followed by a sequence
115
                           consisting of a case-ignorable sequence and then a
116
                           cased letter.  */
117
                        /* Test the "before" condition.  */
118
0
                        applies = uc_is_cased (last_char_except_ignorable);
119
                        /* Test the "after" condition.  */
120
0
                        if (applies)
121
0
                          {
122
0
                            const UNIT *s2 = s + count;
123
0
                            for (;;)
124
0
                              {
125
0
                                if (s2 < s_end)
126
0
                                  {
127
0
                                    ucs4_t uc2;
128
0
                                    int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
129
                                    /* Our uc_is_case_ignorable function is
130
                                       known to return false for all cased
131
                                       characters.  So we can call
132
                                       uc_is_case_ignorable first.  */
133
0
                                    if (!uc_is_case_ignorable (uc2))
134
0
                                      {
135
0
                                        applies = ! uc_is_cased (uc2);
136
0
                                        break;
137
0
                                      }
138
0
                                    s2 += count2;
139
0
                                  }
140
0
                                else
141
0
                                  {
142
0
                                    applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
143
0
                                    break;
144
0
                                  }
145
0
                              }
146
0
                          }
147
0
                        break;
148
149
0
                      case SCC_AFTER_SOFT_DOTTED:
150
                        /* "Before" condition: There is a Soft_Dotted character
151
                           before it, with no intervening character of
152
                           combining class 0 or 230 (Above).  */
153
                        /* Test the "before" condition.  */
154
0
                        applies = uc_is_property_soft_dotted (last_char_normal_or_above);
155
0
                        break;
156
157
0
                      case SCC_MORE_ABOVE:
158
                        /* "After" condition: followed by a character of
159
                           combining class 230 (Above) with no intervening
160
                           character of combining class 0 or 230 (Above).  */
161
                        /* Test the "after" condition.  */
162
0
                        {
163
0
                          const UNIT *s2 = s + count;
164
0
                          applies = false;
165
0
                          for (;;)
166
0
                            {
167
0
                              if (s2 < s_end)
168
0
                                {
169
0
                                  ucs4_t uc2;
170
0
                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
171
0
                                  int ccc = uc_combining_class (uc2);
172
0
                                  if (ccc == UC_CCC_A)
173
0
                                    {
174
0
                                      applies = true;
175
0
                                      break;
176
0
                                    }
177
0
                                  if (ccc == UC_CCC_NR)
178
0
                                    break;
179
0
                                  s2 += count2;
180
0
                                }
181
0
                              else
182
0
                                {
183
0
                                  applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
184
0
                                  break;
185
0
                                }
186
0
                            }
187
0
                        }
188
0
                        break;
189
190
0
                      case SCC_BEFORE_DOT:
191
                        /* "After" condition: followed by COMBINING DOT ABOVE
192
                           (U+0307). Any sequence of characters with a
193
                           combining class that is neither 0 nor 230 may
194
                           intervene between the current character and the
195
                           combining dot above.  */
196
                        /* Test the "after" condition.  */
197
0
                        {
198
0
                          const UNIT *s2 = s + count;
199
0
                          applies = false;
200
0
                          for (;;)
201
0
                            {
202
0
                              if (s2 < s_end)
203
0
                                {
204
0
                                  ucs4_t uc2;
205
0
                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
206
0
                                  if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
207
0
                                    {
208
0
                                      applies = true;
209
0
                                      break;
210
0
                                    }
211
0
                                  {
212
0
                                    int ccc = uc_combining_class (uc2);
213
0
                                    if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
214
0
                                      break;
215
0
                                  }
216
0
                                  s2 += count2;
217
0
                                }
218
0
                              else
219
0
                                {
220
0
                                  applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
221
0
                                  break;
222
0
                                }
223
0
                            }
224
0
                        }
225
0
                        break;
226
227
0
                      case SCC_AFTER_I:
228
                        /* "Before" condition: There is an uppercase I before
229
                           it, and there is no intervening character of
230
                           combining class 0 or 230 (Above).  */
231
                        /* Test the "before" condition.  */
232
0
                        applies = (last_char_normal_or_above == 'I');
233
0
                        break;
234
235
0
                      default:
236
0
                        abort ();
237
0
                      }
238
0
                    if (rule->context < 0)
239
0
                      applies = !applies;
240
241
0
                    if (applies)
242
0
                      {
243
                        /* The rule applies.
244
                           Look up the mapping (0 to 3 characters).  */
245
0
                        const unsigned short *mapped_in_rule =
246
0
                          (const unsigned short *)((const char *)rule + offset_in_rule);
247
248
0
                        if (mapped_in_rule[0] == 0)
249
0
                          mapped_count = 0;
250
0
                        else
251
0
                          {
252
0
                            mapped_uc[0] = mapped_in_rule[0];
253
0
                            if (mapped_in_rule[1] == 0)
254
0
                              mapped_count = 1;
255
0
                            else
256
0
                              {
257
0
                                mapped_uc[1] = mapped_in_rule[1];
258
0
                                if (mapped_in_rule[2] == 0)
259
0
                                  mapped_count = 2;
260
0
                                else
261
0
                                  {
262
0
                                    mapped_uc[2] = mapped_in_rule[2];
263
0
                                    mapped_count = 3;
264
0
                                  }
265
0
                              }
266
0
                          }
267
0
                        goto found_mapping;
268
0
                      }
269
0
                  }
270
271
                /* Optimization: Save a hash table lookup in the next round.  */
272
0
                if (!rule->has_next)
273
0
                  break;
274
0
              }
275
0
          }
276
277
        /* No special-cased mapping.  So use the locale and context independent
278
           mapping.  */
279
0
        mapped_uc[0] = single_character_map (uc);
280
0
        mapped_count = 1;
281
282
0
       found_mapping:
283
        /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
284
0
        {
285
0
          unsigned int i;
286
287
0
          for (i = 0; i < mapped_count; i++)
288
0
            {
289
0
              ucs4_t muc = mapped_uc[i];
290
291
              /* Append muc to the result accumulator.  */
292
0
              if (length < allocated)
293
0
                {
294
0
                  int ret = U_UCTOMB (result + length, muc, allocated - length);
295
0
                  if (ret == -1)
296
0
                    {
297
0
                      errno = EINVAL;
298
0
                      goto fail;
299
0
                    }
300
0
                  if (ret >= 0)
301
0
                    {
302
0
                      length += ret;
303
0
                      goto done_appending;
304
0
                    }
305
0
                }
306
0
              {
307
0
                size_t old_allocated = allocated;
308
0
                size_t new_allocated = 2 * old_allocated;
309
0
                if (new_allocated < 64)
310
0
                  new_allocated = 64;
311
0
                if (new_allocated < old_allocated) /* integer overflow? */
312
0
                  abort ();
313
0
                {
314
0
                  UNIT *larger_result;
315
0
                  if (result == NULL)
316
0
                    {
317
0
                      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
318
0
                      if (larger_result == NULL)
319
0
                        {
320
0
                          errno = ENOMEM;
321
0
                          goto fail;
322
0
                        }
323
0
                    }
324
0
                  else if (result == resultbuf)
325
0
                    {
326
0
                      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
327
0
                      if (larger_result == NULL)
328
0
                        {
329
0
                          errno = ENOMEM;
330
0
                          goto fail;
331
0
                        }
332
0
                      U_CPY (larger_result, resultbuf, length);
333
0
                    }
334
0
                  else
335
0
                    {
336
0
                      larger_result =
337
0
                        (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
338
0
                      if (larger_result == NULL)
339
0
                        {
340
0
                          errno = ENOMEM;
341
0
                          goto fail;
342
0
                        }
343
0
                    }
344
0
                  result = larger_result;
345
0
                  allocated = new_allocated;
346
0
                  {
347
0
                    int ret = U_UCTOMB (result + length, muc, allocated - length);
348
0
                    if (ret == -1)
349
0
                      {
350
0
                        errno = EINVAL;
351
0
                        goto fail;
352
0
                      }
353
0
                    if (ret < 0)
354
0
                      abort ();
355
0
                    length += ret;
356
0
                    goto done_appending;
357
0
                  }
358
0
                }
359
0
              }
360
0
             done_appending: ;
361
0
            }
362
0
        }
363
364
0
        if (!uc_is_case_ignorable (uc))
365
0
          last_char_except_ignorable = uc;
366
367
0
        {
368
0
          int ccc = uc_combining_class (uc);
369
0
          if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
370
0
            last_char_normal_or_above = uc;
371
0
        }
372
373
0
        s += count;
374
0
      }
375
0
  }
376
377
0
  if (nf != NULL)
378
0
    {
379
      /* Finally, normalize the result.  */
380
0
      UNIT *normalized_result;
381
382
0
      normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
383
0
      if (normalized_result == NULL)
384
0
        goto fail;
385
386
0
      free (result);
387
0
      return normalized_result;
388
0
    }
389
390
0
  if (length == 0)
391
0
    {
392
0
      if (result == NULL)
393
0
        {
394
          /* Return a non-NULL value.  NULL means error.  */
395
0
          result = (UNIT *) malloc (1);
396
0
          if (result == NULL)
397
0
            {
398
0
              errno = ENOMEM;
399
0
              goto fail;
400
0
            }
401
0
        }
402
0
    }
403
0
  else if (result != resultbuf && length < allocated)
404
0
    {
405
      /* Shrink the allocated memory if possible.  */
406
0
      UNIT *memory;
407
408
0
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
409
0
      if (memory != NULL)
410
0
        result = memory;
411
0
    }
412
413
0
  *lengthp = length;
414
0
  return result;
415
416
0
 fail:
417
0
  if (result != resultbuf)
418
0
    {
419
0
      int saved_errno = errno;
420
0
      free (result);
421
0
      errno = saved_errno;
422
0
    }
423
0
  return NULL;
424
0
}