Coverage Report

Created: 2025-07-12 06:18

/src/libidn2/unistring/uninorm/u-normalize-internal.h
Line
Count
Source (jump to first uncovered line)
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
68.9k
{
22
68.9k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
68.9k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
68.9k
  UNIT *result;
27
68.9k
  size_t length;
28
68.9k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
68.9k
  #define SORTBUF_PREALLOCATED 64
31
68.9k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
68.9k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
68.9k
  size_t sortbuf_allocated;
34
68.9k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
68.9k
  if (resultbuf == NULL)
38
68.9k
    {
39
68.9k
      result = NULL;
40
68.9k
      allocated = 0;
41
68.9k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
68.9k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
68.9k
  sortbuf = sortbuf_preallocated;
51
68.9k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
68.9k
  sortbuf_count = 0;
53
54
68.9k
  {
55
68.9k
    const UNIT *s_end = s + n;
56
57
68.9k
    for (;;)
58
798k
      {
59
798k
        int count;
60
798k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
798k
        int decomposed_count;
62
798k
        int i;
63
64
798k
        if (s < s_end)
65
729k
          {
66
            /* Fetch the next character.  */
67
729k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
729k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
729k
            {
77
729k
              int curr;
78
79
1.52M
              for (curr = 0; curr < decomposed_count; )
80
799k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
799k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
799k
                  int curr_decomposed_count;
85
86
799k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
799k
                  if (curr_decomposed_count >= 0)
88
35.3k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
35.3k
                      int shift = curr_decomposed_count - 1;
93
94
35.3k
                      if (shift < 0)
95
0
                        abort ();
96
35.3k
                      if (shift > 0)
97
34.6k
                        {
98
34.6k
                          int j;
99
100
34.6k
                          decomposed_count += shift;
101
34.6k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
43.6k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
8.95k
                            decomposed[j + shift] = decomposed[j];
105
34.6k
                        }
106
105k
                      for (; shift >= 0; shift--)
107
70.0k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
35.3k
                    }
109
763k
                  else
110
763k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
763k
                      curr++;
113
763k
                    }
114
799k
                }
115
729k
            }
116
729k
          }
117
68.9k
        else
118
68.9k
          {
119
68.9k
            count = 0;
120
68.9k
            decomposed_count = 0;
121
68.9k
          }
122
123
798k
        i = 0;
124
798k
        for (;;)
125
1.56M
          {
126
1.56M
            ucs4_t uc;
127
1.56M
            int ccc;
128
129
1.56M
            if (s < s_end)
130
1.49M
              {
131
                /* Fetch the next character from the decomposition.  */
132
1.49M
                if (i == decomposed_count)
133
729k
                  break;
134
763k
                uc = decomposed[i];
135
763k
                ccc = uc_combining_class (uc);
136
763k
              }
137
68.9k
            else
138
68.9k
              {
139
                /* End of string reached.  */
140
68.9k
                uc = 0;
141
68.9k
                ccc = 0;
142
68.9k
              }
143
144
832k
            if (ccc == 0)
145
699k
              {
146
699k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
699k
                if (sortbuf_count > 1)
151
24.3k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
24.3k
                                                           sortbuf + sortbuf_count);
153
154
699k
                if (composer != NULL)
155
699k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
699k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
630k
                      {
178
735k
                        for (j = 1; j < sortbuf_count; )
179
104k
                          {
180
104k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
33.8k
                              {
182
33.8k
                                ucs4_t combined =
183
33.8k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
33.8k
                                if (combined)
185
13.7k
                                  {
186
13.7k
                                    size_t k;
187
188
13.7k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
61.6k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
47.8k
                                      sortbuf[k - 1] = sortbuf[k];
192
13.7k
                                    sortbuf_count--;
193
13.7k
                                    continue;
194
13.7k
                                  }
195
33.8k
                              }
196
90.5k
                            j++;
197
90.5k
                          }
198
630k
                        if (s < s_end && sortbuf_count == 1)
199
555k
                          {
200
555k
                            ucs4_t combined =
201
555k
                              composer (sortbuf[0].code, uc);
202
555k
                            if (combined)
203
6.31k
                              {
204
6.31k
                                uc = combined;
205
6.31k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
6.31k
                                sortbuf_count = 0;
210
6.31k
                              }
211
555k
                          }
212
630k
                      }
213
699k
                  }
214
215
1.44M
                for (j = 0; j < sortbuf_count; j++)
216
743k
                  {
217
743k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
743k
                    if (length < allocated)
221
674k
                      {
222
674k
                        int ret =
223
674k
                          U_UCTOMB (result + length, muc, allocated - length);
224
674k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
674k
                        if (ret >= 0)
230
674k
                          {
231
674k
                            length += ret;
232
674k
                            goto done_appending;
233
674k
                          }
234
674k
                      }
235
69.7k
                    {
236
69.7k
                      size_t old_allocated = allocated;
237
69.7k
                      size_t new_allocated = 2 * old_allocated;
238
69.7k
                      if (new_allocated < 64)
239
66.6k
                        new_allocated = 64;
240
69.7k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
69.7k
                      {
243
69.7k
                        UNIT *larger_result;
244
69.7k
                        if (result == NULL)
245
66.6k
                          {
246
66.6k
                            larger_result =
247
66.6k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
66.6k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
66.6k
                          }
254
3.10k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
3.10k
                        else
266
3.10k
                          {
267
3.10k
                            larger_result =
268
3.10k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
3.10k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
3.10k
                          }
275
69.7k
                        result = larger_result;
276
69.7k
                        allocated = new_allocated;
277
69.7k
                        {
278
69.7k
                          int ret =
279
69.7k
                            U_UCTOMB (result + length, muc, allocated - length);
280
69.7k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
69.7k
                          if (ret < 0)
286
0
                            abort ();
287
69.7k
                          length += ret;
288
69.7k
                          goto done_appending;
289
69.7k
                        }
290
69.7k
                      }
291
69.7k
                    }
292
743k
                   done_appending: ;
293
743k
                  }
294
295
                /* sortbuf is now empty.  */
296
699k
                sortbuf_count = 0;
297
699k
              }
298
299
832k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
68.9k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
763k
            if (sortbuf_count == sortbuf_allocated)
305
653
              {
306
653
                struct ucs4_with_ccc *new_sortbuf;
307
308
653
                sortbuf_allocated = 2 * sortbuf_allocated;
309
653
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
653
                new_sortbuf =
312
653
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
653
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
653
                memcpy (new_sortbuf, sortbuf,
319
653
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
653
                if (sortbuf != sortbuf_preallocated)
321
214
                  free (sortbuf);
322
653
                sortbuf = new_sortbuf;
323
653
              }
324
763k
            sortbuf[sortbuf_count].code = uc;
325
763k
            sortbuf[sortbuf_count].ccc = ccc;
326
763k
            sortbuf_count++;
327
328
763k
            i++;
329
763k
          }
330
331
798k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
68.9k
          break;
334
335
729k
        s += count;
336
729k
      }
337
68.9k
  }
338
339
68.9k
  if (length == 0)
340
2.32k
    {
341
2.32k
      if (result == NULL)
342
2.32k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
2.32k
          result = (UNIT *) malloc (1);
345
2.32k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
2.32k
        }
351
2.32k
    }
352
66.6k
  else if (result != resultbuf && length < allocated)
353
66.3k
    {
354
      /* Shrink the allocated memory if possible.  */
355
66.3k
      UNIT *memory;
356
357
66.3k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
66.3k
      if (memory != NULL)
359
66.3k
        result = memory;
360
66.3k
    }
361
362
68.9k
  if (sortbuf_count > 0)
363
0
    abort ();
364
68.9k
  if (sortbuf != sortbuf_preallocated)
365
439
    free (sortbuf);
366
367
68.9k
  *lengthp = length;
368
68.9k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
68.9k
}