Coverage Report

Created: 2025-10-10 06:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libidn2/unistring/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
69.6k
{
22
69.6k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
69.6k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
69.6k
  UNIT *result;
27
69.6k
  size_t length;
28
69.6k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
69.6k
  #define SORTBUF_PREALLOCATED 64
31
69.6k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
69.6k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
69.6k
  size_t sortbuf_allocated;
34
69.6k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
69.6k
  if (resultbuf == NULL)
38
69.6k
    {
39
69.6k
      result = NULL;
40
69.6k
      allocated = 0;
41
69.6k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
69.6k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
69.6k
  sortbuf = sortbuf_preallocated;
51
69.6k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
69.6k
  sortbuf_count = 0;
53
54
69.6k
  {
55
69.6k
    const UNIT *s_end = s + n;
56
57
69.6k
    for (;;)
58
790k
      {
59
790k
        int count;
60
790k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
790k
        int decomposed_count;
62
790k
        int i;
63
64
790k
        if (s < s_end)
65
721k
          {
66
            /* Fetch the next character.  */
67
721k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
721k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
721k
            {
77
721k
              int curr;
78
79
1.51M
              for (curr = 0; curr < decomposed_count; )
80
788k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
788k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
788k
                  int curr_decomposed_count;
85
86
788k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
788k
                  if (curr_decomposed_count >= 0)
88
34.1k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
34.1k
                      int shift = curr_decomposed_count - 1;
93
94
34.1k
                      if (shift < 0)
95
0
                        abort ();
96
34.1k
                      if (shift > 0)
97
33.3k
                        {
98
33.3k
                          int j;
99
100
33.3k
                          decomposed_count += shift;
101
33.3k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
42.1k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
8.80k
                            decomposed[j + shift] = decomposed[j];
105
33.3k
                        }
106
101k
                      for (; shift >= 0; shift--)
107
67.5k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
34.1k
                    }
109
754k
                  else
110
754k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
754k
                      curr++;
113
754k
                    }
114
788k
                }
115
721k
            }
116
721k
          }
117
69.6k
        else
118
69.6k
          {
119
69.6k
            count = 0;
120
69.6k
            decomposed_count = 0;
121
69.6k
          }
122
123
790k
        i = 0;
124
790k
        for (;;)
125
1.54M
          {
126
1.54M
            ucs4_t uc;
127
1.54M
            int ccc;
128
129
1.54M
            if (s < s_end)
130
1.47M
              {
131
                /* Fetch the next character from the decomposition.  */
132
1.47M
                if (i == decomposed_count)
133
721k
                  break;
134
754k
                uc = decomposed[i];
135
754k
                ccc = uc_combining_class (uc);
136
754k
              }
137
69.6k
            else
138
69.6k
              {
139
                /* End of string reached.  */
140
69.6k
                uc = 0;
141
69.6k
                ccc = 0;
142
69.6k
              }
143
144
824k
            if (ccc == 0)
145
700k
              {
146
700k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
700k
                if (sortbuf_count > 1)
151
22.9k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
22.9k
                                                           sortbuf + sortbuf_count);
153
154
700k
                if (composer != NULL)
155
700k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
700k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
630k
                      {
178
724k
                        for (j = 1; j < sortbuf_count; )
179
93.6k
                          {
180
93.6k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
31.1k
                              {
182
31.1k
                                ucs4_t combined =
183
31.1k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
31.1k
                                if (combined)
185
12.9k
                                  {
186
12.9k
                                    size_t k;
187
188
12.9k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
52.1k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
39.2k
                                      sortbuf[k - 1] = sortbuf[k];
192
12.9k
                                    sortbuf_count--;
193
12.9k
                                    continue;
194
12.9k
                                  }
195
31.1k
                              }
196
80.6k
                            j++;
197
80.6k
                          }
198
630k
                        if (s < s_end && sortbuf_count == 1)
199
555k
                          {
200
555k
                            ucs4_t combined =
201
555k
                              composer (sortbuf[0].code, uc);
202
555k
                            if (combined)
203
7.07k
                              {
204
7.07k
                                uc = combined;
205
7.07k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
7.07k
                                sortbuf_count = 0;
210
7.07k
                              }
211
555k
                          }
212
630k
                      }
213
700k
                  }
214
215
1.43M
                for (j = 0; j < sortbuf_count; j++)
216
734k
                  {
217
734k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
734k
                    if (length < allocated)
221
664k
                      {
222
664k
                        int ret =
223
664k
                          U_UCTOMB (result + length, muc, allocated - length);
224
664k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
664k
                        if (ret >= 0)
230
664k
                          {
231
664k
                            length += ret;
232
664k
                            goto done_appending;
233
664k
                          }
234
664k
                      }
235
70.2k
                    {
236
70.2k
                      size_t old_allocated = allocated;
237
70.2k
                      size_t new_allocated = 2 * old_allocated;
238
70.2k
                      if (new_allocated < 64)
239
67.2k
                        new_allocated = 64;
240
70.2k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
70.2k
                      {
243
70.2k
                        UNIT *larger_result;
244
70.2k
                        if (result == NULL)
245
67.2k
                          {
246
67.2k
                            larger_result =
247
67.2k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
67.2k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
67.2k
                          }
254
3.04k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
3.04k
                        else
266
3.04k
                          {
267
3.04k
                            larger_result =
268
3.04k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
3.04k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
3.04k
                          }
275
70.2k
                        result = larger_result;
276
70.2k
                        allocated = new_allocated;
277
70.2k
                        {
278
70.2k
                          int ret =
279
70.2k
                            U_UCTOMB (result + length, muc, allocated - length);
280
70.2k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
70.2k
                          if (ret < 0)
286
0
                            abort ();
287
70.2k
                          length += ret;
288
70.2k
                          goto done_appending;
289
70.2k
                        }
290
70.2k
                      }
291
70.2k
                    }
292
734k
                   done_appending: ;
293
734k
                  }
294
295
                /* sortbuf is now empty.  */
296
700k
                sortbuf_count = 0;
297
700k
              }
298
299
824k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
69.6k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
754k
            if (sortbuf_count == sortbuf_allocated)
305
636
              {
306
636
                struct ucs4_with_ccc *new_sortbuf;
307
308
636
                sortbuf_allocated = 2 * sortbuf_allocated;
309
636
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
636
                new_sortbuf =
312
636
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
636
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
636
                memcpy (new_sortbuf, sortbuf,
319
636
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
636
                if (sortbuf != sortbuf_preallocated)
321
636
                  free (sortbuf);
322
636
                sortbuf = new_sortbuf;
323
636
              }
324
754k
            sortbuf[sortbuf_count].code = uc;
325
754k
            sortbuf[sortbuf_count].ccc = ccc;
326
754k
            sortbuf_count++;
327
328
754k
            i++;
329
754k
          }
330
331
790k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
69.6k
          break;
334
335
721k
        s += count;
336
721k
      }
337
69.6k
  }
338
339
69.6k
  if (length == 0)
340
2.44k
    {
341
2.44k
      if (result == NULL)
342
2.44k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
2.44k
          result = (UNIT *) malloc (1);
345
2.44k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
2.44k
        }
351
2.44k
    }
352
67.2k
  else if (result != resultbuf && length < allocated)
353
66.9k
    {
354
      /* Shrink the allocated memory if possible.  */
355
66.9k
      UNIT *memory;
356
357
66.9k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
66.9k
      if (memory != NULL)
359
66.9k
        result = memory;
360
66.9k
    }
361
362
69.6k
  if (sortbuf_count > 0)
363
0
    abort ();
364
69.6k
  if (sortbuf != sortbuf_preallocated)
365
69.6k
    free (sortbuf);
366
367
69.6k
  *lengthp = length;
368
69.6k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
69.6k
}