Coverage Report

Created: 2026-01-09 06:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libidn2/gl/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
71.1k
{
22
71.1k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
71.1k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
71.1k
  UNIT *result;
27
71.1k
  size_t length;
28
71.1k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
71.1k
  #define SORTBUF_PREALLOCATED 64
31
71.1k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
71.1k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
71.1k
  size_t sortbuf_allocated;
34
71.1k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
71.1k
  if (resultbuf == NULL)
38
71.1k
    {
39
71.1k
      result = NULL;
40
71.1k
      allocated = 0;
41
71.1k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
71.1k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
71.1k
  sortbuf = sortbuf_preallocated;
51
71.1k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
71.1k
  sortbuf_count = 0;
53
54
71.1k
  {
55
71.1k
    const UNIT *s_end = s + n;
56
57
71.1k
    for (;;)
58
816k
      {
59
816k
        int count;
60
816k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
816k
        int decomposed_count;
62
816k
        int i;
63
64
816k
        if (s < s_end)
65
745k
          {
66
            /* Fetch the next character.  */
67
745k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
745k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
1.55M
            for (int curr = 0; curr < decomposed_count; )
77
809k
              {
78
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
79
                   all elements are atomic.  */
80
809k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
81
809k
                int curr_decomposed_count;
82
83
809k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
84
809k
                if (curr_decomposed_count >= 0)
85
32.4k
                  {
86
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
87
                       decomposed[curr], making room.  It's not worth using
88
                       memcpy() here, since the counts are so small.  */
89
32.4k
                    int shift = curr_decomposed_count - 1;
90
91
32.4k
                    if (shift < 0)
92
0
                      abort ();
93
32.4k
                    if (shift > 0)
94
31.6k
                      {
95
31.6k
                        decomposed_count += shift;
96
31.6k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
97
0
                          abort ();
98
39.1k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
99
7.51k
                          decomposed[j + shift] = decomposed[j];
100
31.6k
                      }
101
96.5k
                    for (; shift >= 0; shift--)
102
64.0k
                      decomposed[curr + shift] = curr_decomposed[shift];
103
32.4k
                  }
104
776k
                else
105
776k
                  {
106
                    /* decomposed[curr] is atomic.  */
107
776k
                    curr++;
108
776k
                  }
109
809k
              }
110
745k
          }
111
71.1k
        else
112
71.1k
          {
113
71.1k
            count = 0;
114
71.1k
            decomposed_count = 0;
115
71.1k
          }
116
117
816k
        i = 0;
118
816k
        for (;;)
119
1.59M
          {
120
1.59M
            ucs4_t uc;
121
1.59M
            int ccc;
122
123
1.59M
            if (s < s_end)
124
1.52M
              {
125
                /* Fetch the next character from the decomposition.  */
126
1.52M
                if (i == decomposed_count)
127
745k
                  break;
128
776k
                uc = decomposed[i];
129
776k
                ccc = uc_combining_class (uc);
130
776k
              }
131
71.1k
            else
132
71.1k
              {
133
                /* End of string reached.  */
134
71.1k
                uc = 0;
135
71.1k
                ccc = 0;
136
71.1k
              }
137
138
848k
            if (ccc == 0)
139
719k
              {
140
                /* Apply the canonical ordering algorithm to the accumulated
141
                   sequence of characters.  */
142
719k
                if (sortbuf_count > 1)
143
27.1k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
144
27.1k
                                                           sortbuf + sortbuf_count);
145
146
719k
                if (composer != NULL)
147
719k
                  {
148
                    /* Attempt to combine decomposed characters, as specified
149
                       in the Unicode Standard Annex #15 "Unicode Normalization
150
                       Forms".  We need to check
151
                         1. whether the first accumulated character is a
152
                            "starter" (i.e. has ccc = 0).  This is usually the
153
                            case.  But when the string starts with a
154
                            non-starter, the sortbuf also starts with a
155
                            non-starter.  Btw, this check could also be
156
                            omitted, because the composition table has only
157
                            entries (code1, code2) for which code1 is a
158
                            starter; if the first accumulated character is not
159
                            a starter, no lookup will succeed.
160
                         2. If the sortbuf has more than one character, check
161
                            for each of these characters that are not "blocked"
162
                            from the starter (i.e. have a ccc that is higher
163
                            than the ccc of the previous character) whether it
164
                            can be combined with the first character.
165
                         3. If only one character is left in sortbuf, check
166
                            whether it can be combined with the next character
167
                            (also a starter).  */
168
719k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
169
648k
                      {
170
746k
                        for (size_t j = 1; j < sortbuf_count; )
171
98.3k
                          {
172
98.3k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
173
38.6k
                              {
174
38.6k
                                ucs4_t combined =
175
38.6k
                                  composer (sortbuf[0].code, sortbuf[j].code);
176
38.6k
                                if (combined)
177
13.5k
                                  {
178
13.5k
                                    sortbuf[0].code = combined;
179
                                    /* sortbuf[0].ccc = 0, still valid.  */
180
60.1k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
181
46.6k
                                      sortbuf[k - 1] = sortbuf[k];
182
13.5k
                                    sortbuf_count--;
183
13.5k
                                    continue;
184
13.5k
                                  }
185
38.6k
                              }
186
84.7k
                            j++;
187
84.7k
                          }
188
648k
                        if (s < s_end && sortbuf_count == 1)
189
568k
                          {
190
568k
                            ucs4_t combined =
191
568k
                              composer (sortbuf[0].code, uc);
192
568k
                            if (combined)
193
5.87k
                              {
194
5.87k
                                uc = combined;
195
5.87k
                                ccc = 0;
196
                                /* uc could be further combined with subsequent
197
                                   characters.  So don't put it into sortbuf[0] in
198
                                   this round, only in the next round.  */
199
5.87k
                                sortbuf_count = 0;
200
5.87k
                              }
201
568k
                          }
202
648k
                      }
203
719k
                  }
204
205
1.47M
                for (size_t j = 0; j < sortbuf_count; j++)
206
757k
                  {
207
757k
                    ucs4_t muc = sortbuf[j].code;
208
209
                    /* Append muc to the result accumulator.  */
210
757k
                    if (length < allocated)
211
685k
                      {
212
685k
                        int ret =
213
685k
                          U_UCTOMB (result + length, muc, allocated - length);
214
685k
                        if (ret == -1)
215
0
                          {
216
0
                            errno = EINVAL;
217
0
                            goto fail;
218
0
                          }
219
685k
                        if (ret >= 0)
220
685k
                          {
221
685k
                            length += ret;
222
685k
                            goto done_appending;
223
685k
                          }
224
685k
                      }
225
71.9k
                    {
226
71.9k
                      size_t old_allocated = allocated;
227
71.9k
                      size_t new_allocated = 2 * old_allocated;
228
71.9k
                      if (new_allocated < 64)
229
68.8k
                        new_allocated = 64;
230
71.9k
                      if (new_allocated < old_allocated) /* integer overflow? */
231
0
                        abort ();
232
71.9k
                      {
233
71.9k
                        UNIT *larger_result;
234
71.9k
                        if (result == NULL)
235
68.8k
                          {
236
68.8k
                            larger_result =
237
68.8k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
238
68.8k
                            if (larger_result == NULL)
239
0
                              {
240
0
                                errno = ENOMEM;
241
0
                                goto fail;
242
0
                              }
243
68.8k
                          }
244
3.09k
                        else if (result == resultbuf)
245
0
                          {
246
0
                            larger_result =
247
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
0
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
0
                            U_CPY (larger_result, resultbuf, length);
254
0
                          }
255
3.09k
                        else
256
3.09k
                          {
257
3.09k
                            larger_result =
258
3.09k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
259
3.09k
                            if (larger_result == NULL)
260
0
                              {
261
0
                                errno = ENOMEM;
262
0
                                goto fail;
263
0
                              }
264
3.09k
                          }
265
71.9k
                        result = larger_result;
266
71.9k
                        allocated = new_allocated;
267
71.9k
                        {
268
71.9k
                          int ret =
269
71.9k
                            U_UCTOMB (result + length, muc, allocated - length);
270
71.9k
                          if (ret == -1)
271
0
                            {
272
0
                              errno = EINVAL;
273
0
                              goto fail;
274
0
                            }
275
71.9k
                          if (ret < 0)
276
0
                            abort ();
277
71.9k
                          length += ret;
278
71.9k
                          goto done_appending;
279
71.9k
                        }
280
71.9k
                      }
281
71.9k
                    }
282
757k
                   done_appending: ;
283
757k
                  }
284
285
                /* sortbuf is now empty.  */
286
719k
                sortbuf_count = 0;
287
719k
              }
288
289
848k
            if (!(s < s_end))
290
              /* End of string reached.  */
291
71.1k
              break;
292
293
            /* Append (uc, ccc) to sortbuf.  */
294
776k
            if (sortbuf_count == sortbuf_allocated)
295
603
              {
296
603
                struct ucs4_with_ccc *new_sortbuf;
297
298
603
                sortbuf_allocated = 2 * sortbuf_allocated;
299
603
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
300
0
                  abort ();
301
603
                new_sortbuf =
302
603
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
303
603
                if (new_sortbuf == NULL)
304
0
                  {
305
0
                    errno = ENOMEM;
306
0
                    goto fail;
307
0
                  }
308
603
                memcpy (new_sortbuf, sortbuf,
309
603
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
310
603
                if (sortbuf != sortbuf_preallocated)
311
603
                  free (sortbuf);
312
603
                sortbuf = new_sortbuf;
313
603
              }
314
776k
            sortbuf[sortbuf_count].code = uc;
315
776k
            sortbuf[sortbuf_count].ccc = ccc;
316
776k
            sortbuf_count++;
317
318
776k
            i++;
319
776k
          }
320
321
816k
        if (!(s < s_end))
322
          /* End of string reached.  */
323
71.1k
          break;
324
325
745k
        s += count;
326
745k
      }
327
71.1k
  }
328
329
71.1k
  if (length == 0)
330
2.25k
    {
331
2.25k
      if (result == NULL)
332
2.25k
        {
333
          /* Return a non-NULL value.  NULL means error.  */
334
2.25k
          result = (UNIT *) malloc (1);
335
2.25k
          if (result == NULL)
336
0
            {
337
0
              errno = ENOMEM;
338
0
              goto fail;
339
0
            }
340
2.25k
        }
341
2.25k
    }
342
68.8k
  else if (result != resultbuf && length < allocated)
343
68.6k
    {
344
      /* Shrink the allocated memory if possible.  */
345
68.6k
      UNIT *memory;
346
347
68.6k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
348
68.6k
      if (memory != NULL)
349
68.6k
        result = memory;
350
68.6k
    }
351
352
71.1k
  if (sortbuf_count > 0)
353
0
    abort ();
354
71.1k
  if (sortbuf != sortbuf_preallocated)
355
71.1k
    free (sortbuf);
356
357
71.1k
  *lengthp = length;
358
71.1k
  return result;
359
360
0
 fail:
361
0
  {
362
0
    int saved_errno = errno;
363
0
    if (sortbuf != sortbuf_preallocated)
364
0
      free (sortbuf);
365
0
    if (result != resultbuf)
366
0
      free (result);
367
0
    errno = saved_errno;
368
0
  }
369
  return NULL;
370
71.1k
}