Coverage Report

Created: 2026-03-16 06:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libidn2/gl/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
72.9k
{
22
72.9k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
72.9k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
72.9k
  UNIT *result;
27
72.9k
  size_t length;
28
72.9k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
72.9k
  #define SORTBUF_PREALLOCATED 64
31
72.9k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
72.9k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
72.9k
  size_t sortbuf_allocated;
34
72.9k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
72.9k
  if (resultbuf == NULL)
38
72.9k
    {
39
72.9k
      result = NULL;
40
72.9k
      allocated = 0;
41
72.9k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
72.9k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
72.9k
  sortbuf = sortbuf_preallocated;
51
72.9k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
72.9k
  sortbuf_count = 0;
53
54
72.9k
  {
55
72.9k
    const UNIT *s_end = s + n;
56
57
72.9k
    for (;;)
58
832k
      {
59
832k
        int count;
60
832k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
832k
        int decomposed_count;
62
832k
        int i;
63
64
832k
        if (s < s_end)
65
759k
          {
66
            /* Fetch the next character.  */
67
759k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
759k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
1.58M
            for (int curr = 0; curr < decomposed_count; )
77
824k
              {
78
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
79
                   all elements are atomic.  */
80
824k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
81
824k
                int curr_decomposed_count;
82
83
824k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
84
824k
                if (curr_decomposed_count >= 0)
85
32.8k
                  {
86
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
87
                       decomposed[curr], making room.  It's not worth using
88
                       memcpy() here, since the counts are so small.  */
89
32.8k
                    int shift = curr_decomposed_count - 1;
90
91
32.8k
                    if (shift < 0)
92
0
                      abort ();
93
32.8k
                    if (shift > 0)
94
32.1k
                      {
95
32.1k
                        decomposed_count += shift;
96
32.1k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
97
0
                          abort ();
98
39.9k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
99
7.84k
                          decomposed[j + shift] = decomposed[j];
100
32.1k
                      }
101
97.8k
                    for (; shift >= 0; shift--)
102
65.0k
                      decomposed[curr + shift] = curr_decomposed[shift];
103
32.8k
                  }
104
791k
                else
105
791k
                  {
106
                    /* decomposed[curr] is atomic.  */
107
791k
                    curr++;
108
791k
                  }
109
824k
              }
110
759k
          }
111
72.9k
        else
112
72.9k
          {
113
72.9k
            count = 0;
114
72.9k
            decomposed_count = 0;
115
72.9k
          }
116
117
832k
        i = 0;
118
832k
        for (;;)
119
1.62M
          {
120
1.62M
            ucs4_t uc;
121
1.62M
            int ccc;
122
123
1.62M
            if (s < s_end)
124
1.55M
              {
125
                /* Fetch the next character from the decomposition.  */
126
1.55M
                if (i == decomposed_count)
127
759k
                  break;
128
791k
                uc = decomposed[i];
129
791k
                ccc = uc_combining_class (uc);
130
791k
              }
131
72.9k
            else
132
72.9k
              {
133
                /* End of string reached.  */
134
72.9k
                uc = 0;
135
72.9k
                ccc = 0;
136
72.9k
              }
137
138
864k
            if (ccc == 0)
139
735k
              {
140
                /* Apply the canonical ordering algorithm to the accumulated
141
                   sequence of characters.  */
142
735k
                if (sortbuf_count > 1)
143
23.6k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
144
23.6k
                                                           sortbuf + sortbuf_count);
145
146
735k
                if (composer != NULL)
147
735k
                  {
148
                    /* Attempt to combine decomposed characters, as specified
149
                       in the Unicode Standard Annex #15 "Unicode Normalization
150
                       Forms".  We need to check
151
                         1. whether the first accumulated character is a
152
                            "starter" (i.e. has ccc = 0).  This is usually the
153
                            case.  But when the string starts with a
154
                            non-starter, the sortbuf also starts with a
155
                            non-starter.  Btw, this check could also be
156
                            omitted, because the composition table has only
157
                            entries (code1, code2) for which code1 is a
158
                            starter; if the first accumulated character is not
159
                            a starter, no lookup will succeed.
160
                         2. If the sortbuf has more than one character, check
161
                            for each of these characters that are not "blocked"
162
                            from the starter (i.e. have a ccc that is higher
163
                            than the ccc of the previous character) whether it
164
                            can be combined with the first character.
165
                         3. If only one character is left in sortbuf, check
166
                            whether it can be combined with the next character
167
                            (also a starter).  */
168
735k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
169
662k
                      {
170
760k
                        for (size_t j = 1; j < sortbuf_count; )
171
98.5k
                          {
172
98.5k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
173
31.6k
                              {
174
31.6k
                                ucs4_t combined =
175
31.6k
                                  composer (sortbuf[0].code, sortbuf[j].code);
176
31.6k
                                if (combined)
177
13.2k
                                  {
178
13.2k
                                    sortbuf[0].code = combined;
179
                                    /* sortbuf[0].ccc = 0, still valid.  */
180
60.3k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
181
47.0k
                                      sortbuf[k - 1] = sortbuf[k];
182
13.2k
                                    sortbuf_count--;
183
13.2k
                                    continue;
184
13.2k
                                  }
185
31.6k
                              }
186
85.3k
                            j++;
187
85.3k
                          }
188
662k
                        if (s < s_end && sortbuf_count == 1)
189
584k
                          {
190
584k
                            ucs4_t combined =
191
584k
                              composer (sortbuf[0].code, uc);
192
584k
                            if (combined)
193
6.15k
                              {
194
6.15k
                                uc = combined;
195
6.15k
                                ccc = 0;
196
                                /* uc could be further combined with subsequent
197
                                   characters.  So don't put it into sortbuf[0] in
198
                                   this round, only in the next round.  */
199
6.15k
                                sortbuf_count = 0;
200
6.15k
                              }
201
584k
                          }
202
662k
                      }
203
735k
                  }
204
205
1.50M
                for (size_t j = 0; j < sortbuf_count; j++)
206
772k
                  {
207
772k
                    ucs4_t muc = sortbuf[j].code;
208
209
                    /* Append muc to the result accumulator.  */
210
772k
                    if (length < allocated)
211
698k
                      {
212
698k
                        int ret =
213
698k
                          U_UCTOMB (result + length, muc, allocated - length);
214
698k
                        if (ret == -1)
215
0
                          {
216
0
                            errno = EINVAL;
217
0
                            goto fail;
218
0
                          }
219
698k
                        if (ret >= 0)
220
698k
                          {
221
698k
                            length += ret;
222
698k
                            goto done_appending;
223
698k
                          }
224
698k
                      }
225
73.7k
                    {
226
73.7k
                      size_t old_allocated = allocated;
227
73.7k
                      size_t new_allocated = 2 * old_allocated;
228
73.7k
                      if (new_allocated < 64)
229
70.6k
                        new_allocated = 64;
230
73.7k
                      if (new_allocated < old_allocated) /* integer overflow? */
231
0
                        abort ();
232
73.7k
                      {
233
73.7k
                        UNIT *larger_result;
234
73.7k
                        if (result == NULL)
235
70.6k
                          {
236
70.6k
                            larger_result =
237
70.6k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
238
70.6k
                            if (larger_result == NULL)
239
0
                              {
240
0
                                errno = ENOMEM;
241
0
                                goto fail;
242
0
                              }
243
70.6k
                          }
244
3.14k
                        else if (result == resultbuf)
245
0
                          {
246
0
                            larger_result =
247
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
0
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
0
                            U_CPY (larger_result, resultbuf, length);
254
0
                          }
255
3.14k
                        else
256
3.14k
                          {
257
3.14k
                            larger_result =
258
3.14k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
259
3.14k
                            if (larger_result == NULL)
260
0
                              {
261
0
                                errno = ENOMEM;
262
0
                                goto fail;
263
0
                              }
264
3.14k
                          }
265
73.7k
                        result = larger_result;
266
73.7k
                        allocated = new_allocated;
267
73.7k
                        {
268
73.7k
                          int ret =
269
73.7k
                            U_UCTOMB (result + length, muc, allocated - length);
270
73.7k
                          if (ret == -1)
271
0
                            {
272
0
                              errno = EINVAL;
273
0
                              goto fail;
274
0
                            }
275
73.7k
                          if (ret < 0)
276
0
                            abort ();
277
73.7k
                          length += ret;
278
73.7k
                          goto done_appending;
279
73.7k
                        }
280
73.7k
                      }
281
73.7k
                    }
282
772k
                   done_appending: ;
283
772k
                  }
284
285
                /* sortbuf is now empty.  */
286
735k
                sortbuf_count = 0;
287
735k
              }
288
289
864k
            if (!(s < s_end))
290
              /* End of string reached.  */
291
72.9k
              break;
292
293
            /* Append (uc, ccc) to sortbuf.  */
294
791k
            if (sortbuf_count == sortbuf_allocated)
295
660
              {
296
660
                struct ucs4_with_ccc *new_sortbuf;
297
298
660
                sortbuf_allocated = 2 * sortbuf_allocated;
299
660
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
300
0
                  abort ();
301
660
                new_sortbuf =
302
660
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
303
660
                if (new_sortbuf == NULL)
304
0
                  {
305
0
                    errno = ENOMEM;
306
0
                    goto fail;
307
0
                  }
308
660
                memcpy (new_sortbuf, sortbuf,
309
660
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
310
660
                if (sortbuf != sortbuf_preallocated)
311
660
                  free (sortbuf);
312
660
                sortbuf = new_sortbuf;
313
660
              }
314
791k
            sortbuf[sortbuf_count].code = uc;
315
791k
            sortbuf[sortbuf_count].ccc = ccc;
316
791k
            sortbuf_count++;
317
318
791k
            i++;
319
791k
          }
320
321
832k
        if (!(s < s_end))
322
          /* End of string reached.  */
323
72.9k
          break;
324
325
759k
        s += count;
326
759k
      }
327
72.9k
  }
328
329
72.9k
  if (length == 0)
330
2.32k
    {
331
2.32k
      if (result == NULL)
332
2.32k
        {
333
          /* Return a non-NULL value.  NULL means error.  */
334
2.32k
          result = (UNIT *) malloc (1);
335
2.32k
          if (result == NULL)
336
0
            {
337
0
              errno = ENOMEM;
338
0
              goto fail;
339
0
            }
340
2.32k
        }
341
2.32k
    }
342
70.6k
  else if (result != resultbuf && length < allocated)
343
70.3k
    {
344
      /* Shrink the allocated memory if possible.  */
345
70.3k
      UNIT *memory;
346
347
70.3k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
348
70.3k
      if (memory != NULL)
349
70.3k
        result = memory;
350
70.3k
    }
351
352
72.9k
  if (sortbuf_count > 0)
353
0
    abort ();
354
72.9k
  if (sortbuf != sortbuf_preallocated)
355
72.9k
    free (sortbuf);
356
357
72.9k
  *lengthp = length;
358
72.9k
  return result;
359
360
0
 fail:
361
0
  {
362
0
    int saved_errno = errno;
363
0
    if (sortbuf != sortbuf_preallocated)
364
0
      free (sortbuf);
365
0
    if (result != resultbuf)
366
0
      free (result);
367
0
    errno = saved_errno;
368
0
  }
369
  return NULL;
370
72.9k
}