Coverage Report

Created: 2025-12-14 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libidn2/unistring/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
69.8k
{
22
69.8k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
69.8k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
69.8k
  UNIT *result;
27
69.8k
  size_t length;
28
69.8k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
69.8k
  #define SORTBUF_PREALLOCATED 64
31
69.8k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
69.8k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
69.8k
  size_t sortbuf_allocated;
34
69.8k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
69.8k
  if (resultbuf == NULL)
38
69.8k
    {
39
69.8k
      result = NULL;
40
69.8k
      allocated = 0;
41
69.8k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
69.8k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
69.8k
  sortbuf = sortbuf_preallocated;
51
69.8k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
69.8k
  sortbuf_count = 0;
53
54
69.8k
  {
55
69.8k
    const UNIT *s_end = s + n;
56
57
69.8k
    for (;;)
58
785k
      {
59
785k
        int count;
60
785k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
785k
        int decomposed_count;
62
785k
        int i;
63
64
785k
        if (s < s_end)
65
715k
          {
66
            /* Fetch the next character.  */
67
715k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
715k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
715k
            {
77
715k
              int curr;
78
79
1.49M
              for (curr = 0; curr < decomposed_count; )
80
782k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
782k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
782k
                  int curr_decomposed_count;
85
86
782k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
782k
                  if (curr_decomposed_count >= 0)
88
33.9k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
33.9k
                      int shift = curr_decomposed_count - 1;
93
94
33.9k
                      if (shift < 0)
95
0
                        abort ();
96
33.9k
                      if (shift > 0)
97
33.3k
                        {
98
33.3k
                          int j;
99
100
33.3k
                          decomposed_count += shift;
101
33.3k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
41.6k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
8.31k
                            decomposed[j + shift] = decomposed[j];
105
33.3k
                        }
106
101k
                      for (; shift >= 0; shift--)
107
67.2k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
33.9k
                    }
109
748k
                  else
110
748k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
748k
                      curr++;
113
748k
                    }
114
782k
                }
115
715k
            }
116
715k
          }
117
69.8k
        else
118
69.8k
          {
119
69.8k
            count = 0;
120
69.8k
            decomposed_count = 0;
121
69.8k
          }
122
123
785k
        i = 0;
124
785k
        for (;;)
125
1.53M
          {
126
1.53M
            ucs4_t uc;
127
1.53M
            int ccc;
128
129
1.53M
            if (s < s_end)
130
1.46M
              {
131
                /* Fetch the next character from the decomposition.  */
132
1.46M
                if (i == decomposed_count)
133
715k
                  break;
134
748k
                uc = decomposed[i];
135
748k
                ccc = uc_combining_class (uc);
136
748k
              }
137
69.8k
            else
138
69.8k
              {
139
                /* End of string reached.  */
140
69.8k
                uc = 0;
141
69.8k
                ccc = 0;
142
69.8k
              }
143
144
818k
            if (ccc == 0)
145
692k
              {
146
692k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
692k
                if (sortbuf_count > 1)
151
24.4k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
24.4k
                                                           sortbuf + sortbuf_count);
153
154
692k
                if (composer != NULL)
155
692k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
692k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
623k
                      {
178
718k
                        for (j = 1; j < sortbuf_count; )
179
95.8k
                          {
180
95.8k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
32.9k
                              {
182
32.9k
                                ucs4_t combined =
183
32.9k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
32.9k
                                if (combined)
185
13.3k
                                  {
186
13.3k
                                    size_t k;
187
188
13.3k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
52.6k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
39.3k
                                      sortbuf[k - 1] = sortbuf[k];
192
13.3k
                                    sortbuf_count--;
193
13.3k
                                    continue;
194
13.3k
                                  }
195
32.9k
                              }
196
82.5k
                            j++;
197
82.5k
                          }
198
623k
                        if (s < s_end && sortbuf_count == 1)
199
547k
                          {
200
547k
                            ucs4_t combined =
201
547k
                              composer (sortbuf[0].code, uc);
202
547k
                            if (combined)
203
6.58k
                              {
204
6.58k
                                uc = combined;
205
6.58k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
6.58k
                                sortbuf_count = 0;
210
6.58k
                              }
211
547k
                          }
212
623k
                      }
213
692k
                  }
214
215
1.42M
                for (j = 0; j < sortbuf_count; j++)
216
729k
                  {
217
729k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
729k
                    if (length < allocated)
221
658k
                      {
222
658k
                        int ret =
223
658k
                          U_UCTOMB (result + length, muc, allocated - length);
224
658k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
658k
                        if (ret >= 0)
230
658k
                          {
231
658k
                            length += ret;
232
658k
                            goto done_appending;
233
658k
                          }
234
658k
                      }
235
70.5k
                    {
236
70.5k
                      size_t old_allocated = allocated;
237
70.5k
                      size_t new_allocated = 2 * old_allocated;
238
70.5k
                      if (new_allocated < 64)
239
67.6k
                        new_allocated = 64;
240
70.5k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
70.5k
                      {
243
70.5k
                        UNIT *larger_result;
244
70.5k
                        if (result == NULL)
245
67.6k
                          {
246
67.6k
                            larger_result =
247
67.6k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
67.6k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
67.6k
                          }
254
2.94k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
2.94k
                        else
266
2.94k
                          {
267
2.94k
                            larger_result =
268
2.94k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
2.94k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
2.94k
                          }
275
70.5k
                        result = larger_result;
276
70.5k
                        allocated = new_allocated;
277
70.5k
                        {
278
70.5k
                          int ret =
279
70.5k
                            U_UCTOMB (result + length, muc, allocated - length);
280
70.5k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
70.5k
                          if (ret < 0)
286
0
                            abort ();
287
70.5k
                          length += ret;
288
70.5k
                          goto done_appending;
289
70.5k
                        }
290
70.5k
                      }
291
70.5k
                    }
292
729k
                   done_appending: ;
293
729k
                  }
294
295
                /* sortbuf is now empty.  */
296
692k
                sortbuf_count = 0;
297
692k
              }
298
299
818k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
69.8k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
748k
            if (sortbuf_count == sortbuf_allocated)
305
636
              {
306
636
                struct ucs4_with_ccc *new_sortbuf;
307
308
636
                sortbuf_allocated = 2 * sortbuf_allocated;
309
636
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
636
                new_sortbuf =
312
636
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
636
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
636
                memcpy (new_sortbuf, sortbuf,
319
636
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
636
                if (sortbuf != sortbuf_preallocated)
321
636
                  free (sortbuf);
322
636
                sortbuf = new_sortbuf;
323
636
              }
324
748k
            sortbuf[sortbuf_count].code = uc;
325
748k
            sortbuf[sortbuf_count].ccc = ccc;
326
748k
            sortbuf_count++;
327
328
748k
            i++;
329
748k
          }
330
331
785k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
69.8k
          break;
334
335
715k
        s += count;
336
715k
      }
337
69.8k
  }
338
339
69.8k
  if (length == 0)
340
2.23k
    {
341
2.23k
      if (result == NULL)
342
2.23k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
2.23k
          result = (UNIT *) malloc (1);
345
2.23k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
2.23k
        }
351
2.23k
    }
352
67.6k
  else if (result != resultbuf && length < allocated)
353
67.3k
    {
354
      /* Shrink the allocated memory if possible.  */
355
67.3k
      UNIT *memory;
356
357
67.3k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
67.3k
      if (memory != NULL)
359
67.3k
        result = memory;
360
67.3k
    }
361
362
69.8k
  if (sortbuf_count > 0)
363
0
    abort ();
364
69.8k
  if (sortbuf != sortbuf_preallocated)
365
69.8k
    free (sortbuf);
366
367
69.8k
  *lengthp = length;
368
69.8k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
69.8k
}