Coverage Report

Created: 2025-10-13 06:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libidn2/unistring/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
69.7k
{
22
69.7k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
69.7k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
69.7k
  UNIT *result;
27
69.7k
  size_t length;
28
69.7k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
69.7k
  #define SORTBUF_PREALLOCATED 64
31
69.7k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
69.7k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
69.7k
  size_t sortbuf_allocated;
34
69.7k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
69.7k
  if (resultbuf == NULL)
38
69.7k
    {
39
69.7k
      result = NULL;
40
69.7k
      allocated = 0;
41
69.7k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
69.7k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
69.7k
  sortbuf = sortbuf_preallocated;
51
69.7k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
69.7k
  sortbuf_count = 0;
53
54
69.7k
  {
55
69.7k
    const UNIT *s_end = s + n;
56
57
69.7k
    for (;;)
58
797k
      {
59
797k
        int count;
60
797k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
797k
        int decomposed_count;
62
797k
        int i;
63
64
797k
        if (s < s_end)
65
728k
          {
66
            /* Fetch the next character.  */
67
728k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
728k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
728k
            {
77
728k
              int curr;
78
79
1.52M
              for (curr = 0; curr < decomposed_count; )
80
797k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
797k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
797k
                  int curr_decomposed_count;
85
86
797k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
797k
                  if (curr_decomposed_count >= 0)
88
34.8k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
34.8k
                      int shift = curr_decomposed_count - 1;
93
94
34.8k
                      if (shift < 0)
95
0
                        abort ();
96
34.8k
                      if (shift > 0)
97
34.1k
                        {
98
34.1k
                          int j;
99
100
34.1k
                          decomposed_count += shift;
101
34.1k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
42.9k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
8.80k
                            decomposed[j + shift] = decomposed[j];
105
34.1k
                        }
106
103k
                      for (; shift >= 0; shift--)
107
69.0k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
34.8k
                    }
109
762k
                  else
110
762k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
762k
                      curr++;
113
762k
                    }
114
797k
                }
115
728k
            }
116
728k
          }
117
69.7k
        else
118
69.7k
          {
119
69.7k
            count = 0;
120
69.7k
            decomposed_count = 0;
121
69.7k
          }
122
123
797k
        i = 0;
124
797k
        for (;;)
125
1.56M
          {
126
1.56M
            ucs4_t uc;
127
1.56M
            int ccc;
128
129
1.56M
            if (s < s_end)
130
1.49M
              {
131
                /* Fetch the next character from the decomposition.  */
132
1.49M
                if (i == decomposed_count)
133
728k
                  break;
134
762k
                uc = decomposed[i];
135
762k
                ccc = uc_combining_class (uc);
136
762k
              }
137
69.7k
            else
138
69.7k
              {
139
                /* End of string reached.  */
140
69.7k
                uc = 0;
141
69.7k
                ccc = 0;
142
69.7k
              }
143
144
832k
            if (ccc == 0)
145
707k
              {
146
707k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
707k
                if (sortbuf_count > 1)
151
24.4k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
24.4k
                                                           sortbuf + sortbuf_count);
153
154
707k
                if (composer != NULL)
155
707k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
707k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
637k
                      {
178
732k
                        for (j = 1; j < sortbuf_count; )
179
95.3k
                          {
180
95.3k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
32.7k
                              {
182
32.7k
                                ucs4_t combined =
183
32.7k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
32.7k
                                if (combined)
185
14.2k
                                  {
186
14.2k
                                    size_t k;
187
188
14.2k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
53.8k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
39.6k
                                      sortbuf[k - 1] = sortbuf[k];
192
14.2k
                                    sortbuf_count--;
193
14.2k
                                    continue;
194
14.2k
                                  }
195
32.7k
                              }
196
81.1k
                            j++;
197
81.1k
                          }
198
637k
                        if (s < s_end && sortbuf_count == 1)
199
562k
                          {
200
562k
                            ucs4_t combined =
201
562k
                              composer (sortbuf[0].code, uc);
202
562k
                            if (combined)
203
6.85k
                              {
204
6.85k
                                uc = combined;
205
6.85k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
6.85k
                                sortbuf_count = 0;
210
6.85k
                              }
211
562k
                          }
212
637k
                      }
213
707k
                  }
214
215
1.44M
                for (j = 0; j < sortbuf_count; j++)
216
741k
                  {
217
741k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
741k
                    if (length < allocated)
221
670k
                      {
222
670k
                        int ret =
223
670k
                          U_UCTOMB (result + length, muc, allocated - length);
224
670k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
670k
                        if (ret >= 0)
230
670k
                          {
231
670k
                            length += ret;
232
670k
                            goto done_appending;
233
670k
                          }
234
670k
                      }
235
70.3k
                    {
236
70.3k
                      size_t old_allocated = allocated;
237
70.3k
                      size_t new_allocated = 2 * old_allocated;
238
70.3k
                      if (new_allocated < 64)
239
67.2k
                        new_allocated = 64;
240
70.3k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
70.3k
                      {
243
70.3k
                        UNIT *larger_result;
244
70.3k
                        if (result == NULL)
245
67.2k
                          {
246
67.2k
                            larger_result =
247
67.2k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
67.2k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
67.2k
                          }
254
3.05k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
3.05k
                        else
266
3.05k
                          {
267
3.05k
                            larger_result =
268
3.05k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
3.05k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
3.05k
                          }
275
70.3k
                        result = larger_result;
276
70.3k
                        allocated = new_allocated;
277
70.3k
                        {
278
70.3k
                          int ret =
279
70.3k
                            U_UCTOMB (result + length, muc, allocated - length);
280
70.3k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
70.3k
                          if (ret < 0)
286
0
                            abort ();
287
70.3k
                          length += ret;
288
70.3k
                          goto done_appending;
289
70.3k
                        }
290
70.3k
                      }
291
70.3k
                    }
292
741k
                   done_appending: ;
293
741k
                  }
294
295
                /* sortbuf is now empty.  */
296
707k
                sortbuf_count = 0;
297
707k
              }
298
299
832k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
69.7k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
762k
            if (sortbuf_count == sortbuf_allocated)
305
632
              {
306
632
                struct ucs4_with_ccc *new_sortbuf;
307
308
632
                sortbuf_allocated = 2 * sortbuf_allocated;
309
632
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
632
                new_sortbuf =
312
632
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
632
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
632
                memcpy (new_sortbuf, sortbuf,
319
632
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
632
                if (sortbuf != sortbuf_preallocated)
321
632
                  free (sortbuf);
322
632
                sortbuf = new_sortbuf;
323
632
              }
324
762k
            sortbuf[sortbuf_count].code = uc;
325
762k
            sortbuf[sortbuf_count].ccc = ccc;
326
762k
            sortbuf_count++;
327
328
762k
            i++;
329
762k
          }
330
331
797k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
69.7k
          break;
334
335
728k
        s += count;
336
728k
      }
337
69.7k
  }
338
339
69.7k
  if (length == 0)
340
2.50k
    {
341
2.50k
      if (result == NULL)
342
2.50k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
2.50k
          result = (UNIT *) malloc (1);
345
2.50k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
2.50k
        }
351
2.50k
    }
352
67.2k
  else if (result != resultbuf && length < allocated)
353
66.9k
    {
354
      /* Shrink the allocated memory if possible.  */
355
66.9k
      UNIT *memory;
356
357
66.9k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
66.9k
      if (memory != NULL)
359
66.9k
        result = memory;
360
66.9k
    }
361
362
69.7k
  if (sortbuf_count > 0)
363
0
    abort ();
364
69.7k
  if (sortbuf != sortbuf_preallocated)
365
69.7k
    free (sortbuf);
366
367
69.7k
  *lengthp = length;
368
69.7k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
69.7k
}