Coverage Report

Created: 2026-06-08 06:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
39.2k
{
22
39.2k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
39.2k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
39.2k
  UNIT *result;
27
39.2k
  size_t allocated;
28
39.2k
  if (resultbuf == NULL)
29
39.2k
    {
30
39.2k
      result = NULL;
31
39.2k
      allocated = 0;
32
39.2k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
39.2k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
39.2k
  #define SORTBUF_PREALLOCATED 64
42
39.2k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
39.2k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
39.2k
    sortbuf_preallocated;
45
39.2k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
39.2k
  size_t sortbuf_count = 0;
47
48
39.2k
  {
49
39.2k
    const UNIT *s_end = s + n;
50
51
39.2k
    for (;;)
52
372k
      {
53
372k
        int count;
54
372k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
372k
        int decomposed_count;
56
57
372k
        if (s < s_end)
58
333k
          {
59
            /* Fetch the next character.  */
60
333k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
333k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
696k
            for (int curr = 0; curr < decomposed_count; )
70
362k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
362k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
362k
                int curr_decomposed_count;
75
76
362k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
362k
                if (curr_decomposed_count >= 0)
78
14.6k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
14.6k
                    int shift = curr_decomposed_count - 1;
83
84
14.6k
                    if (shift < 0)
85
0
                      abort ();
86
14.6k
                    if (shift > 0)
87
14.4k
                      {
88
14.4k
                        decomposed_count += shift;
89
14.4k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
20.1k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
5.71k
                          decomposed[j + shift] = decomposed[j];
93
14.4k
                      }
94
43.8k
                    for (; shift >= 0; shift--)
95
29.1k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
14.6k
                  }
97
347k
                else
98
347k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
347k
                    curr++;
101
347k
                  }
102
362k
              }
103
333k
          }
104
39.2k
        else
105
39.2k
          {
106
39.2k
            count = 0;
107
39.2k
            decomposed_count = 0;
108
39.2k
          }
109
110
372k
        int i = 0;
111
372k
        for (;;)
112
720k
          {
113
720k
            ucs4_t uc;
114
720k
            int ccc;
115
116
720k
            if (s < s_end)
117
681k
              {
118
                /* Fetch the next character from the decomposition.  */
119
681k
                if (i == decomposed_count)
120
333k
                  break;
121
347k
                uc = decomposed[i];
122
347k
                ccc = uc_combining_class (uc);
123
347k
              }
124
39.2k
            else
125
39.2k
              {
126
                /* End of string reached.  */
127
39.2k
                uc = 0;
128
39.2k
                ccc = 0;
129
39.2k
              }
130
131
387k
            if (ccc == 0)
132
330k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
330k
                if (sortbuf_count > 1)
136
12.3k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
12.3k
                                                           sortbuf + sortbuf_count);
138
139
330k
                if (composer != NULL)
140
330k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
330k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
291k
                      {
163
346k
                        for (size_t j = 1; j < sortbuf_count; )
164
55.0k
                          {
165
55.0k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
25.9k
                              {
167
25.9k
                                ucs4_t combined =
168
25.9k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
25.9k
                                if (combined)
170
10.3k
                                  {
171
10.3k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
35.4k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
25.0k
                                      sortbuf[k - 1] = sortbuf[k];
175
10.3k
                                    sortbuf_count--;
176
10.3k
                                    continue;
177
10.3k
                                  }
178
25.9k
                              }
179
44.6k
                            j++;
180
44.6k
                          }
181
291k
                        if (s < s_end && sortbuf_count == 1)
182
245k
                          {
183
245k
                            ucs4_t combined =
184
245k
                              composer (sortbuf[0].code, uc);
185
245k
                            if (combined)
186
3.15k
                              {
187
3.15k
                                uc = combined;
188
3.15k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
3.15k
                                sortbuf_count = 0;
193
3.15k
                              }
194
245k
                          }
195
291k
                      }
196
330k
                  }
197
198
665k
                for (size_t j = 0; j < sortbuf_count; j++)
199
334k
                  {
200
334k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
334k
                    if (length < allocated)
204
294k
                      {
205
294k
                        int ret =
206
294k
                          U_UCTOMB (result + length, muc, allocated - length);
207
294k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
294k
                        if (ret >= 0)
213
294k
                          {
214
294k
                            length += ret;
215
294k
                            goto done_appending;
216
294k
                          }
217
294k
                      }
218
39.7k
                    {
219
39.7k
                      size_t old_allocated = allocated;
220
39.7k
                      size_t new_allocated = 2 * old_allocated;
221
39.7k
                      if (new_allocated < 64)
222
38.6k
                        new_allocated = 64;
223
39.7k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
39.7k
                      {
226
39.7k
                        UNIT *larger_result;
227
39.7k
                        if (result == NULL)
228
38.6k
                          {
229
38.6k
                            larger_result =
230
38.6k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
38.6k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
38.6k
                          }
237
1.07k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
1.07k
                        else
249
1.07k
                          {
250
1.07k
                            larger_result =
251
1.07k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
1.07k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
1.07k
                          }
258
39.7k
                        result = larger_result;
259
39.7k
                        allocated = new_allocated;
260
39.7k
                        {
261
39.7k
                          int ret =
262
39.7k
                            U_UCTOMB (result + length, muc, allocated - length);
263
39.7k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
39.7k
                          if (ret < 0)
269
0
                            abort ();
270
39.7k
                          length += ret;
271
39.7k
                          goto done_appending;
272
39.7k
                        }
273
39.7k
                      }
274
39.7k
                    }
275
334k
                   done_appending: ;
276
334k
                  }
277
278
                /* sortbuf is now empty.  */
279
330k
                sortbuf_count = 0;
280
330k
              }
281
282
387k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
39.2k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
347k
            if (sortbuf_count == sortbuf_allocated)
288
182
              {
289
182
                sortbuf_allocated = 2 * sortbuf_allocated;
290
182
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
182
                struct ucs4_with_ccc *new_sortbuf =
293
182
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
182
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
182
                memcpy (new_sortbuf, sortbuf,
300
182
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
182
                if (sortbuf != sortbuf_preallocated)
302
182
                  free (sortbuf);
303
182
                sortbuf = new_sortbuf;
304
182
              }
305
347k
            sortbuf[sortbuf_count].code = uc;
306
347k
            sortbuf[sortbuf_count].ccc = ccc;
307
347k
            sortbuf_count++;
308
309
347k
            i++;
310
347k
          }
311
312
372k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
39.2k
          break;
315
316
333k
        s += count;
317
333k
      }
318
39.2k
  }
319
320
39.2k
  if (length == 0)
321
576
    {
322
576
      if (result == NULL)
323
576
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
576
          result = (UNIT *) malloc (1);
326
576
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
576
        }
332
576
    }
333
38.6k
  else if (result != resultbuf && length < allocated)
334
38.5k
    {
335
      /* Shrink the allocated memory if possible.  */
336
38.5k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
38.5k
      if (memory != NULL)
338
38.5k
        result = memory;
339
38.5k
    }
340
341
39.2k
  if (sortbuf_count > 0)
342
0
    abort ();
343
39.2k
  if (sortbuf != sortbuf_preallocated)
344
39.2k
    free (sortbuf);
345
346
39.2k
  *lengthp = length;
347
39.2k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
39.2k
}
u16_normalize
Line
Count
Source
21
10.5k
{
22
10.5k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
10.5k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
10.5k
  UNIT *result;
27
10.5k
  size_t allocated;
28
10.5k
  if (resultbuf == NULL)
29
10.5k
    {
30
10.5k
      result = NULL;
31
10.5k
      allocated = 0;
32
10.5k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
10.5k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
10.5k
  #define SORTBUF_PREALLOCATED 64
42
10.5k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
10.5k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
10.5k
    sortbuf_preallocated;
45
10.5k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
10.5k
  size_t sortbuf_count = 0;
47
48
10.5k
  {
49
10.5k
    const UNIT *s_end = s + n;
50
51
10.5k
    for (;;)
52
58.5k
      {
53
58.5k
        int count;
54
58.5k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
58.5k
        int decomposed_count;
56
57
58.5k
        if (s < s_end)
58
48.0k
          {
59
            /* Fetch the next character.  */
60
48.0k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
48.0k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
96.0k
            for (int curr = 0; curr < decomposed_count; )
70
48.0k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
48.0k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
48.0k
                int curr_decomposed_count;
75
76
48.0k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
48.0k
                if (curr_decomposed_count >= 0)
78
0
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
0
                    int shift = curr_decomposed_count - 1;
83
84
0
                    if (shift < 0)
85
0
                      abort ();
86
0
                    if (shift > 0)
87
0
                      {
88
0
                        decomposed_count += shift;
89
0
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
0
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
0
                          decomposed[j + shift] = decomposed[j];
93
0
                      }
94
0
                    for (; shift >= 0; shift--)
95
0
                      decomposed[curr + shift] = curr_decomposed[shift];
96
0
                  }
97
48.0k
                else
98
48.0k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
48.0k
                    curr++;
101
48.0k
                  }
102
48.0k
              }
103
48.0k
          }
104
10.5k
        else
105
10.5k
          {
106
10.5k
            count = 0;
107
10.5k
            decomposed_count = 0;
108
10.5k
          }
109
110
58.5k
        int i = 0;
111
58.5k
        for (;;)
112
106k
          {
113
106k
            ucs4_t uc;
114
106k
            int ccc;
115
116
106k
            if (s < s_end)
117
96.0k
              {
118
                /* Fetch the next character from the decomposition.  */
119
96.0k
                if (i == decomposed_count)
120
48.0k
                  break;
121
48.0k
                uc = decomposed[i];
122
48.0k
                ccc = uc_combining_class (uc);
123
48.0k
              }
124
10.5k
            else
125
10.5k
              {
126
                /* End of string reached.  */
127
10.5k
                uc = 0;
128
10.5k
                ccc = 0;
129
10.5k
              }
130
131
58.5k
            if (ccc == 0)
132
58.5k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
58.5k
                if (sortbuf_count > 1)
136
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
0
                                                           sortbuf + sortbuf_count);
138
139
58.5k
                if (composer != NULL)
140
58.5k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
58.5k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
48.0k
                      {
163
48.0k
                        for (size_t j = 1; j < sortbuf_count; )
164
0
                          {
165
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
0
                              {
167
0
                                ucs4_t combined =
168
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
0
                                if (combined)
170
0
                                  {
171
0
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
0
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
0
                                      sortbuf[k - 1] = sortbuf[k];
175
0
                                    sortbuf_count--;
176
0
                                    continue;
177
0
                                  }
178
0
                              }
179
0
                            j++;
180
0
                          }
181
48.0k
                        if (s < s_end && sortbuf_count == 1)
182
37.5k
                          {
183
37.5k
                            ucs4_t combined =
184
37.5k
                              composer (sortbuf[0].code, uc);
185
37.5k
                            if (combined)
186
0
                              {
187
0
                                uc = combined;
188
0
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
0
                                sortbuf_count = 0;
193
0
                              }
194
37.5k
                          }
195
48.0k
                      }
196
58.5k
                  }
197
198
106k
                for (size_t j = 0; j < sortbuf_count; j++)
199
48.0k
                  {
200
48.0k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
48.0k
                    if (length < allocated)
204
37.5k
                      {
205
37.5k
                        int ret =
206
37.5k
                          U_UCTOMB (result + length, muc, allocated - length);
207
37.5k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
37.5k
                        if (ret >= 0)
213
37.5k
                          {
214
37.5k
                            length += ret;
215
37.5k
                            goto done_appending;
216
37.5k
                          }
217
37.5k
                      }
218
10.5k
                    {
219
10.5k
                      size_t old_allocated = allocated;
220
10.5k
                      size_t new_allocated = 2 * old_allocated;
221
10.5k
                      if (new_allocated < 64)
222
10.5k
                        new_allocated = 64;
223
10.5k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
10.5k
                      {
226
10.5k
                        UNIT *larger_result;
227
10.5k
                        if (result == NULL)
228
10.5k
                          {
229
10.5k
                            larger_result =
230
10.5k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
10.5k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
10.5k
                          }
237
0
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
0
                        else
249
0
                          {
250
0
                            larger_result =
251
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
0
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
0
                          }
258
10.5k
                        result = larger_result;
259
10.5k
                        allocated = new_allocated;
260
10.5k
                        {
261
10.5k
                          int ret =
262
10.5k
                            U_UCTOMB (result + length, muc, allocated - length);
263
10.5k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
10.5k
                          if (ret < 0)
269
0
                            abort ();
270
10.5k
                          length += ret;
271
10.5k
                          goto done_appending;
272
10.5k
                        }
273
10.5k
                      }
274
10.5k
                    }
275
48.0k
                   done_appending: ;
276
48.0k
                  }
277
278
                /* sortbuf is now empty.  */
279
58.5k
                sortbuf_count = 0;
280
58.5k
              }
281
282
58.5k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
10.5k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
48.0k
            if (sortbuf_count == sortbuf_allocated)
288
0
              {
289
0
                sortbuf_allocated = 2 * sortbuf_allocated;
290
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
0
                struct ucs4_with_ccc *new_sortbuf =
293
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
0
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
0
                memcpy (new_sortbuf, sortbuf,
300
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
0
                if (sortbuf != sortbuf_preallocated)
302
0
                  free (sortbuf);
303
0
                sortbuf = new_sortbuf;
304
0
              }
305
48.0k
            sortbuf[sortbuf_count].code = uc;
306
48.0k
            sortbuf[sortbuf_count].ccc = ccc;
307
48.0k
            sortbuf_count++;
308
309
48.0k
            i++;
310
48.0k
          }
311
312
58.5k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
10.5k
          break;
315
316
48.0k
        s += count;
317
48.0k
      }
318
10.5k
  }
319
320
10.5k
  if (length == 0)
321
0
    {
322
0
      if (result == NULL)
323
0
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
0
          result = (UNIT *) malloc (1);
326
0
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
0
        }
332
0
    }
333
10.5k
  else if (result != resultbuf && length < allocated)
334
10.5k
    {
335
      /* Shrink the allocated memory if possible.  */
336
10.5k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
10.5k
      if (memory != NULL)
338
10.5k
        result = memory;
339
10.5k
    }
340
341
10.5k
  if (sortbuf_count > 0)
342
0
    abort ();
343
10.5k
  if (sortbuf != sortbuf_preallocated)
344
10.5k
    free (sortbuf);
345
346
10.5k
  *lengthp = length;
347
10.5k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
10.5k
}
u32_normalize
Line
Count
Source
21
28.7k
{
22
28.7k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
28.7k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
28.7k
  UNIT *result;
27
28.7k
  size_t allocated;
28
28.7k
  if (resultbuf == NULL)
29
28.7k
    {
30
28.7k
      result = NULL;
31
28.7k
      allocated = 0;
32
28.7k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
28.7k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
28.7k
  #define SORTBUF_PREALLOCATED 64
42
28.7k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
28.7k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
28.7k
    sortbuf_preallocated;
45
28.7k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
28.7k
  size_t sortbuf_count = 0;
47
48
28.7k
  {
49
28.7k
    const UNIT *s_end = s + n;
50
51
28.7k
    for (;;)
52
314k
      {
53
314k
        int count;
54
314k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
314k
        int decomposed_count;
56
57
314k
        if (s < s_end)
58
285k
          {
59
            /* Fetch the next character.  */
60
285k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
285k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
600k
            for (int curr = 0; curr < decomposed_count; )
70
314k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
314k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
314k
                int curr_decomposed_count;
75
76
314k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
314k
                if (curr_decomposed_count >= 0)
78
14.6k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
14.6k
                    int shift = curr_decomposed_count - 1;
83
84
14.6k
                    if (shift < 0)
85
0
                      abort ();
86
14.6k
                    if (shift > 0)
87
14.4k
                      {
88
14.4k
                        decomposed_count += shift;
89
14.4k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
20.1k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
5.71k
                          decomposed[j + shift] = decomposed[j];
93
14.4k
                      }
94
43.8k
                    for (; shift >= 0; shift--)
95
29.1k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
14.6k
                  }
97
299k
                else
98
299k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
299k
                    curr++;
101
299k
                  }
102
314k
              }
103
285k
          }
104
28.7k
        else
105
28.7k
          {
106
28.7k
            count = 0;
107
28.7k
            decomposed_count = 0;
108
28.7k
          }
109
110
314k
        int i = 0;
111
314k
        for (;;)
112
614k
          {
113
614k
            ucs4_t uc;
114
614k
            int ccc;
115
116
614k
            if (s < s_end)
117
585k
              {
118
                /* Fetch the next character from the decomposition.  */
119
585k
                if (i == decomposed_count)
120
285k
                  break;
121
299k
                uc = decomposed[i];
122
299k
                ccc = uc_combining_class (uc);
123
299k
              }
124
28.7k
            else
125
28.7k
              {
126
                /* End of string reached.  */
127
28.7k
                uc = 0;
128
28.7k
                ccc = 0;
129
28.7k
              }
130
131
328k
            if (ccc == 0)
132
272k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
272k
                if (sortbuf_count > 1)
136
12.3k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
12.3k
                                                           sortbuf + sortbuf_count);
138
139
272k
                if (composer != NULL)
140
272k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
272k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
243k
                      {
163
298k
                        for (size_t j = 1; j < sortbuf_count; )
164
55.0k
                          {
165
55.0k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
25.9k
                              {
167
25.9k
                                ucs4_t combined =
168
25.9k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
25.9k
                                if (combined)
170
10.3k
                                  {
171
10.3k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
35.4k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
25.0k
                                      sortbuf[k - 1] = sortbuf[k];
175
10.3k
                                    sortbuf_count--;
176
10.3k
                                    continue;
177
10.3k
                                  }
178
25.9k
                              }
179
44.6k
                            j++;
180
44.6k
                          }
181
243k
                        if (s < s_end && sortbuf_count == 1)
182
208k
                          {
183
208k
                            ucs4_t combined =
184
208k
                              composer (sortbuf[0].code, uc);
185
208k
                            if (combined)
186
3.15k
                              {
187
3.15k
                                uc = combined;
188
3.15k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
3.15k
                                sortbuf_count = 0;
193
3.15k
                              }
194
208k
                          }
195
243k
                      }
196
272k
                  }
197
198
558k
                for (size_t j = 0; j < sortbuf_count; j++)
199
286k
                  {
200
286k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
286k
                    if (length < allocated)
204
257k
                      {
205
257k
                        int ret =
206
257k
                          U_UCTOMB (result + length, muc, allocated - length);
207
257k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
257k
                        if (ret >= 0)
213
257k
                          {
214
257k
                            length += ret;
215
257k
                            goto done_appending;
216
257k
                          }
217
257k
                      }
218
29.2k
                    {
219
29.2k
                      size_t old_allocated = allocated;
220
29.2k
                      size_t new_allocated = 2 * old_allocated;
221
29.2k
                      if (new_allocated < 64)
222
28.1k
                        new_allocated = 64;
223
29.2k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
29.2k
                      {
226
29.2k
                        UNIT *larger_result;
227
29.2k
                        if (result == NULL)
228
28.1k
                          {
229
28.1k
                            larger_result =
230
28.1k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
28.1k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
28.1k
                          }
237
1.07k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
1.07k
                        else
249
1.07k
                          {
250
1.07k
                            larger_result =
251
1.07k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
1.07k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
1.07k
                          }
258
29.2k
                        result = larger_result;
259
29.2k
                        allocated = new_allocated;
260
29.2k
                        {
261
29.2k
                          int ret =
262
29.2k
                            U_UCTOMB (result + length, muc, allocated - length);
263
29.2k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
29.2k
                          if (ret < 0)
269
0
                            abort ();
270
29.2k
                          length += ret;
271
29.2k
                          goto done_appending;
272
29.2k
                        }
273
29.2k
                      }
274
29.2k
                    }
275
286k
                   done_appending: ;
276
286k
                  }
277
278
                /* sortbuf is now empty.  */
279
272k
                sortbuf_count = 0;
280
272k
              }
281
282
328k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
28.7k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
299k
            if (sortbuf_count == sortbuf_allocated)
288
182
              {
289
182
                sortbuf_allocated = 2 * sortbuf_allocated;
290
182
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
182
                struct ucs4_with_ccc *new_sortbuf =
293
182
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
182
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
182
                memcpy (new_sortbuf, sortbuf,
300
182
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
182
                if (sortbuf != sortbuf_preallocated)
302
182
                  free (sortbuf);
303
182
                sortbuf = new_sortbuf;
304
182
              }
305
299k
            sortbuf[sortbuf_count].code = uc;
306
299k
            sortbuf[sortbuf_count].ccc = ccc;
307
299k
            sortbuf_count++;
308
309
299k
            i++;
310
299k
          }
311
312
314k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
28.7k
          break;
315
316
285k
        s += count;
317
285k
      }
318
28.7k
  }
319
320
28.7k
  if (length == 0)
321
576
    {
322
576
      if (result == NULL)
323
576
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
576
          result = (UNIT *) malloc (1);
326
576
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
576
        }
332
576
    }
333
28.1k
  else if (result != resultbuf && length < allocated)
334
28.0k
    {
335
      /* Shrink the allocated memory if possible.  */
336
28.0k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
28.0k
      if (memory != NULL)
338
28.0k
        result = memory;
339
28.0k
    }
340
341
28.7k
  if (sortbuf_count > 0)
342
0
    abort ();
343
28.7k
  if (sortbuf != sortbuf_preallocated)
344
28.7k
    free (sortbuf);
345
346
28.7k
  *lengthp = length;
347
28.7k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
28.7k
}