Coverage Report

Created: 2026-06-30 06:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
39.2k
{
22
39.2k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
39.2k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
39.2k
  UNIT *result;
27
39.2k
  size_t allocated;
28
39.2k
  if (resultbuf == NULL)
29
39.2k
    {
30
39.2k
      result = NULL;
31
39.2k
      allocated = 0;
32
39.2k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
39.2k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
39.2k
  #define SORTBUF_PREALLOCATED 64
42
39.2k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
39.2k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
39.2k
    sortbuf_preallocated;
45
39.2k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
39.2k
  size_t sortbuf_count = 0;
47
48
39.2k
  {
49
39.2k
    const UNIT *s_end = s + n;
50
51
39.2k
    for (;;)
52
367k
      {
53
367k
        int count;
54
367k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
367k
        int decomposed_count;
56
57
367k
        if (s < s_end)
58
327k
          {
59
            /* Fetch the next character.  */
60
327k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
327k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
683k
            for (int curr = 0; curr < decomposed_count; )
70
356k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
356k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
356k
                int curr_decomposed_count;
75
76
356k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
356k
                if (curr_decomposed_count >= 0)
78
14.2k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
14.2k
                    int shift = curr_decomposed_count - 1;
83
84
14.2k
                    if (shift < 0)
85
0
                      abort ();
86
14.2k
                    if (shift > 0)
87
14.0k
                      {
88
14.0k
                        decomposed_count += shift;
89
14.0k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
19.6k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
5.63k
                          decomposed[j + shift] = decomposed[j];
93
14.0k
                      }
94
42.4k
                    for (; shift >= 0; shift--)
95
28.2k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
14.2k
                  }
97
341k
                else
98
341k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
341k
                    curr++;
101
341k
                  }
102
356k
              }
103
327k
          }
104
39.2k
        else
105
39.2k
          {
106
39.2k
            count = 0;
107
39.2k
            decomposed_count = 0;
108
39.2k
          }
109
110
367k
        int i = 0;
111
367k
        for (;;)
112
708k
          {
113
708k
            ucs4_t uc;
114
708k
            int ccc;
115
116
708k
            if (s < s_end)
117
669k
              {
118
                /* Fetch the next character from the decomposition.  */
119
669k
                if (i == decomposed_count)
120
327k
                  break;
121
341k
                uc = decomposed[i];
122
341k
                ccc = uc_combining_class (uc);
123
341k
              }
124
39.2k
            else
125
39.2k
              {
126
                /* End of string reached.  */
127
39.2k
                uc = 0;
128
39.2k
                ccc = 0;
129
39.2k
              }
130
131
381k
            if (ccc == 0)
132
327k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
327k
                if (sortbuf_count > 1)
136
11.5k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
11.5k
                                                           sortbuf + sortbuf_count);
138
139
327k
                if (composer != NULL)
140
327k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
327k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
288k
                      {
163
340k
                        for (size_t j = 1; j < sortbuf_count; )
164
51.9k
                          {
165
51.9k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
24.8k
                              {
167
24.8k
                                ucs4_t combined =
168
24.8k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
24.8k
                                if (combined)
170
9.98k
                                  {
171
9.98k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
34.2k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
24.2k
                                      sortbuf[k - 1] = sortbuf[k];
175
9.98k
                                    sortbuf_count--;
176
9.98k
                                    continue;
177
9.98k
                                  }
178
24.8k
                              }
179
41.9k
                            j++;
180
41.9k
                          }
181
288k
                        if (s < s_end && sortbuf_count == 1)
182
243k
                          {
183
243k
                            ucs4_t combined =
184
243k
                              composer (sortbuf[0].code, uc);
185
243k
                            if (combined)
186
3.20k
                              {
187
3.20k
                                uc = combined;
188
3.20k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
3.20k
                                sortbuf_count = 0;
193
3.20k
                              }
194
243k
                          }
195
288k
                      }
196
327k
                  }
197
198
656k
                for (size_t j = 0; j < sortbuf_count; j++)
199
328k
                  {
200
328k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
328k
                    if (length < allocated)
204
288k
                      {
205
288k
                        int ret =
206
288k
                          U_UCTOMB (result + length, muc, allocated - length);
207
288k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
288k
                        if (ret >= 0)
213
288k
                          {
214
288k
                            length += ret;
215
288k
                            goto done_appending;
216
288k
                          }
217
288k
                      }
218
39.6k
                    {
219
39.6k
                      size_t old_allocated = allocated;
220
39.6k
                      size_t new_allocated = 2 * old_allocated;
221
39.6k
                      if (new_allocated < 64)
222
38.6k
                        new_allocated = 64;
223
39.6k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
39.6k
                      {
226
39.6k
                        UNIT *larger_result;
227
39.6k
                        if (result == NULL)
228
38.6k
                          {
229
38.6k
                            larger_result =
230
38.6k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
38.6k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
38.6k
                          }
237
1.01k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
1.01k
                        else
249
1.01k
                          {
250
1.01k
                            larger_result =
251
1.01k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
1.01k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
1.01k
                          }
258
39.6k
                        result = larger_result;
259
39.6k
                        allocated = new_allocated;
260
39.6k
                        {
261
39.6k
                          int ret =
262
39.6k
                            U_UCTOMB (result + length, muc, allocated - length);
263
39.6k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
39.6k
                          if (ret < 0)
269
0
                            abort ();
270
39.6k
                          length += ret;
271
39.6k
                          goto done_appending;
272
39.6k
                        }
273
39.6k
                      }
274
39.6k
                    }
275
328k
                   done_appending: ;
276
328k
                  }
277
278
                /* sortbuf is now empty.  */
279
327k
                sortbuf_count = 0;
280
327k
              }
281
282
381k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
39.2k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
341k
            if (sortbuf_count == sortbuf_allocated)
288
169
              {
289
169
                sortbuf_allocated = 2 * sortbuf_allocated;
290
169
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
169
                struct ucs4_with_ccc *new_sortbuf =
293
169
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
169
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
169
                memcpy (new_sortbuf, sortbuf,
300
169
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
169
                if (sortbuf != sortbuf_preallocated)
302
169
                  free (sortbuf);
303
169
                sortbuf = new_sortbuf;
304
169
              }
305
341k
            sortbuf[sortbuf_count].code = uc;
306
341k
            sortbuf[sortbuf_count].ccc = ccc;
307
341k
            sortbuf_count++;
308
309
341k
            i++;
310
341k
          }
311
312
367k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
39.2k
          break;
315
316
327k
        s += count;
317
327k
      }
318
39.2k
  }
319
320
39.2k
  if (length == 0)
321
533
    {
322
533
      if (result == NULL)
323
533
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
533
          result = (UNIT *) malloc (1);
326
533
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
533
        }
332
533
    }
333
38.6k
  else if (result != resultbuf && length < allocated)
334
38.5k
    {
335
      /* Shrink the allocated memory if possible.  */
336
38.5k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
38.5k
      if (memory != NULL)
338
38.5k
        result = memory;
339
38.5k
    }
340
341
39.2k
  if (sortbuf_count > 0)
342
0
    abort ();
343
39.2k
  if (sortbuf != sortbuf_preallocated)
344
39.2k
    free (sortbuf);
345
346
39.2k
  *lengthp = length;
347
39.2k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
39.2k
}
u16_normalize
Line
Count
Source
21
10.4k
{
22
10.4k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
10.4k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
10.4k
  UNIT *result;
27
10.4k
  size_t allocated;
28
10.4k
  if (resultbuf == NULL)
29
10.4k
    {
30
10.4k
      result = NULL;
31
10.4k
      allocated = 0;
32
10.4k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
10.4k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
10.4k
  #define SORTBUF_PREALLOCATED 64
42
10.4k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
10.4k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
10.4k
    sortbuf_preallocated;
45
10.4k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
10.4k
  size_t sortbuf_count = 0;
47
48
10.4k
  {
49
10.4k
    const UNIT *s_end = s + n;
50
51
10.4k
    for (;;)
52
58.0k
      {
53
58.0k
        int count;
54
58.0k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
58.0k
        int decomposed_count;
56
57
58.0k
        if (s < s_end)
58
47.6k
          {
59
            /* Fetch the next character.  */
60
47.6k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
47.6k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
95.2k
            for (int curr = 0; curr < decomposed_count; )
70
47.6k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
47.6k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
47.6k
                int curr_decomposed_count;
75
76
47.6k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
47.6k
                if (curr_decomposed_count >= 0)
78
0
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
0
                    int shift = curr_decomposed_count - 1;
83
84
0
                    if (shift < 0)
85
0
                      abort ();
86
0
                    if (shift > 0)
87
0
                      {
88
0
                        decomposed_count += shift;
89
0
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
0
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
0
                          decomposed[j + shift] = decomposed[j];
93
0
                      }
94
0
                    for (; shift >= 0; shift--)
95
0
                      decomposed[curr + shift] = curr_decomposed[shift];
96
0
                  }
97
47.6k
                else
98
47.6k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
47.6k
                    curr++;
101
47.6k
                  }
102
47.6k
              }
103
47.6k
          }
104
10.4k
        else
105
10.4k
          {
106
10.4k
            count = 0;
107
10.4k
            decomposed_count = 0;
108
10.4k
          }
109
110
58.0k
        int i = 0;
111
58.0k
        for (;;)
112
105k
          {
113
105k
            ucs4_t uc;
114
105k
            int ccc;
115
116
105k
            if (s < s_end)
117
95.2k
              {
118
                /* Fetch the next character from the decomposition.  */
119
95.2k
                if (i == decomposed_count)
120
47.6k
                  break;
121
47.6k
                uc = decomposed[i];
122
47.6k
                ccc = uc_combining_class (uc);
123
47.6k
              }
124
10.4k
            else
125
10.4k
              {
126
                /* End of string reached.  */
127
10.4k
                uc = 0;
128
10.4k
                ccc = 0;
129
10.4k
              }
130
131
58.0k
            if (ccc == 0)
132
58.0k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
58.0k
                if (sortbuf_count > 1)
136
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
0
                                                           sortbuf + sortbuf_count);
138
139
58.0k
                if (composer != NULL)
140
58.0k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
58.0k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
47.6k
                      {
163
47.6k
                        for (size_t j = 1; j < sortbuf_count; )
164
0
                          {
165
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
0
                              {
167
0
                                ucs4_t combined =
168
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
0
                                if (combined)
170
0
                                  {
171
0
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
0
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
0
                                      sortbuf[k - 1] = sortbuf[k];
175
0
                                    sortbuf_count--;
176
0
                                    continue;
177
0
                                  }
178
0
                              }
179
0
                            j++;
180
0
                          }
181
47.6k
                        if (s < s_end && sortbuf_count == 1)
182
37.1k
                          {
183
37.1k
                            ucs4_t combined =
184
37.1k
                              composer (sortbuf[0].code, uc);
185
37.1k
                            if (combined)
186
0
                              {
187
0
                                uc = combined;
188
0
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
0
                                sortbuf_count = 0;
193
0
                              }
194
37.1k
                          }
195
47.6k
                      }
196
58.0k
                  }
197
198
105k
                for (size_t j = 0; j < sortbuf_count; j++)
199
47.6k
                  {
200
47.6k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
47.6k
                    if (length < allocated)
204
37.1k
                      {
205
37.1k
                        int ret =
206
37.1k
                          U_UCTOMB (result + length, muc, allocated - length);
207
37.1k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
37.1k
                        if (ret >= 0)
213
37.1k
                          {
214
37.1k
                            length += ret;
215
37.1k
                            goto done_appending;
216
37.1k
                          }
217
37.1k
                      }
218
10.4k
                    {
219
10.4k
                      size_t old_allocated = allocated;
220
10.4k
                      size_t new_allocated = 2 * old_allocated;
221
10.4k
                      if (new_allocated < 64)
222
10.4k
                        new_allocated = 64;
223
10.4k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
10.4k
                      {
226
10.4k
                        UNIT *larger_result;
227
10.4k
                        if (result == NULL)
228
10.4k
                          {
229
10.4k
                            larger_result =
230
10.4k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
10.4k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
10.4k
                          }
237
0
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
0
                        else
249
0
                          {
250
0
                            larger_result =
251
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
0
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
0
                          }
258
10.4k
                        result = larger_result;
259
10.4k
                        allocated = new_allocated;
260
10.4k
                        {
261
10.4k
                          int ret =
262
10.4k
                            U_UCTOMB (result + length, muc, allocated - length);
263
10.4k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
10.4k
                          if (ret < 0)
269
0
                            abort ();
270
10.4k
                          length += ret;
271
10.4k
                          goto done_appending;
272
10.4k
                        }
273
10.4k
                      }
274
10.4k
                    }
275
47.6k
                   done_appending: ;
276
47.6k
                  }
277
278
                /* sortbuf is now empty.  */
279
58.0k
                sortbuf_count = 0;
280
58.0k
              }
281
282
58.0k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
10.4k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
47.6k
            if (sortbuf_count == sortbuf_allocated)
288
0
              {
289
0
                sortbuf_allocated = 2 * sortbuf_allocated;
290
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
0
                struct ucs4_with_ccc *new_sortbuf =
293
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
0
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
0
                memcpy (new_sortbuf, sortbuf,
300
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
0
                if (sortbuf != sortbuf_preallocated)
302
0
                  free (sortbuf);
303
0
                sortbuf = new_sortbuf;
304
0
              }
305
47.6k
            sortbuf[sortbuf_count].code = uc;
306
47.6k
            sortbuf[sortbuf_count].ccc = ccc;
307
47.6k
            sortbuf_count++;
308
309
47.6k
            i++;
310
47.6k
          }
311
312
58.0k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
10.4k
          break;
315
316
47.6k
        s += count;
317
47.6k
      }
318
10.4k
  }
319
320
10.4k
  if (length == 0)
321
0
    {
322
0
      if (result == NULL)
323
0
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
0
          result = (UNIT *) malloc (1);
326
0
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
0
        }
332
0
    }
333
10.4k
  else if (result != resultbuf && length < allocated)
334
10.4k
    {
335
      /* Shrink the allocated memory if possible.  */
336
10.4k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
10.4k
      if (memory != NULL)
338
10.4k
        result = memory;
339
10.4k
    }
340
341
10.4k
  if (sortbuf_count > 0)
342
0
    abort ();
343
10.4k
  if (sortbuf != sortbuf_preallocated)
344
10.4k
    free (sortbuf);
345
346
10.4k
  *lengthp = length;
347
10.4k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
10.4k
}
u32_normalize
Line
Count
Source
21
28.7k
{
22
28.7k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
28.7k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
28.7k
  UNIT *result;
27
28.7k
  size_t allocated;
28
28.7k
  if (resultbuf == NULL)
29
28.7k
    {
30
28.7k
      result = NULL;
31
28.7k
      allocated = 0;
32
28.7k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
28.7k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
28.7k
  #define SORTBUF_PREALLOCATED 64
42
28.7k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
28.7k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
28.7k
    sortbuf_preallocated;
45
28.7k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
28.7k
  size_t sortbuf_count = 0;
47
48
28.7k
  {
49
28.7k
    const UNIT *s_end = s + n;
50
51
28.7k
    for (;;)
52
308k
      {
53
308k
        int count;
54
308k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
308k
        int decomposed_count;
56
57
308k
        if (s < s_end)
58
280k
          {
59
            /* Fetch the next character.  */
60
280k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
280k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
588k
            for (int curr = 0; curr < decomposed_count; )
70
308k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
308k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
308k
                int curr_decomposed_count;
75
76
308k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
308k
                if (curr_decomposed_count >= 0)
78
14.2k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
14.2k
                    int shift = curr_decomposed_count - 1;
83
84
14.2k
                    if (shift < 0)
85
0
                      abort ();
86
14.2k
                    if (shift > 0)
87
14.0k
                      {
88
14.0k
                        decomposed_count += shift;
89
14.0k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
19.6k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
5.63k
                          decomposed[j + shift] = decomposed[j];
93
14.0k
                      }
94
42.4k
                    for (; shift >= 0; shift--)
95
28.2k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
14.2k
                  }
97
294k
                else
98
294k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
294k
                    curr++;
101
294k
                  }
102
308k
              }
103
280k
          }
104
28.7k
        else
105
28.7k
          {
106
28.7k
            count = 0;
107
28.7k
            decomposed_count = 0;
108
28.7k
          }
109
110
308k
        int i = 0;
111
308k
        for (;;)
112
603k
          {
113
603k
            ucs4_t uc;
114
603k
            int ccc;
115
116
603k
            if (s < s_end)
117
574k
              {
118
                /* Fetch the next character from the decomposition.  */
119
574k
                if (i == decomposed_count)
120
280k
                  break;
121
294k
                uc = decomposed[i];
122
294k
                ccc = uc_combining_class (uc);
123
294k
              }
124
28.7k
            else
125
28.7k
              {
126
                /* End of string reached.  */
127
28.7k
                uc = 0;
128
28.7k
                ccc = 0;
129
28.7k
              }
130
131
322k
            if (ccc == 0)
132
269k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
269k
                if (sortbuf_count > 1)
136
11.5k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
11.5k
                                                           sortbuf + sortbuf_count);
138
139
269k
                if (composer != NULL)
140
269k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
269k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
240k
                      {
163
292k
                        for (size_t j = 1; j < sortbuf_count; )
164
51.9k
                          {
165
51.9k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
24.8k
                              {
167
24.8k
                                ucs4_t combined =
168
24.8k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
24.8k
                                if (combined)
170
9.98k
                                  {
171
9.98k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
34.2k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
24.2k
                                      sortbuf[k - 1] = sortbuf[k];
175
9.98k
                                    sortbuf_count--;
176
9.98k
                                    continue;
177
9.98k
                                  }
178
24.8k
                              }
179
41.9k
                            j++;
180
41.9k
                          }
181
240k
                        if (s < s_end && sortbuf_count == 1)
182
205k
                          {
183
205k
                            ucs4_t combined =
184
205k
                              composer (sortbuf[0].code, uc);
185
205k
                            if (combined)
186
3.20k
                              {
187
3.20k
                                uc = combined;
188
3.20k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
3.20k
                                sortbuf_count = 0;
193
3.20k
                              }
194
205k
                          }
195
240k
                      }
196
269k
                  }
197
198
550k
                for (size_t j = 0; j < sortbuf_count; j++)
199
281k
                  {
200
281k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
281k
                    if (length < allocated)
204
251k
                      {
205
251k
                        int ret =
206
251k
                          U_UCTOMB (result + length, muc, allocated - length);
207
251k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
251k
                        if (ret >= 0)
213
251k
                          {
214
251k
                            length += ret;
215
251k
                            goto done_appending;
216
251k
                          }
217
251k
                      }
218
29.2k
                    {
219
29.2k
                      size_t old_allocated = allocated;
220
29.2k
                      size_t new_allocated = 2 * old_allocated;
221
29.2k
                      if (new_allocated < 64)
222
28.2k
                        new_allocated = 64;
223
29.2k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
29.2k
                      {
226
29.2k
                        UNIT *larger_result;
227
29.2k
                        if (result == NULL)
228
28.2k
                          {
229
28.2k
                            larger_result =
230
28.2k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
28.2k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
28.2k
                          }
237
1.01k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
1.01k
                        else
249
1.01k
                          {
250
1.01k
                            larger_result =
251
1.01k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
1.01k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
1.01k
                          }
258
29.2k
                        result = larger_result;
259
29.2k
                        allocated = new_allocated;
260
29.2k
                        {
261
29.2k
                          int ret =
262
29.2k
                            U_UCTOMB (result + length, muc, allocated - length);
263
29.2k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
29.2k
                          if (ret < 0)
269
0
                            abort ();
270
29.2k
                          length += ret;
271
29.2k
                          goto done_appending;
272
29.2k
                        }
273
29.2k
                      }
274
29.2k
                    }
275
281k
                   done_appending: ;
276
281k
                  }
277
278
                /* sortbuf is now empty.  */
279
269k
                sortbuf_count = 0;
280
269k
              }
281
282
322k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
28.7k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
294k
            if (sortbuf_count == sortbuf_allocated)
288
169
              {
289
169
                sortbuf_allocated = 2 * sortbuf_allocated;
290
169
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
169
                struct ucs4_with_ccc *new_sortbuf =
293
169
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
169
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
169
                memcpy (new_sortbuf, sortbuf,
300
169
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
169
                if (sortbuf != sortbuf_preallocated)
302
169
                  free (sortbuf);
303
169
                sortbuf = new_sortbuf;
304
169
              }
305
294k
            sortbuf[sortbuf_count].code = uc;
306
294k
            sortbuf[sortbuf_count].ccc = ccc;
307
294k
            sortbuf_count++;
308
309
294k
            i++;
310
294k
          }
311
312
308k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
28.7k
          break;
315
316
280k
        s += count;
317
280k
      }
318
28.7k
  }
319
320
28.7k
  if (length == 0)
321
533
    {
322
533
      if (result == NULL)
323
533
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
533
          result = (UNIT *) malloc (1);
326
533
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
533
        }
332
533
    }
333
28.2k
  else if (result != resultbuf && length < allocated)
334
28.1k
    {
335
      /* Shrink the allocated memory if possible.  */
336
28.1k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
28.1k
      if (memory != NULL)
338
28.1k
        result = memory;
339
28.1k
    }
340
341
28.7k
  if (sortbuf_count > 0)
342
0
    abort ();
343
28.7k
  if (sortbuf != sortbuf_preallocated)
344
28.7k
    free (sortbuf);
345
346
28.7k
  *lengthp = length;
347
28.7k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
28.7k
}