Coverage Report

Created: 2026-05-16 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
37.9k
{
22
37.9k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
37.9k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
37.9k
  UNIT *result;
27
37.9k
  size_t allocated;
28
37.9k
  if (resultbuf == NULL)
29
37.9k
    {
30
37.9k
      result = NULL;
31
37.9k
      allocated = 0;
32
37.9k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
37.9k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
37.9k
  #define SORTBUF_PREALLOCATED 64
42
37.9k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
37.9k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
37.9k
    sortbuf_preallocated;
45
37.9k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
37.9k
  size_t sortbuf_count = 0;
47
48
37.9k
  {
49
37.9k
    const UNIT *s_end = s + n;
50
51
37.9k
    for (;;)
52
362k
      {
53
362k
        int count;
54
362k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
362k
        int decomposed_count;
56
57
362k
        if (s < s_end)
58
324k
          {
59
            /* Fetch the next character.  */
60
324k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
324k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
678k
            for (int curr = 0; curr < decomposed_count; )
70
353k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
353k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
353k
                int curr_decomposed_count;
75
76
353k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
353k
                if (curr_decomposed_count >= 0)
78
14.7k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
14.7k
                    int shift = curr_decomposed_count - 1;
83
84
14.7k
                    if (shift < 0)
85
0
                      abort ();
86
14.7k
                    if (shift > 0)
87
14.5k
                      {
88
14.5k
                        decomposed_count += shift;
89
14.5k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
20.2k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
5.79k
                          decomposed[j + shift] = decomposed[j];
93
14.5k
                      }
94
43.9k
                    for (; shift >= 0; shift--)
95
29.2k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
14.7k
                  }
97
338k
                else
98
338k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
338k
                    curr++;
101
338k
                  }
102
353k
              }
103
324k
          }
104
37.9k
        else
105
37.9k
          {
106
37.9k
            count = 0;
107
37.9k
            decomposed_count = 0;
108
37.9k
          }
109
110
362k
        int i = 0;
111
362k
        for (;;)
112
701k
          {
113
701k
            ucs4_t uc;
114
701k
            int ccc;
115
116
701k
            if (s < s_end)
117
663k
              {
118
                /* Fetch the next character from the decomposition.  */
119
663k
                if (i == decomposed_count)
120
324k
                  break;
121
338k
                uc = decomposed[i];
122
338k
                ccc = uc_combining_class (uc);
123
338k
              }
124
37.9k
            else
125
37.9k
              {
126
                /* End of string reached.  */
127
37.9k
                uc = 0;
128
37.9k
                ccc = 0;
129
37.9k
              }
130
131
376k
            if (ccc == 0)
132
321k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
321k
                if (sortbuf_count > 1)
136
12.1k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
12.1k
                                                           sortbuf + sortbuf_count);
138
139
321k
                if (composer != NULL)
140
321k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
321k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
283k
                      {
163
336k
                        for (size_t j = 1; j < sortbuf_count; )
164
53.0k
                          {
165
53.0k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
25.3k
                              {
167
25.3k
                                ucs4_t combined =
168
25.3k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
25.3k
                                if (combined)
170
10.1k
                                  {
171
10.1k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
33.9k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
23.8k
                                      sortbuf[k - 1] = sortbuf[k];
175
10.1k
                                    sortbuf_count--;
176
10.1k
                                    continue;
177
10.1k
                                  }
178
25.3k
                              }
179
42.9k
                            j++;
180
42.9k
                          }
181
283k
                        if (s < s_end && sortbuf_count == 1)
182
239k
                          {
183
239k
                            ucs4_t combined =
184
239k
                              composer (sortbuf[0].code, uc);
185
239k
                            if (combined)
186
3.60k
                              {
187
3.60k
                                uc = combined;
188
3.60k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
3.60k
                                sortbuf_count = 0;
193
3.60k
                              }
194
239k
                          }
195
283k
                      }
196
321k
                  }
197
198
646k
                for (size_t j = 0; j < sortbuf_count; j++)
199
325k
                  {
200
325k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
325k
                    if (length < allocated)
204
286k
                      {
205
286k
                        int ret =
206
286k
                          U_UCTOMB (result + length, muc, allocated - length);
207
286k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
286k
                        if (ret >= 0)
213
286k
                          {
214
286k
                            length += ret;
215
286k
                            goto done_appending;
216
286k
                          }
217
286k
                      }
218
38.3k
                    {
219
38.3k
                      size_t old_allocated = allocated;
220
38.3k
                      size_t new_allocated = 2 * old_allocated;
221
38.3k
                      if (new_allocated < 64)
222
37.3k
                        new_allocated = 64;
223
38.3k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
38.3k
                      {
226
38.3k
                        UNIT *larger_result;
227
38.3k
                        if (result == NULL)
228
37.3k
                          {
229
37.3k
                            larger_result =
230
37.3k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
37.3k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
37.3k
                          }
237
1.04k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
1.04k
                        else
249
1.04k
                          {
250
1.04k
                            larger_result =
251
1.04k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
1.04k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
1.04k
                          }
258
38.3k
                        result = larger_result;
259
38.3k
                        allocated = new_allocated;
260
38.3k
                        {
261
38.3k
                          int ret =
262
38.3k
                            U_UCTOMB (result + length, muc, allocated - length);
263
38.3k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
38.3k
                          if (ret < 0)
269
0
                            abort ();
270
38.3k
                          length += ret;
271
38.3k
                          goto done_appending;
272
38.3k
                        }
273
38.3k
                      }
274
38.3k
                    }
275
325k
                   done_appending: ;
276
325k
                  }
277
278
                /* sortbuf is now empty.  */
279
321k
                sortbuf_count = 0;
280
321k
              }
281
282
376k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
37.9k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
338k
            if (sortbuf_count == sortbuf_allocated)
288
188
              {
289
188
                sortbuf_allocated = 2 * sortbuf_allocated;
290
188
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
188
                struct ucs4_with_ccc *new_sortbuf =
293
188
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
188
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
188
                memcpy (new_sortbuf, sortbuf,
300
188
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
188
                if (sortbuf != sortbuf_preallocated)
302
188
                  free (sortbuf);
303
188
                sortbuf = new_sortbuf;
304
188
              }
305
338k
            sortbuf[sortbuf_count].code = uc;
306
338k
            sortbuf[sortbuf_count].ccc = ccc;
307
338k
            sortbuf_count++;
308
309
338k
            i++;
310
338k
          }
311
312
362k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
37.9k
          break;
315
316
324k
        s += count;
317
324k
      }
318
37.9k
  }
319
320
37.9k
  if (length == 0)
321
626
    {
322
626
      if (result == NULL)
323
626
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
626
          result = (UNIT *) malloc (1);
326
626
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
626
        }
332
626
    }
333
37.3k
  else if (result != resultbuf && length < allocated)
334
37.2k
    {
335
      /* Shrink the allocated memory if possible.  */
336
37.2k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
37.2k
      if (memory != NULL)
338
37.2k
        result = memory;
339
37.2k
    }
340
341
37.9k
  if (sortbuf_count > 0)
342
0
    abort ();
343
37.9k
  if (sortbuf != sortbuf_preallocated)
344
37.9k
    free (sortbuf);
345
346
37.9k
  *lengthp = length;
347
37.9k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
37.9k
}
u16_normalize
Line
Count
Source
21
9.92k
{
22
9.92k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
9.92k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
9.92k
  UNIT *result;
27
9.92k
  size_t allocated;
28
9.92k
  if (resultbuf == NULL)
29
9.92k
    {
30
9.92k
      result = NULL;
31
9.92k
      allocated = 0;
32
9.92k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
9.92k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
9.92k
  #define SORTBUF_PREALLOCATED 64
42
9.92k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
9.92k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
9.92k
    sortbuf_preallocated;
45
9.92k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
9.92k
  size_t sortbuf_count = 0;
47
48
9.92k
  {
49
9.92k
    const UNIT *s_end = s + n;
50
51
9.92k
    for (;;)
52
55.3k
      {
53
55.3k
        int count;
54
55.3k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
55.3k
        int decomposed_count;
56
57
55.3k
        if (s < s_end)
58
45.3k
          {
59
            /* Fetch the next character.  */
60
45.3k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
45.3k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
90.7k
            for (int curr = 0; curr < decomposed_count; )
70
45.3k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
45.3k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
45.3k
                int curr_decomposed_count;
75
76
45.3k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
45.3k
                if (curr_decomposed_count >= 0)
78
0
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
0
                    int shift = curr_decomposed_count - 1;
83
84
0
                    if (shift < 0)
85
0
                      abort ();
86
0
                    if (shift > 0)
87
0
                      {
88
0
                        decomposed_count += shift;
89
0
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
0
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
0
                          decomposed[j + shift] = decomposed[j];
93
0
                      }
94
0
                    for (; shift >= 0; shift--)
95
0
                      decomposed[curr + shift] = curr_decomposed[shift];
96
0
                  }
97
45.3k
                else
98
45.3k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
45.3k
                    curr++;
101
45.3k
                  }
102
45.3k
              }
103
45.3k
          }
104
9.92k
        else
105
9.92k
          {
106
9.92k
            count = 0;
107
9.92k
            decomposed_count = 0;
108
9.92k
          }
109
110
55.3k
        int i = 0;
111
55.3k
        for (;;)
112
100k
          {
113
100k
            ucs4_t uc;
114
100k
            int ccc;
115
116
100k
            if (s < s_end)
117
90.7k
              {
118
                /* Fetch the next character from the decomposition.  */
119
90.7k
                if (i == decomposed_count)
120
45.3k
                  break;
121
45.3k
                uc = decomposed[i];
122
45.3k
                ccc = uc_combining_class (uc);
123
45.3k
              }
124
9.92k
            else
125
9.92k
              {
126
                /* End of string reached.  */
127
9.92k
                uc = 0;
128
9.92k
                ccc = 0;
129
9.92k
              }
130
131
55.3k
            if (ccc == 0)
132
55.3k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
55.3k
                if (sortbuf_count > 1)
136
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
0
                                                           sortbuf + sortbuf_count);
138
139
55.3k
                if (composer != NULL)
140
55.3k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
55.3k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
45.3k
                      {
163
45.3k
                        for (size_t j = 1; j < sortbuf_count; )
164
0
                          {
165
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
0
                              {
167
0
                                ucs4_t combined =
168
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
0
                                if (combined)
170
0
                                  {
171
0
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
0
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
0
                                      sortbuf[k - 1] = sortbuf[k];
175
0
                                    sortbuf_count--;
176
0
                                    continue;
177
0
                                  }
178
0
                              }
179
0
                            j++;
180
0
                          }
181
45.3k
                        if (s < s_end && sortbuf_count == 1)
182
35.4k
                          {
183
35.4k
                            ucs4_t combined =
184
35.4k
                              composer (sortbuf[0].code, uc);
185
35.4k
                            if (combined)
186
0
                              {
187
0
                                uc = combined;
188
0
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
0
                                sortbuf_count = 0;
193
0
                              }
194
35.4k
                          }
195
45.3k
                      }
196
55.3k
                  }
197
198
100k
                for (size_t j = 0; j < sortbuf_count; j++)
199
45.3k
                  {
200
45.3k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
45.3k
                    if (length < allocated)
204
35.4k
                      {
205
35.4k
                        int ret =
206
35.4k
                          U_UCTOMB (result + length, muc, allocated - length);
207
35.4k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
35.4k
                        if (ret >= 0)
213
35.4k
                          {
214
35.4k
                            length += ret;
215
35.4k
                            goto done_appending;
216
35.4k
                          }
217
35.4k
                      }
218
9.92k
                    {
219
9.92k
                      size_t old_allocated = allocated;
220
9.92k
                      size_t new_allocated = 2 * old_allocated;
221
9.92k
                      if (new_allocated < 64)
222
9.92k
                        new_allocated = 64;
223
9.92k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
9.92k
                      {
226
9.92k
                        UNIT *larger_result;
227
9.92k
                        if (result == NULL)
228
9.92k
                          {
229
9.92k
                            larger_result =
230
9.92k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
9.92k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
9.92k
                          }
237
0
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
0
                        else
249
0
                          {
250
0
                            larger_result =
251
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
0
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
0
                          }
258
9.92k
                        result = larger_result;
259
9.92k
                        allocated = new_allocated;
260
9.92k
                        {
261
9.92k
                          int ret =
262
9.92k
                            U_UCTOMB (result + length, muc, allocated - length);
263
9.92k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
9.92k
                          if (ret < 0)
269
0
                            abort ();
270
9.92k
                          length += ret;
271
9.92k
                          goto done_appending;
272
9.92k
                        }
273
9.92k
                      }
274
9.92k
                    }
275
45.3k
                   done_appending: ;
276
45.3k
                  }
277
278
                /* sortbuf is now empty.  */
279
55.3k
                sortbuf_count = 0;
280
55.3k
              }
281
282
55.3k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
9.92k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
45.3k
            if (sortbuf_count == sortbuf_allocated)
288
0
              {
289
0
                sortbuf_allocated = 2 * sortbuf_allocated;
290
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
0
                struct ucs4_with_ccc *new_sortbuf =
293
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
0
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
0
                memcpy (new_sortbuf, sortbuf,
300
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
0
                if (sortbuf != sortbuf_preallocated)
302
0
                  free (sortbuf);
303
0
                sortbuf = new_sortbuf;
304
0
              }
305
45.3k
            sortbuf[sortbuf_count].code = uc;
306
45.3k
            sortbuf[sortbuf_count].ccc = ccc;
307
45.3k
            sortbuf_count++;
308
309
45.3k
            i++;
310
45.3k
          }
311
312
55.3k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
9.92k
          break;
315
316
45.3k
        s += count;
317
45.3k
      }
318
9.92k
  }
319
320
9.92k
  if (length == 0)
321
0
    {
322
0
      if (result == NULL)
323
0
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
0
          result = (UNIT *) malloc (1);
326
0
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
0
        }
332
0
    }
333
9.92k
  else if (result != resultbuf && length < allocated)
334
9.92k
    {
335
      /* Shrink the allocated memory if possible.  */
336
9.92k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
9.92k
      if (memory != NULL)
338
9.92k
        result = memory;
339
9.92k
    }
340
341
9.92k
  if (sortbuf_count > 0)
342
0
    abort ();
343
9.92k
  if (sortbuf != sortbuf_preallocated)
344
9.92k
    free (sortbuf);
345
346
9.92k
  *lengthp = length;
347
9.92k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
9.92k
}
u32_normalize
Line
Count
Source
21
28.0k
{
22
28.0k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
28.0k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
28.0k
  UNIT *result;
27
28.0k
  size_t allocated;
28
28.0k
  if (resultbuf == NULL)
29
28.0k
    {
30
28.0k
      result = NULL;
31
28.0k
      allocated = 0;
32
28.0k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
28.0k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
28.0k
  #define SORTBUF_PREALLOCATED 64
42
28.0k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
28.0k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
28.0k
    sortbuf_preallocated;
45
28.0k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
28.0k
  size_t sortbuf_count = 0;
47
48
28.0k
  {
49
28.0k
    const UNIT *s_end = s + n;
50
51
28.0k
    for (;;)
52
307k
      {
53
307k
        int count;
54
307k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
307k
        int decomposed_count;
56
57
307k
        if (s < s_end)
58
279k
          {
59
            /* Fetch the next character.  */
60
279k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
279k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
587k
            for (int curr = 0; curr < decomposed_count; )
70
308k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
308k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
308k
                int curr_decomposed_count;
75
76
308k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
308k
                if (curr_decomposed_count >= 0)
78
14.7k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
14.7k
                    int shift = curr_decomposed_count - 1;
83
84
14.7k
                    if (shift < 0)
85
0
                      abort ();
86
14.7k
                    if (shift > 0)
87
14.5k
                      {
88
14.5k
                        decomposed_count += shift;
89
14.5k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
20.2k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
5.79k
                          decomposed[j + shift] = decomposed[j];
93
14.5k
                      }
94
43.9k
                    for (; shift >= 0; shift--)
95
29.2k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
14.7k
                  }
97
293k
                else
98
293k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
293k
                    curr++;
101
293k
                  }
102
308k
              }
103
279k
          }
104
28.0k
        else
105
28.0k
          {
106
28.0k
            count = 0;
107
28.0k
            decomposed_count = 0;
108
28.0k
          }
109
110
307k
        int i = 0;
111
307k
        for (;;)
112
600k
          {
113
600k
            ucs4_t uc;
114
600k
            int ccc;
115
116
600k
            if (s < s_end)
117
572k
              {
118
                /* Fetch the next character from the decomposition.  */
119
572k
                if (i == decomposed_count)
120
279k
                  break;
121
293k
                uc = decomposed[i];
122
293k
                ccc = uc_combining_class (uc);
123
293k
              }
124
28.0k
            else
125
28.0k
              {
126
                /* End of string reached.  */
127
28.0k
                uc = 0;
128
28.0k
                ccc = 0;
129
28.0k
              }
130
131
321k
            if (ccc == 0)
132
266k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
266k
                if (sortbuf_count > 1)
136
12.1k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
12.1k
                                                           sortbuf + sortbuf_count);
138
139
266k
                if (composer != NULL)
140
266k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
266k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
238k
                      {
163
291k
                        for (size_t j = 1; j < sortbuf_count; )
164
53.0k
                          {
165
53.0k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
25.3k
                              {
167
25.3k
                                ucs4_t combined =
168
25.3k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
25.3k
                                if (combined)
170
10.1k
                                  {
171
10.1k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
33.9k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
23.8k
                                      sortbuf[k - 1] = sortbuf[k];
175
10.1k
                                    sortbuf_count--;
176
10.1k
                                    continue;
177
10.1k
                                  }
178
25.3k
                              }
179
42.9k
                            j++;
180
42.9k
                          }
181
238k
                        if (s < s_end && sortbuf_count == 1)
182
204k
                          {
183
204k
                            ucs4_t combined =
184
204k
                              composer (sortbuf[0].code, uc);
185
204k
                            if (combined)
186
3.60k
                              {
187
3.60k
                                uc = combined;
188
3.60k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
3.60k
                                sortbuf_count = 0;
193
3.60k
                              }
194
204k
                          }
195
238k
                      }
196
266k
                  }
197
198
546k
                for (size_t j = 0; j < sortbuf_count; j++)
199
279k
                  {
200
279k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
279k
                    if (length < allocated)
204
251k
                      {
205
251k
                        int ret =
206
251k
                          U_UCTOMB (result + length, muc, allocated - length);
207
251k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
251k
                        if (ret >= 0)
213
251k
                          {
214
251k
                            length += ret;
215
251k
                            goto done_appending;
216
251k
                          }
217
251k
                      }
218
28.4k
                    {
219
28.4k
                      size_t old_allocated = allocated;
220
28.4k
                      size_t new_allocated = 2 * old_allocated;
221
28.4k
                      if (new_allocated < 64)
222
27.3k
                        new_allocated = 64;
223
28.4k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
28.4k
                      {
226
28.4k
                        UNIT *larger_result;
227
28.4k
                        if (result == NULL)
228
27.3k
                          {
229
27.3k
                            larger_result =
230
27.3k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
27.3k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
27.3k
                          }
237
1.04k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
1.04k
                        else
249
1.04k
                          {
250
1.04k
                            larger_result =
251
1.04k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
1.04k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
1.04k
                          }
258
28.4k
                        result = larger_result;
259
28.4k
                        allocated = new_allocated;
260
28.4k
                        {
261
28.4k
                          int ret =
262
28.4k
                            U_UCTOMB (result + length, muc, allocated - length);
263
28.4k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
28.4k
                          if (ret < 0)
269
0
                            abort ();
270
28.4k
                          length += ret;
271
28.4k
                          goto done_appending;
272
28.4k
                        }
273
28.4k
                      }
274
28.4k
                    }
275
279k
                   done_appending: ;
276
279k
                  }
277
278
                /* sortbuf is now empty.  */
279
266k
                sortbuf_count = 0;
280
266k
              }
281
282
321k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
28.0k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
293k
            if (sortbuf_count == sortbuf_allocated)
288
188
              {
289
188
                sortbuf_allocated = 2 * sortbuf_allocated;
290
188
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
188
                struct ucs4_with_ccc *new_sortbuf =
293
188
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
188
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
188
                memcpy (new_sortbuf, sortbuf,
300
188
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
188
                if (sortbuf != sortbuf_preallocated)
302
188
                  free (sortbuf);
303
188
                sortbuf = new_sortbuf;
304
188
              }
305
293k
            sortbuf[sortbuf_count].code = uc;
306
293k
            sortbuf[sortbuf_count].ccc = ccc;
307
293k
            sortbuf_count++;
308
309
293k
            i++;
310
293k
          }
311
312
307k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
28.0k
          break;
315
316
279k
        s += count;
317
279k
      }
318
28.0k
  }
319
320
28.0k
  if (length == 0)
321
626
    {
322
626
      if (result == NULL)
323
626
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
626
          result = (UNIT *) malloc (1);
326
626
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
626
        }
332
626
    }
333
27.3k
  else if (result != resultbuf && length < allocated)
334
27.3k
    {
335
      /* Shrink the allocated memory if possible.  */
336
27.3k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
27.3k
      if (memory != NULL)
338
27.3k
        result = memory;
339
27.3k
    }
340
341
28.0k
  if (sortbuf_count > 0)
342
0
    abort ();
343
28.0k
  if (sortbuf != sortbuf_preallocated)
344
28.0k
    free (sortbuf);
345
346
28.0k
  *lengthp = length;
347
28.0k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
28.0k
}