Coverage Report

Created: 2026-03-31 06:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
39.4k
{
22
39.4k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
39.4k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
39.4k
  UNIT *result;
27
39.4k
  size_t allocated;
28
39.4k
  if (resultbuf == NULL)
29
39.4k
    {
30
39.4k
      result = NULL;
31
39.4k
      allocated = 0;
32
39.4k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
39.4k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
39.4k
  #define SORTBUF_PREALLOCATED 64
42
39.4k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
39.4k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
39.4k
    sortbuf_preallocated;
45
39.4k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
39.4k
  size_t sortbuf_count = 0;
47
48
39.4k
  {
49
39.4k
    const UNIT *s_end = s + n;
50
51
39.4k
    for (;;)
52
369k
      {
53
369k
        int count;
54
369k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
369k
        int decomposed_count;
56
57
369k
        if (s < s_end)
58
329k
          {
59
            /* Fetch the next character.  */
60
329k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
329k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
686k
            for (int curr = 0; curr < decomposed_count; )
70
357k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
357k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
357k
                int curr_decomposed_count;
75
76
357k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
357k
                if (curr_decomposed_count >= 0)
78
13.8k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
13.8k
                    int shift = curr_decomposed_count - 1;
83
84
13.8k
                    if (shift < 0)
85
0
                      abort ();
86
13.8k
                    if (shift > 0)
87
13.5k
                      {
88
13.5k
                        decomposed_count += shift;
89
13.5k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
19.1k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
5.56k
                          decomposed[j + shift] = decomposed[j];
93
13.5k
                      }
94
41.1k
                    for (; shift >= 0; shift--)
95
27.3k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
13.8k
                  }
97
343k
                else
98
343k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
343k
                    curr++;
101
343k
                  }
102
357k
              }
103
329k
          }
104
39.4k
        else
105
39.4k
          {
106
39.4k
            count = 0;
107
39.4k
            decomposed_count = 0;
108
39.4k
          }
109
110
369k
        int i = 0;
111
369k
        for (;;)
112
712k
          {
113
712k
            ucs4_t uc;
114
712k
            int ccc;
115
116
712k
            if (s < s_end)
117
673k
              {
118
                /* Fetch the next character from the decomposition.  */
119
673k
                if (i == decomposed_count)
120
329k
                  break;
121
343k
                uc = decomposed[i];
122
343k
                ccc = uc_combining_class (uc);
123
343k
              }
124
39.4k
            else
125
39.4k
              {
126
                /* End of string reached.  */
127
39.4k
                uc = 0;
128
39.4k
                ccc = 0;
129
39.4k
              }
130
131
382k
            if (ccc == 0)
132
329k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
329k
                if (sortbuf_count > 1)
136
11.4k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
11.4k
                                                           sortbuf + sortbuf_count);
138
139
329k
                if (composer != NULL)
140
329k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
329k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
289k
                      {
163
341k
                        for (size_t j = 1; j < sortbuf_count; )
164
51.7k
                          {
165
51.7k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
24.5k
                              {
167
24.5k
                                ucs4_t combined =
168
24.5k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
24.5k
                                if (combined)
170
9.56k
                                  {
171
9.56k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
33.2k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
23.6k
                                      sortbuf[k - 1] = sortbuf[k];
175
9.56k
                                    sortbuf_count--;
176
9.56k
                                    continue;
177
9.56k
                                  }
178
24.5k
                              }
179
42.1k
                            j++;
180
42.1k
                          }
181
289k
                        if (s < s_end && sortbuf_count == 1)
182
244k
                          {
183
244k
                            ucs4_t combined =
184
244k
                              composer (sortbuf[0].code, uc);
185
244k
                            if (combined)
186
3.12k
                              {
187
3.12k
                                uc = combined;
188
3.12k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
3.12k
                                sortbuf_count = 0;
193
3.12k
                              }
194
244k
                          }
195
289k
                      }
196
329k
                  }
197
198
659k
                for (size_t j = 0; j < sortbuf_count; j++)
199
330k
                  {
200
330k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
330k
                    if (length < allocated)
204
290k
                      {
205
290k
                        int ret =
206
290k
                          U_UCTOMB (result + length, muc, allocated - length);
207
290k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
290k
                        if (ret >= 0)
213
290k
                          {
214
290k
                            length += ret;
215
290k
                            goto done_appending;
216
290k
                          }
217
290k
                      }
218
39.9k
                    {
219
39.9k
                      size_t old_allocated = allocated;
220
39.9k
                      size_t new_allocated = 2 * old_allocated;
221
39.9k
                      if (new_allocated < 64)
222
38.8k
                        new_allocated = 64;
223
39.9k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
39.9k
                      {
226
39.9k
                        UNIT *larger_result;
227
39.9k
                        if (result == NULL)
228
38.8k
                          {
229
38.8k
                            larger_result =
230
38.8k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
38.8k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
38.8k
                          }
237
1.05k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
1.05k
                        else
249
1.05k
                          {
250
1.05k
                            larger_result =
251
1.05k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
1.05k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
1.05k
                          }
258
39.9k
                        result = larger_result;
259
39.9k
                        allocated = new_allocated;
260
39.9k
                        {
261
39.9k
                          int ret =
262
39.9k
                            U_UCTOMB (result + length, muc, allocated - length);
263
39.9k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
39.9k
                          if (ret < 0)
269
0
                            abort ();
270
39.9k
                          length += ret;
271
39.9k
                          goto done_appending;
272
39.9k
                        }
273
39.9k
                      }
274
39.9k
                    }
275
330k
                   done_appending: ;
276
330k
                  }
277
278
                /* sortbuf is now empty.  */
279
329k
                sortbuf_count = 0;
280
329k
              }
281
282
382k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
39.4k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
343k
            if (sortbuf_count == sortbuf_allocated)
288
176
              {
289
176
                sortbuf_allocated = 2 * sortbuf_allocated;
290
176
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
176
                struct ucs4_with_ccc *new_sortbuf =
293
176
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
176
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
176
                memcpy (new_sortbuf, sortbuf,
300
176
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
176
                if (sortbuf != sortbuf_preallocated)
302
176
                  free (sortbuf);
303
176
                sortbuf = new_sortbuf;
304
176
              }
305
343k
            sortbuf[sortbuf_count].code = uc;
306
343k
            sortbuf[sortbuf_count].ccc = ccc;
307
343k
            sortbuf_count++;
308
309
343k
            i++;
310
343k
          }
311
312
369k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
39.4k
          break;
315
316
329k
        s += count;
317
329k
      }
318
39.4k
  }
319
320
39.4k
  if (length == 0)
321
603
    {
322
603
      if (result == NULL)
323
603
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
603
          result = (UNIT *) malloc (1);
326
603
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
603
        }
332
603
    }
333
38.8k
  else if (result != resultbuf && length < allocated)
334
38.8k
    {
335
      /* Shrink the allocated memory if possible.  */
336
38.8k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
38.8k
      if (memory != NULL)
338
38.8k
        result = memory;
339
38.8k
    }
340
341
39.4k
  if (sortbuf_count > 0)
342
0
    abort ();
343
39.4k
  if (sortbuf != sortbuf_preallocated)
344
39.4k
    free (sortbuf);
345
346
39.4k
  *lengthp = length;
347
39.4k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
39.4k
}
u16_normalize
Line
Count
Source
21
10.5k
{
22
10.5k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
10.5k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
10.5k
  UNIT *result;
27
10.5k
  size_t allocated;
28
10.5k
  if (resultbuf == NULL)
29
10.5k
    {
30
10.5k
      result = NULL;
31
10.5k
      allocated = 0;
32
10.5k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
10.5k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
10.5k
  #define SORTBUF_PREALLOCATED 64
42
10.5k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
10.5k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
10.5k
    sortbuf_preallocated;
45
10.5k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
10.5k
  size_t sortbuf_count = 0;
47
48
10.5k
  {
49
10.5k
    const UNIT *s_end = s + n;
50
51
10.5k
    for (;;)
52
57.9k
      {
53
57.9k
        int count;
54
57.9k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
57.9k
        int decomposed_count;
56
57
57.9k
        if (s < s_end)
58
47.4k
          {
59
            /* Fetch the next character.  */
60
47.4k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
47.4k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
94.8k
            for (int curr = 0; curr < decomposed_count; )
70
47.4k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
47.4k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
47.4k
                int curr_decomposed_count;
75
76
47.4k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
47.4k
                if (curr_decomposed_count >= 0)
78
0
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
0
                    int shift = curr_decomposed_count - 1;
83
84
0
                    if (shift < 0)
85
0
                      abort ();
86
0
                    if (shift > 0)
87
0
                      {
88
0
                        decomposed_count += shift;
89
0
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
0
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
0
                          decomposed[j + shift] = decomposed[j];
93
0
                      }
94
0
                    for (; shift >= 0; shift--)
95
0
                      decomposed[curr + shift] = curr_decomposed[shift];
96
0
                  }
97
47.4k
                else
98
47.4k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
47.4k
                    curr++;
101
47.4k
                  }
102
47.4k
              }
103
47.4k
          }
104
10.5k
        else
105
10.5k
          {
106
10.5k
            count = 0;
107
10.5k
            decomposed_count = 0;
108
10.5k
          }
109
110
57.9k
        int i = 0;
111
57.9k
        for (;;)
112
105k
          {
113
105k
            ucs4_t uc;
114
105k
            int ccc;
115
116
105k
            if (s < s_end)
117
94.8k
              {
118
                /* Fetch the next character from the decomposition.  */
119
94.8k
                if (i == decomposed_count)
120
47.4k
                  break;
121
47.4k
                uc = decomposed[i];
122
47.4k
                ccc = uc_combining_class (uc);
123
47.4k
              }
124
10.5k
            else
125
10.5k
              {
126
                /* End of string reached.  */
127
10.5k
                uc = 0;
128
10.5k
                ccc = 0;
129
10.5k
              }
130
131
57.9k
            if (ccc == 0)
132
57.9k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
57.9k
                if (sortbuf_count > 1)
136
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
0
                                                           sortbuf + sortbuf_count);
138
139
57.9k
                if (composer != NULL)
140
57.9k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
57.9k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
47.4k
                      {
163
47.4k
                        for (size_t j = 1; j < sortbuf_count; )
164
0
                          {
165
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
0
                              {
167
0
                                ucs4_t combined =
168
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
0
                                if (combined)
170
0
                                  {
171
0
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
0
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
0
                                      sortbuf[k - 1] = sortbuf[k];
175
0
                                    sortbuf_count--;
176
0
                                    continue;
177
0
                                  }
178
0
                              }
179
0
                            j++;
180
0
                          }
181
47.4k
                        if (s < s_end && sortbuf_count == 1)
182
36.8k
                          {
183
36.8k
                            ucs4_t combined =
184
36.8k
                              composer (sortbuf[0].code, uc);
185
36.8k
                            if (combined)
186
0
                              {
187
0
                                uc = combined;
188
0
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
0
                                sortbuf_count = 0;
193
0
                              }
194
36.8k
                          }
195
47.4k
                      }
196
57.9k
                  }
197
198
105k
                for (size_t j = 0; j < sortbuf_count; j++)
199
47.4k
                  {
200
47.4k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
47.4k
                    if (length < allocated)
204
36.8k
                      {
205
36.8k
                        int ret =
206
36.8k
                          U_UCTOMB (result + length, muc, allocated - length);
207
36.8k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
36.8k
                        if (ret >= 0)
213
36.8k
                          {
214
36.8k
                            length += ret;
215
36.8k
                            goto done_appending;
216
36.8k
                          }
217
36.8k
                      }
218
10.5k
                    {
219
10.5k
                      size_t old_allocated = allocated;
220
10.5k
                      size_t new_allocated = 2 * old_allocated;
221
10.5k
                      if (new_allocated < 64)
222
10.5k
                        new_allocated = 64;
223
10.5k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
10.5k
                      {
226
10.5k
                        UNIT *larger_result;
227
10.5k
                        if (result == NULL)
228
10.5k
                          {
229
10.5k
                            larger_result =
230
10.5k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
10.5k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
10.5k
                          }
237
0
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
0
                        else
249
0
                          {
250
0
                            larger_result =
251
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
0
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
0
                          }
258
10.5k
                        result = larger_result;
259
10.5k
                        allocated = new_allocated;
260
10.5k
                        {
261
10.5k
                          int ret =
262
10.5k
                            U_UCTOMB (result + length, muc, allocated - length);
263
10.5k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
10.5k
                          if (ret < 0)
269
0
                            abort ();
270
10.5k
                          length += ret;
271
10.5k
                          goto done_appending;
272
10.5k
                        }
273
10.5k
                      }
274
10.5k
                    }
275
47.4k
                   done_appending: ;
276
47.4k
                  }
277
278
                /* sortbuf is now empty.  */
279
57.9k
                sortbuf_count = 0;
280
57.9k
              }
281
282
57.9k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
10.5k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
47.4k
            if (sortbuf_count == sortbuf_allocated)
288
0
              {
289
0
                sortbuf_allocated = 2 * sortbuf_allocated;
290
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
0
                struct ucs4_with_ccc *new_sortbuf =
293
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
0
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
0
                memcpy (new_sortbuf, sortbuf,
300
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
0
                if (sortbuf != sortbuf_preallocated)
302
0
                  free (sortbuf);
303
0
                sortbuf = new_sortbuf;
304
0
              }
305
47.4k
            sortbuf[sortbuf_count].code = uc;
306
47.4k
            sortbuf[sortbuf_count].ccc = ccc;
307
47.4k
            sortbuf_count++;
308
309
47.4k
            i++;
310
47.4k
          }
311
312
57.9k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
10.5k
          break;
315
316
47.4k
        s += count;
317
47.4k
      }
318
10.5k
  }
319
320
10.5k
  if (length == 0)
321
0
    {
322
0
      if (result == NULL)
323
0
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
0
          result = (UNIT *) malloc (1);
326
0
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
0
        }
332
0
    }
333
10.5k
  else if (result != resultbuf && length < allocated)
334
10.5k
    {
335
      /* Shrink the allocated memory if possible.  */
336
10.5k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
10.5k
      if (memory != NULL)
338
10.5k
        result = memory;
339
10.5k
    }
340
341
10.5k
  if (sortbuf_count > 0)
342
0
    abort ();
343
10.5k
  if (sortbuf != sortbuf_preallocated)
344
10.5k
    free (sortbuf);
345
346
10.5k
  *lengthp = length;
347
10.5k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
10.5k
}
u32_normalize
Line
Count
Source
21
28.9k
{
22
28.9k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
28.9k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
28.9k
  UNIT *result;
27
28.9k
  size_t allocated;
28
28.9k
  if (resultbuf == NULL)
29
28.9k
    {
30
28.9k
      result = NULL;
31
28.9k
      allocated = 0;
32
28.9k
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
28.9k
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
28.9k
  #define SORTBUF_PREALLOCATED 64
42
28.9k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
28.9k
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
28.9k
    sortbuf_preallocated;
45
28.9k
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
28.9k
  size_t sortbuf_count = 0;
47
48
28.9k
  {
49
28.9k
    const UNIT *s_end = s + n;
50
51
28.9k
    for (;;)
52
311k
      {
53
311k
        int count;
54
311k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
311k
        int decomposed_count;
56
57
311k
        if (s < s_end)
58
282k
          {
59
            /* Fetch the next character.  */
60
282k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
282k
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
592k
            for (int curr = 0; curr < decomposed_count; )
70
309k
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
309k
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
309k
                int curr_decomposed_count;
75
76
309k
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
309k
                if (curr_decomposed_count >= 0)
78
13.8k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
13.8k
                    int shift = curr_decomposed_count - 1;
83
84
13.8k
                    if (shift < 0)
85
0
                      abort ();
86
13.8k
                    if (shift > 0)
87
13.5k
                      {
88
13.5k
                        decomposed_count += shift;
89
13.5k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
19.1k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
5.56k
                          decomposed[j + shift] = decomposed[j];
93
13.5k
                      }
94
41.1k
                    for (; shift >= 0; shift--)
95
27.3k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
13.8k
                  }
97
295k
                else
98
295k
                  {
99
                    /* decomposed[curr] is atomic.  */
100
295k
                    curr++;
101
295k
                  }
102
309k
              }
103
282k
          }
104
28.9k
        else
105
28.9k
          {
106
28.9k
            count = 0;
107
28.9k
            decomposed_count = 0;
108
28.9k
          }
109
110
311k
        int i = 0;
111
311k
        for (;;)
112
607k
          {
113
607k
            ucs4_t uc;
114
607k
            int ccc;
115
116
607k
            if (s < s_end)
117
578k
              {
118
                /* Fetch the next character from the decomposition.  */
119
578k
                if (i == decomposed_count)
120
282k
                  break;
121
295k
                uc = decomposed[i];
122
295k
                ccc = uc_combining_class (uc);
123
295k
              }
124
28.9k
            else
125
28.9k
              {
126
                /* End of string reached.  */
127
28.9k
                uc = 0;
128
28.9k
                ccc = 0;
129
28.9k
              }
130
131
324k
            if (ccc == 0)
132
271k
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
271k
                if (sortbuf_count > 1)
136
11.4k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
11.4k
                                                           sortbuf + sortbuf_count);
138
139
271k
                if (composer != NULL)
140
271k
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
271k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
242k
                      {
163
293k
                        for (size_t j = 1; j < sortbuf_count; )
164
51.7k
                          {
165
51.7k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
24.5k
                              {
167
24.5k
                                ucs4_t combined =
168
24.5k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
24.5k
                                if (combined)
170
9.56k
                                  {
171
9.56k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
33.2k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
23.6k
                                      sortbuf[k - 1] = sortbuf[k];
175
9.56k
                                    sortbuf_count--;
176
9.56k
                                    continue;
177
9.56k
                                  }
178
24.5k
                              }
179
42.1k
                            j++;
180
42.1k
                          }
181
242k
                        if (s < s_end && sortbuf_count == 1)
182
207k
                          {
183
207k
                            ucs4_t combined =
184
207k
                              composer (sortbuf[0].code, uc);
185
207k
                            if (combined)
186
3.12k
                              {
187
3.12k
                                uc = combined;
188
3.12k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
3.12k
                                sortbuf_count = 0;
193
3.12k
                              }
194
207k
                          }
195
242k
                      }
196
271k
                  }
197
198
554k
                for (size_t j = 0; j < sortbuf_count; j++)
199
283k
                  {
200
283k
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
283k
                    if (length < allocated)
204
253k
                      {
205
253k
                        int ret =
206
253k
                          U_UCTOMB (result + length, muc, allocated - length);
207
253k
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
253k
                        if (ret >= 0)
213
253k
                          {
214
253k
                            length += ret;
215
253k
                            goto done_appending;
216
253k
                          }
217
253k
                      }
218
29.3k
                    {
219
29.3k
                      size_t old_allocated = allocated;
220
29.3k
                      size_t new_allocated = 2 * old_allocated;
221
29.3k
                      if (new_allocated < 64)
222
28.3k
                        new_allocated = 64;
223
29.3k
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
29.3k
                      {
226
29.3k
                        UNIT *larger_result;
227
29.3k
                        if (result == NULL)
228
28.3k
                          {
229
28.3k
                            larger_result =
230
28.3k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
28.3k
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
28.3k
                          }
237
1.05k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
1.05k
                        else
249
1.05k
                          {
250
1.05k
                            larger_result =
251
1.05k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
1.05k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
1.05k
                          }
258
29.3k
                        result = larger_result;
259
29.3k
                        allocated = new_allocated;
260
29.3k
                        {
261
29.3k
                          int ret =
262
29.3k
                            U_UCTOMB (result + length, muc, allocated - length);
263
29.3k
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
29.3k
                          if (ret < 0)
269
0
                            abort ();
270
29.3k
                          length += ret;
271
29.3k
                          goto done_appending;
272
29.3k
                        }
273
29.3k
                      }
274
29.3k
                    }
275
283k
                   done_appending: ;
276
283k
                  }
277
278
                /* sortbuf is now empty.  */
279
271k
                sortbuf_count = 0;
280
271k
              }
281
282
324k
            if (!(s < s_end))
283
              /* End of string reached.  */
284
28.9k
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
295k
            if (sortbuf_count == sortbuf_allocated)
288
176
              {
289
176
                sortbuf_allocated = 2 * sortbuf_allocated;
290
176
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
176
                struct ucs4_with_ccc *new_sortbuf =
293
176
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
176
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
176
                memcpy (new_sortbuf, sortbuf,
300
176
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
176
                if (sortbuf != sortbuf_preallocated)
302
176
                  free (sortbuf);
303
176
                sortbuf = new_sortbuf;
304
176
              }
305
295k
            sortbuf[sortbuf_count].code = uc;
306
295k
            sortbuf[sortbuf_count].ccc = ccc;
307
295k
            sortbuf_count++;
308
309
295k
            i++;
310
295k
          }
311
312
311k
        if (!(s < s_end))
313
          /* End of string reached.  */
314
28.9k
          break;
315
316
282k
        s += count;
317
282k
      }
318
28.9k
  }
319
320
28.9k
  if (length == 0)
321
603
    {
322
603
      if (result == NULL)
323
603
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
603
          result = (UNIT *) malloc (1);
326
603
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
603
        }
332
603
    }
333
28.3k
  else if (result != resultbuf && length < allocated)
334
28.2k
    {
335
      /* Shrink the allocated memory if possible.  */
336
28.2k
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
28.2k
      if (memory != NULL)
338
28.2k
        result = memory;
339
28.2k
    }
340
341
28.9k
  if (sortbuf_count > 0)
342
0
    abort ();
343
28.9k
  if (sortbuf != sortbuf_preallocated)
344
28.9k
    free (sortbuf);
345
346
28.9k
  *lengthp = length;
347
28.9k
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
28.9k
}