Coverage Report

Created: 2026-02-14 06:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
38.1k
{
22
38.1k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
38.1k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
38.1k
  UNIT *result;
27
38.1k
  size_t length;
28
38.1k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
38.1k
  #define SORTBUF_PREALLOCATED 64
31
38.1k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
38.1k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
38.1k
  size_t sortbuf_allocated;
34
38.1k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
38.1k
  if (resultbuf == NULL)
38
38.1k
    {
39
38.1k
      result = NULL;
40
38.1k
      allocated = 0;
41
38.1k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
38.1k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
38.1k
  sortbuf = sortbuf_preallocated;
51
38.1k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
38.1k
  sortbuf_count = 0;
53
54
38.1k
  {
55
38.1k
    const UNIT *s_end = s + n;
56
57
38.1k
    for (;;)
58
358k
      {
59
358k
        int count;
60
358k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
358k
        int decomposed_count;
62
358k
        int i;
63
64
358k
        if (s < s_end)
65
320k
          {
66
            /* Fetch the next character.  */
67
320k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
320k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
320k
            {
77
320k
              int curr;
78
79
668k
              for (curr = 0; curr < decomposed_count; )
80
348k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
348k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
348k
                  int curr_decomposed_count;
85
86
348k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
348k
                  if (curr_decomposed_count >= 0)
88
14.0k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
14.0k
                      int shift = curr_decomposed_count - 1;
93
94
14.0k
                      if (shift < 0)
95
0
                        abort ();
96
14.0k
                      if (shift > 0)
97
13.8k
                        {
98
13.8k
                          int j;
99
100
13.8k
                          decomposed_count += shift;
101
13.8k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
19.4k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
5.61k
                            decomposed[j + shift] = decomposed[j];
105
13.8k
                        }
106
41.8k
                      for (; shift >= 0; shift--)
107
27.8k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
14.0k
                    }
109
334k
                  else
110
334k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
334k
                      curr++;
113
334k
                    }
114
348k
                }
115
320k
            }
116
320k
          }
117
38.1k
        else
118
38.1k
          {
119
38.1k
            count = 0;
120
38.1k
            decomposed_count = 0;
121
38.1k
          }
122
123
358k
        i = 0;
124
358k
        for (;;)
125
692k
          {
126
692k
            ucs4_t uc;
127
692k
            int ccc;
128
129
692k
            if (s < s_end)
130
654k
              {
131
                /* Fetch the next character from the decomposition.  */
132
654k
                if (i == decomposed_count)
133
320k
                  break;
134
334k
                uc = decomposed[i];
135
334k
                ccc = uc_combining_class (uc);
136
334k
              }
137
38.1k
            else
138
38.1k
              {
139
                /* End of string reached.  */
140
38.1k
                uc = 0;
141
38.1k
                ccc = 0;
142
38.1k
              }
143
144
372k
            if (ccc == 0)
145
318k
              {
146
318k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
318k
                if (sortbuf_count > 1)
151
11.4k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
11.4k
                                                           sortbuf + sortbuf_count);
153
154
318k
                if (composer != NULL)
155
318k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
318k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
280k
                      {
178
331k
                        for (j = 1; j < sortbuf_count; )
179
50.9k
                          {
180
50.9k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
24.4k
                              {
182
24.4k
                                ucs4_t combined =
183
24.4k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
24.4k
                                if (combined)
185
9.65k
                                  {
186
9.65k
                                    size_t k;
187
188
9.65k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
34.1k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
24.5k
                                      sortbuf[k - 1] = sortbuf[k];
192
9.65k
                                    sortbuf_count--;
193
9.65k
                                    continue;
194
9.65k
                                  }
195
24.4k
                              }
196
41.3k
                            j++;
197
41.3k
                          }
198
280k
                        if (s < s_end && sortbuf_count == 1)
199
236k
                          {
200
236k
                            ucs4_t combined =
201
236k
                              composer (sortbuf[0].code, uc);
202
236k
                            if (combined)
203
3.36k
                              {
204
3.36k
                                uc = combined;
205
3.36k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
3.36k
                                sortbuf_count = 0;
210
3.36k
                              }
211
236k
                          }
212
280k
                      }
213
318k
                  }
214
215
639k
                for (j = 0; j < sortbuf_count; j++)
216
321k
                  {
217
321k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
321k
                    if (length < allocated)
221
282k
                      {
222
282k
                        int ret =
223
282k
                          U_UCTOMB (result + length, muc, allocated - length);
224
282k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
282k
                        if (ret >= 0)
230
282k
                          {
231
282k
                            length += ret;
232
282k
                            goto done_appending;
233
282k
                          }
234
282k
                      }
235
38.5k
                    {
236
38.5k
                      size_t old_allocated = allocated;
237
38.5k
                      size_t new_allocated = 2 * old_allocated;
238
38.5k
                      if (new_allocated < 64)
239
37.5k
                        new_allocated = 64;
240
38.5k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
38.5k
                      {
243
38.5k
                        UNIT *larger_result;
244
38.5k
                        if (result == NULL)
245
37.5k
                          {
246
37.5k
                            larger_result =
247
37.5k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
37.5k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
37.5k
                          }
254
1.04k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
1.04k
                        else
266
1.04k
                          {
267
1.04k
                            larger_result =
268
1.04k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
1.04k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
1.04k
                          }
275
38.5k
                        result = larger_result;
276
38.5k
                        allocated = new_allocated;
277
38.5k
                        {
278
38.5k
                          int ret =
279
38.5k
                            U_UCTOMB (result + length, muc, allocated - length);
280
38.5k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
38.5k
                          if (ret < 0)
286
0
                            abort ();
287
38.5k
                          length += ret;
288
38.5k
                          goto done_appending;
289
38.5k
                        }
290
38.5k
                      }
291
38.5k
                    }
292
321k
                   done_appending: ;
293
321k
                  }
294
295
                /* sortbuf is now empty.  */
296
318k
                sortbuf_count = 0;
297
318k
              }
298
299
372k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
38.1k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
334k
            if (sortbuf_count == sortbuf_allocated)
305
180
              {
306
180
                struct ucs4_with_ccc *new_sortbuf;
307
308
180
                sortbuf_allocated = 2 * sortbuf_allocated;
309
180
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
180
                new_sortbuf =
312
180
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
180
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
180
                memcpy (new_sortbuf, sortbuf,
319
180
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
180
                if (sortbuf != sortbuf_preallocated)
321
180
                  free (sortbuf);
322
180
                sortbuf = new_sortbuf;
323
180
              }
324
334k
            sortbuf[sortbuf_count].code = uc;
325
334k
            sortbuf[sortbuf_count].ccc = ccc;
326
334k
            sortbuf_count++;
327
328
334k
            i++;
329
334k
          }
330
331
358k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
38.1k
          break;
334
335
320k
        s += count;
336
320k
      }
337
38.1k
  }
338
339
38.1k
  if (length == 0)
340
572
    {
341
572
      if (result == NULL)
342
572
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
572
          result = (UNIT *) malloc (1);
345
572
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
572
        }
351
572
    }
352
37.5k
  else if (result != resultbuf && length < allocated)
353
37.4k
    {
354
      /* Shrink the allocated memory if possible.  */
355
37.4k
      UNIT *memory;
356
357
37.4k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
37.4k
      if (memory != NULL)
359
37.4k
        result = memory;
360
37.4k
    }
361
362
38.1k
  if (sortbuf_count > 0)
363
0
    abort ();
364
38.1k
  if (sortbuf != sortbuf_preallocated)
365
38.1k
    free (sortbuf);
366
367
38.1k
  *lengthp = length;
368
38.1k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
38.1k
}
u16_normalize
Line
Count
Source
21
10.3k
{
22
10.3k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
10.3k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
10.3k
  UNIT *result;
27
10.3k
  size_t length;
28
10.3k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
10.3k
  #define SORTBUF_PREALLOCATED 64
31
10.3k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
10.3k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
10.3k
  size_t sortbuf_allocated;
34
10.3k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
10.3k
  if (resultbuf == NULL)
38
10.3k
    {
39
10.3k
      result = NULL;
40
10.3k
      allocated = 0;
41
10.3k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
10.3k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
10.3k
  sortbuf = sortbuf_preallocated;
51
10.3k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
10.3k
  sortbuf_count = 0;
53
54
10.3k
  {
55
10.3k
    const UNIT *s_end = s + n;
56
57
10.3k
    for (;;)
58
56.4k
      {
59
56.4k
        int count;
60
56.4k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
56.4k
        int decomposed_count;
62
56.4k
        int i;
63
64
56.4k
        if (s < s_end)
65
46.0k
          {
66
            /* Fetch the next character.  */
67
46.0k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
46.0k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
46.0k
            {
77
46.0k
              int curr;
78
79
92.1k
              for (curr = 0; curr < decomposed_count; )
80
46.0k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
46.0k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
46.0k
                  int curr_decomposed_count;
85
86
46.0k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
46.0k
                  if (curr_decomposed_count >= 0)
88
0
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
0
                      int shift = curr_decomposed_count - 1;
93
94
0
                      if (shift < 0)
95
0
                        abort ();
96
0
                      if (shift > 0)
97
0
                        {
98
0
                          int j;
99
100
0
                          decomposed_count += shift;
101
0
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
0
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
0
                            decomposed[j + shift] = decomposed[j];
105
0
                        }
106
0
                      for (; shift >= 0; shift--)
107
0
                        decomposed[curr + shift] = curr_decomposed[shift];
108
0
                    }
109
46.0k
                  else
110
46.0k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
46.0k
                      curr++;
113
46.0k
                    }
114
46.0k
                }
115
46.0k
            }
116
46.0k
          }
117
10.3k
        else
118
10.3k
          {
119
10.3k
            count = 0;
120
10.3k
            decomposed_count = 0;
121
10.3k
          }
122
123
56.4k
        i = 0;
124
56.4k
        for (;;)
125
102k
          {
126
102k
            ucs4_t uc;
127
102k
            int ccc;
128
129
102k
            if (s < s_end)
130
92.1k
              {
131
                /* Fetch the next character from the decomposition.  */
132
92.1k
                if (i == decomposed_count)
133
46.0k
                  break;
134
46.0k
                uc = decomposed[i];
135
46.0k
                ccc = uc_combining_class (uc);
136
46.0k
              }
137
10.3k
            else
138
10.3k
              {
139
                /* End of string reached.  */
140
10.3k
                uc = 0;
141
10.3k
                ccc = 0;
142
10.3k
              }
143
144
56.4k
            if (ccc == 0)
145
56.4k
              {
146
56.4k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
56.4k
                if (sortbuf_count > 1)
151
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
0
                                                           sortbuf + sortbuf_count);
153
154
56.4k
                if (composer != NULL)
155
56.4k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
56.4k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
46.0k
                      {
178
46.0k
                        for (j = 1; j < sortbuf_count; )
179
0
                          {
180
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
0
                              {
182
0
                                ucs4_t combined =
183
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
0
                                if (combined)
185
0
                                  {
186
0
                                    size_t k;
187
188
0
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
0
                                    for (k = j + 1; k < sortbuf_count; k++)
191
0
                                      sortbuf[k - 1] = sortbuf[k];
192
0
                                    sortbuf_count--;
193
0
                                    continue;
194
0
                                  }
195
0
                              }
196
0
                            j++;
197
0
                          }
198
46.0k
                        if (s < s_end && sortbuf_count == 1)
199
35.7k
                          {
200
35.7k
                            ucs4_t combined =
201
35.7k
                              composer (sortbuf[0].code, uc);
202
35.7k
                            if (combined)
203
0
                              {
204
0
                                uc = combined;
205
0
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
0
                                sortbuf_count = 0;
210
0
                              }
211
35.7k
                          }
212
46.0k
                      }
213
56.4k
                  }
214
215
102k
                for (j = 0; j < sortbuf_count; j++)
216
46.0k
                  {
217
46.0k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
46.0k
                    if (length < allocated)
221
35.7k
                      {
222
35.7k
                        int ret =
223
35.7k
                          U_UCTOMB (result + length, muc, allocated - length);
224
35.7k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
35.7k
                        if (ret >= 0)
230
35.7k
                          {
231
35.7k
                            length += ret;
232
35.7k
                            goto done_appending;
233
35.7k
                          }
234
35.7k
                      }
235
10.3k
                    {
236
10.3k
                      size_t old_allocated = allocated;
237
10.3k
                      size_t new_allocated = 2 * old_allocated;
238
10.3k
                      if (new_allocated < 64)
239
10.3k
                        new_allocated = 64;
240
10.3k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
10.3k
                      {
243
10.3k
                        UNIT *larger_result;
244
10.3k
                        if (result == NULL)
245
10.3k
                          {
246
10.3k
                            larger_result =
247
10.3k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
10.3k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
10.3k
                          }
254
0
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
0
                        else
266
0
                          {
267
0
                            larger_result =
268
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
0
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
0
                          }
275
10.3k
                        result = larger_result;
276
10.3k
                        allocated = new_allocated;
277
10.3k
                        {
278
10.3k
                          int ret =
279
10.3k
                            U_UCTOMB (result + length, muc, allocated - length);
280
10.3k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
10.3k
                          if (ret < 0)
286
0
                            abort ();
287
10.3k
                          length += ret;
288
10.3k
                          goto done_appending;
289
10.3k
                        }
290
10.3k
                      }
291
10.3k
                    }
292
46.0k
                   done_appending: ;
293
46.0k
                  }
294
295
                /* sortbuf is now empty.  */
296
56.4k
                sortbuf_count = 0;
297
56.4k
              }
298
299
56.4k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
10.3k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
46.0k
            if (sortbuf_count == sortbuf_allocated)
305
0
              {
306
0
                struct ucs4_with_ccc *new_sortbuf;
307
308
0
                sortbuf_allocated = 2 * sortbuf_allocated;
309
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
0
                new_sortbuf =
312
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
0
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
0
                memcpy (new_sortbuf, sortbuf,
319
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
0
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
0
                sortbuf = new_sortbuf;
323
0
              }
324
46.0k
            sortbuf[sortbuf_count].code = uc;
325
46.0k
            sortbuf[sortbuf_count].ccc = ccc;
326
46.0k
            sortbuf_count++;
327
328
46.0k
            i++;
329
46.0k
          }
330
331
56.4k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
10.3k
          break;
334
335
46.0k
        s += count;
336
46.0k
      }
337
10.3k
  }
338
339
10.3k
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
10.3k
  else if (result != resultbuf && length < allocated)
353
10.3k
    {
354
      /* Shrink the allocated memory if possible.  */
355
10.3k
      UNIT *memory;
356
357
10.3k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
10.3k
      if (memory != NULL)
359
10.3k
        result = memory;
360
10.3k
    }
361
362
10.3k
  if (sortbuf_count > 0)
363
0
    abort ();
364
10.3k
  if (sortbuf != sortbuf_preallocated)
365
10.3k
    free (sortbuf);
366
367
10.3k
  *lengthp = length;
368
10.3k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
10.3k
}
u32_normalize
Line
Count
Source
21
27.7k
{
22
27.7k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
27.7k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
27.7k
  UNIT *result;
27
27.7k
  size_t length;
28
27.7k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
27.7k
  #define SORTBUF_PREALLOCATED 64
31
27.7k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
27.7k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
27.7k
  size_t sortbuf_allocated;
34
27.7k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
27.7k
  if (resultbuf == NULL)
38
27.7k
    {
39
27.7k
      result = NULL;
40
27.7k
      allocated = 0;
41
27.7k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
27.7k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
27.7k
  sortbuf = sortbuf_preallocated;
51
27.7k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
27.7k
  sortbuf_count = 0;
53
54
27.7k
  {
55
27.7k
    const UNIT *s_end = s + n;
56
57
27.7k
    for (;;)
58
301k
      {
59
301k
        int count;
60
301k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
301k
        int decomposed_count;
62
301k
        int i;
63
64
301k
        if (s < s_end)
65
274k
          {
66
            /* Fetch the next character.  */
67
274k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
274k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
274k
            {
77
274k
              int curr;
78
79
576k
              for (curr = 0; curr < decomposed_count; )
80
302k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
302k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
302k
                  int curr_decomposed_count;
85
86
302k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
302k
                  if (curr_decomposed_count >= 0)
88
14.0k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
14.0k
                      int shift = curr_decomposed_count - 1;
93
94
14.0k
                      if (shift < 0)
95
0
                        abort ();
96
14.0k
                      if (shift > 0)
97
13.8k
                        {
98
13.8k
                          int j;
99
100
13.8k
                          decomposed_count += shift;
101
13.8k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
19.4k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
5.61k
                            decomposed[j + shift] = decomposed[j];
105
13.8k
                        }
106
41.8k
                      for (; shift >= 0; shift--)
107
27.8k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
14.0k
                    }
109
288k
                  else
110
288k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
288k
                      curr++;
113
288k
                    }
114
302k
                }
115
274k
            }
116
274k
          }
117
27.7k
        else
118
27.7k
          {
119
27.7k
            count = 0;
120
27.7k
            decomposed_count = 0;
121
27.7k
          }
122
123
301k
        i = 0;
124
301k
        for (;;)
125
589k
          {
126
589k
            ucs4_t uc;
127
589k
            int ccc;
128
129
589k
            if (s < s_end)
130
562k
              {
131
                /* Fetch the next character from the decomposition.  */
132
562k
                if (i == decomposed_count)
133
274k
                  break;
134
288k
                uc = decomposed[i];
135
288k
                ccc = uc_combining_class (uc);
136
288k
              }
137
27.7k
            else
138
27.7k
              {
139
                /* End of string reached.  */
140
27.7k
                uc = 0;
141
27.7k
                ccc = 0;
142
27.7k
              }
143
144
315k
            if (ccc == 0)
145
261k
              {
146
261k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
261k
                if (sortbuf_count > 1)
151
11.4k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
11.4k
                                                           sortbuf + sortbuf_count);
153
154
261k
                if (composer != NULL)
155
261k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
261k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
234k
                      {
178
285k
                        for (j = 1; j < sortbuf_count; )
179
50.9k
                          {
180
50.9k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
24.4k
                              {
182
24.4k
                                ucs4_t combined =
183
24.4k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
24.4k
                                if (combined)
185
9.65k
                                  {
186
9.65k
                                    size_t k;
187
188
9.65k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
34.1k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
24.5k
                                      sortbuf[k - 1] = sortbuf[k];
192
9.65k
                                    sortbuf_count--;
193
9.65k
                                    continue;
194
9.65k
                                  }
195
24.4k
                              }
196
41.3k
                            j++;
197
41.3k
                          }
198
234k
                        if (s < s_end && sortbuf_count == 1)
199
200k
                          {
200
200k
                            ucs4_t combined =
201
200k
                              composer (sortbuf[0].code, uc);
202
200k
                            if (combined)
203
3.36k
                              {
204
3.36k
                                uc = combined;
205
3.36k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
3.36k
                                sortbuf_count = 0;
210
3.36k
                              }
211
200k
                          }
212
234k
                      }
213
261k
                  }
214
215
536k
                for (j = 0; j < sortbuf_count; j++)
216
274k
                  {
217
274k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
274k
                    if (length < allocated)
221
246k
                      {
222
246k
                        int ret =
223
246k
                          U_UCTOMB (result + length, muc, allocated - length);
224
246k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
246k
                        if (ret >= 0)
230
246k
                          {
231
246k
                            length += ret;
232
246k
                            goto done_appending;
233
246k
                          }
234
246k
                      }
235
28.2k
                    {
236
28.2k
                      size_t old_allocated = allocated;
237
28.2k
                      size_t new_allocated = 2 * old_allocated;
238
28.2k
                      if (new_allocated < 64)
239
27.1k
                        new_allocated = 64;
240
28.2k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
28.2k
                      {
243
28.2k
                        UNIT *larger_result;
244
28.2k
                        if (result == NULL)
245
27.1k
                          {
246
27.1k
                            larger_result =
247
27.1k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
27.1k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
27.1k
                          }
254
1.04k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
1.04k
                        else
266
1.04k
                          {
267
1.04k
                            larger_result =
268
1.04k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
1.04k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
1.04k
                          }
275
28.2k
                        result = larger_result;
276
28.2k
                        allocated = new_allocated;
277
28.2k
                        {
278
28.2k
                          int ret =
279
28.2k
                            U_UCTOMB (result + length, muc, allocated - length);
280
28.2k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
28.2k
                          if (ret < 0)
286
0
                            abort ();
287
28.2k
                          length += ret;
288
28.2k
                          goto done_appending;
289
28.2k
                        }
290
28.2k
                      }
291
28.2k
                    }
292
274k
                   done_appending: ;
293
274k
                  }
294
295
                /* sortbuf is now empty.  */
296
261k
                sortbuf_count = 0;
297
261k
              }
298
299
315k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
27.7k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
288k
            if (sortbuf_count == sortbuf_allocated)
305
180
              {
306
180
                struct ucs4_with_ccc *new_sortbuf;
307
308
180
                sortbuf_allocated = 2 * sortbuf_allocated;
309
180
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
180
                new_sortbuf =
312
180
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
180
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
180
                memcpy (new_sortbuf, sortbuf,
319
180
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
180
                if (sortbuf != sortbuf_preallocated)
321
180
                  free (sortbuf);
322
180
                sortbuf = new_sortbuf;
323
180
              }
324
288k
            sortbuf[sortbuf_count].code = uc;
325
288k
            sortbuf[sortbuf_count].ccc = ccc;
326
288k
            sortbuf_count++;
327
328
288k
            i++;
329
288k
          }
330
331
301k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
27.7k
          break;
334
335
274k
        s += count;
336
274k
      }
337
27.7k
  }
338
339
27.7k
  if (length == 0)
340
572
    {
341
572
      if (result == NULL)
342
572
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
572
          result = (UNIT *) malloc (1);
345
572
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
572
        }
351
572
    }
352
27.1k
  else if (result != resultbuf && length < allocated)
353
27.1k
    {
354
      /* Shrink the allocated memory if possible.  */
355
27.1k
      UNIT *memory;
356
357
27.1k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
27.1k
      if (memory != NULL)
359
27.1k
        result = memory;
360
27.1k
    }
361
362
27.7k
  if (sortbuf_count > 0)
363
0
    abort ();
364
27.7k
  if (sortbuf != sortbuf_preallocated)
365
27.7k
    free (sortbuf);
366
367
27.7k
  *lengthp = length;
368
27.7k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
27.7k
}