Coverage Report

Created: 2025-11-05 06:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
12.4M
{
22
12.4M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
12.4M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
12.4M
  UNIT *result;
27
12.4M
  size_t length;
28
12.4M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
12.4M
  #define SORTBUF_PREALLOCATED 64
31
12.4M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
12.4M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
12.4M
  size_t sortbuf_allocated;
34
12.4M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
12.4M
  if (resultbuf == NULL)
38
12.4M
    {
39
12.4M
      result = NULL;
40
12.4M
      allocated = 0;
41
12.4M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
12.4M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
12.4M
  sortbuf = sortbuf_preallocated;
51
12.4M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
12.4M
  sortbuf_count = 0;
53
54
12.4M
  {
55
12.4M
    const UNIT *s_end = s + n;
56
57
12.4M
    for (;;)
58
38.9M
      {
59
38.9M
        int count;
60
38.9M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
38.9M
        int decomposed_count;
62
38.9M
        int i;
63
64
38.9M
        if (s < s_end)
65
26.4M
          {
66
            /* Fetch the next character.  */
67
26.4M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
26.4M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
26.4M
            {
77
26.4M
              int curr;
78
79
61.5M
              for (curr = 0; curr < decomposed_count; )
80
35.0M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
35.0M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
35.0M
                  int curr_decomposed_count;
85
86
35.0M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
35.0M
                  if (curr_decomposed_count >= 0)
88
2.94M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
2.94M
                      int shift = curr_decomposed_count - 1;
93
94
2.94M
                      if (shift < 0)
95
0
                        abort ();
96
2.94M
                      if (shift > 0)
97
2.53M
                        {
98
2.53M
                          int j;
99
100
2.53M
                          decomposed_count += shift;
101
2.53M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.56M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
27.3k
                            decomposed[j + shift] = decomposed[j];
105
2.53M
                        }
106
11.5M
                      for (; shift >= 0; shift--)
107
8.57M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
2.94M
                    }
109
32.1M
                  else
110
32.1M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
32.1M
                      curr++;
113
32.1M
                    }
114
35.0M
                }
115
26.4M
            }
116
26.4M
          }
117
12.4M
        else
118
12.4M
          {
119
12.4M
            count = 0;
120
12.4M
            decomposed_count = 0;
121
12.4M
          }
122
123
38.9M
        i = 0;
124
38.9M
        for (;;)
125
71.0M
          {
126
71.0M
            ucs4_t uc;
127
71.0M
            int ccc;
128
129
71.0M
            if (s < s_end)
130
58.5M
              {
131
                /* Fetch the next character from the decomposition.  */
132
58.5M
                if (i == decomposed_count)
133
26.4M
                  break;
134
32.1M
                uc = decomposed[i];
135
32.1M
                ccc = uc_combining_class (uc);
136
32.1M
              }
137
12.4M
            else
138
12.4M
              {
139
                /* End of string reached.  */
140
12.4M
                uc = 0;
141
12.4M
                ccc = 0;
142
12.4M
              }
143
144
44.5M
            if (ccc == 0)
145
41.7M
              {
146
41.7M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
41.7M
                if (sortbuf_count > 1)
151
1.48M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.48M
                                                           sortbuf + sortbuf_count);
153
154
41.7M
                if (composer != NULL)
155
41.7M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
41.7M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
29.2M
                      {
178
32.0M
                        for (j = 1; j < sortbuf_count; )
179
2.80M
                          {
180
2.80M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.52M
                              {
182
1.52M
                                ucs4_t combined =
183
1.52M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.52M
                                if (combined)
185
1.47M
                                  {
186
1.47M
                                    size_t k;
187
188
1.47M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.13M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.66M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.47M
                                    sortbuf_count--;
193
1.47M
                                    continue;
194
1.47M
                                  }
195
1.52M
                              }
196
1.33M
                            j++;
197
1.33M
                          }
198
29.2M
                        if (s < s_end && sortbuf_count == 1)
199
16.8M
                          {
200
16.8M
                            ucs4_t combined =
201
16.8M
                              composer (sortbuf[0].code, uc);
202
16.8M
                            if (combined)
203
16.2k
                              {
204
16.2k
                                uc = combined;
205
16.2k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
16.2k
                                sortbuf_count = 0;
210
16.2k
                              }
211
16.8M
                          }
212
29.2M
                      }
213
41.7M
                  }
214
215
72.3M
                for (j = 0; j < sortbuf_count; j++)
216
30.6M
                  {
217
30.6M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
30.6M
                    if (length < allocated)
221
18.2M
                      {
222
18.2M
                        int ret =
223
18.2M
                          U_UCTOMB (result + length, muc, allocated - length);
224
18.2M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
18.2M
                        if (ret >= 0)
230
18.2M
                          {
231
18.2M
                            length += ret;
232
18.2M
                            goto done_appending;
233
18.2M
                          }
234
18.2M
                      }
235
12.3M
                    {
236
12.3M
                      size_t old_allocated = allocated;
237
12.3M
                      size_t new_allocated = 2 * old_allocated;
238
12.3M
                      if (new_allocated < 64)
239
12.3M
                        new_allocated = 64;
240
12.3M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
12.3M
                      {
243
12.3M
                        UNIT *larger_result;
244
12.3M
                        if (result == NULL)
245
12.3M
                          {
246
12.3M
                            larger_result =
247
12.3M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
12.3M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
12.3M
                          }
254
15.5k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
15.5k
                        else
266
15.5k
                          {
267
15.5k
                            larger_result =
268
15.5k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
15.5k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
15.5k
                          }
275
12.3M
                        result = larger_result;
276
12.3M
                        allocated = new_allocated;
277
12.3M
                        {
278
12.3M
                          int ret =
279
12.3M
                            U_UCTOMB (result + length, muc, allocated - length);
280
12.3M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
12.3M
                          if (ret < 0)
286
0
                            abort ();
287
12.3M
                          length += ret;
288
12.3M
                          goto done_appending;
289
12.3M
                        }
290
12.3M
                      }
291
12.3M
                    }
292
30.6M
                   done_appending: ;
293
30.6M
                  }
294
295
                /* sortbuf is now empty.  */
296
41.7M
                sortbuf_count = 0;
297
41.7M
              }
298
299
44.5M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
12.4M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
32.1M
            if (sortbuf_count == sortbuf_allocated)
305
1.87k
              {
306
1.87k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.87k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.87k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.87k
                new_sortbuf =
312
1.87k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.87k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.87k
                memcpy (new_sortbuf, sortbuf,
319
1.87k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.87k
                if (sortbuf != sortbuf_preallocated)
321
1.87k
                  free (sortbuf);
322
1.87k
                sortbuf = new_sortbuf;
323
1.87k
              }
324
32.1M
            sortbuf[sortbuf_count].code = uc;
325
32.1M
            sortbuf[sortbuf_count].ccc = ccc;
326
32.1M
            sortbuf_count++;
327
328
32.1M
            i++;
329
32.1M
          }
330
331
38.9M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
12.4M
          break;
334
335
26.4M
        s += count;
336
26.4M
      }
337
12.4M
  }
338
339
12.4M
  if (length == 0)
340
112k
    {
341
112k
      if (result == NULL)
342
112k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
112k
          result = (UNIT *) malloc (1);
345
112k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
112k
        }
351
112k
    }
352
12.3M
  else if (result != resultbuf && length < allocated)
353
12.3M
    {
354
      /* Shrink the allocated memory if possible.  */
355
12.3M
      UNIT *memory;
356
357
12.3M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
12.3M
      if (memory != NULL)
359
12.3M
        result = memory;
360
12.3M
    }
361
362
12.4M
  if (sortbuf_count > 0)
363
0
    abort ();
364
12.4M
  if (sortbuf != sortbuf_preallocated)
365
12.4M
    free (sortbuf);
366
367
12.4M
  *lengthp = length;
368
12.4M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
12.4M
}
u8_normalize
Line
Count
Source
21
4.03M
{
22
4.03M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
4.03M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
4.03M
  UNIT *result;
27
4.03M
  size_t length;
28
4.03M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
4.03M
  #define SORTBUF_PREALLOCATED 64
31
4.03M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
4.03M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
4.03M
  size_t sortbuf_allocated;
34
4.03M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
4.03M
  if (resultbuf == NULL)
38
4.03M
    {
39
4.03M
      result = NULL;
40
4.03M
      allocated = 0;
41
4.03M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
4.03M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
4.03M
  sortbuf = sortbuf_preallocated;
51
4.03M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
4.03M
  sortbuf_count = 0;
53
54
4.03M
  {
55
4.03M
    const UNIT *s_end = s + n;
56
57
4.03M
    for (;;)
58
18.3M
      {
59
18.3M
        int count;
60
18.3M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
18.3M
        int decomposed_count;
62
18.3M
        int i;
63
64
18.3M
        if (s < s_end)
65
14.2M
          {
66
            /* Fetch the next character.  */
67
14.2M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
14.2M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
14.2M
            {
77
14.2M
              int curr;
78
79
37.0M
              for (curr = 0; curr < decomposed_count; )
80
22.7M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
22.7M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
22.7M
                  int curr_decomposed_count;
85
86
22.7M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
22.7M
                  if (curr_decomposed_count >= 0)
88
2.91M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
2.91M
                      int shift = curr_decomposed_count - 1;
93
94
2.91M
                      if (shift < 0)
95
0
                        abort ();
96
2.91M
                      if (shift > 0)
97
2.50M
                        {
98
2.50M
                          int j;
99
100
2.50M
                          decomposed_count += shift;
101
2.50M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.52M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
17.6k
                            decomposed[j + shift] = decomposed[j];
105
2.50M
                        }
106
11.4M
                      for (; shift >= 0; shift--)
107
8.50M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
2.91M
                    }
109
19.8M
                  else
110
19.8M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
19.8M
                      curr++;
113
19.8M
                    }
114
22.7M
                }
115
14.2M
            }
116
14.2M
          }
117
4.03M
        else
118
4.03M
          {
119
4.03M
            count = 0;
120
4.03M
            decomposed_count = 0;
121
4.03M
          }
122
123
18.3M
        i = 0;
124
18.3M
        for (;;)
125
38.2M
          {
126
38.2M
            ucs4_t uc;
127
38.2M
            int ccc;
128
129
38.2M
            if (s < s_end)
130
34.1M
              {
131
                /* Fetch the next character from the decomposition.  */
132
34.1M
                if (i == decomposed_count)
133
14.2M
                  break;
134
19.8M
                uc = decomposed[i];
135
19.8M
                ccc = uc_combining_class (uc);
136
19.8M
              }
137
4.03M
            else
138
4.03M
              {
139
                /* End of string reached.  */
140
4.03M
                uc = 0;
141
4.03M
                ccc = 0;
142
4.03M
              }
143
144
23.9M
            if (ccc == 0)
145
21.1M
              {
146
21.1M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
21.1M
                if (sortbuf_count > 1)
151
1.46M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.46M
                                                           sortbuf + sortbuf_count);
153
154
21.1M
                if (composer != NULL)
155
21.1M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
21.1M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
17.1M
                      {
178
19.8M
                        for (j = 1; j < sortbuf_count; )
179
2.70M
                          {
180
2.70M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.48M
                              {
182
1.48M
                                ucs4_t combined =
183
1.48M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.48M
                                if (combined)
185
1.44M
                                  {
186
1.44M
                                    size_t k;
187
188
1.44M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.09M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.64M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.44M
                                    sortbuf_count--;
193
1.44M
                                    continue;
194
1.44M
                                  }
195
1.48M
                              }
196
1.25M
                            j++;
197
1.25M
                          }
198
17.1M
                        if (s < s_end && sortbuf_count == 1)
199
13.0M
                          {
200
13.0M
                            ucs4_t combined =
201
13.0M
                              composer (sortbuf[0].code, uc);
202
13.0M
                            if (combined)
203
7.11k
                              {
204
7.11k
                                uc = combined;
205
7.11k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
7.11k
                                sortbuf_count = 0;
210
7.11k
                              }
211
13.0M
                          }
212
17.1M
                      }
213
21.1M
                  }
214
215
39.6M
                for (j = 0; j < sortbuf_count; j++)
216
18.4M
                  {
217
18.4M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
18.4M
                    if (length < allocated)
221
14.3M
                      {
222
14.3M
                        int ret =
223
14.3M
                          U_UCTOMB (result + length, muc, allocated - length);
224
14.3M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
14.3M
                        if (ret >= 0)
230
14.3M
                          {
231
14.3M
                            length += ret;
232
14.3M
                            goto done_appending;
233
14.3M
                          }
234
14.3M
                      }
235
4.05M
                    {
236
4.05M
                      size_t old_allocated = allocated;
237
4.05M
                      size_t new_allocated = 2 * old_allocated;
238
4.05M
                      if (new_allocated < 64)
239
4.03M
                        new_allocated = 64;
240
4.05M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
4.05M
                      {
243
4.05M
                        UNIT *larger_result;
244
4.05M
                        if (result == NULL)
245
4.03M
                          {
246
4.03M
                            larger_result =
247
4.03M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
4.03M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
4.03M
                          }
254
12.5k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
12.5k
                        else
266
12.5k
                          {
267
12.5k
                            larger_result =
268
12.5k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
12.5k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
12.5k
                          }
275
4.05M
                        result = larger_result;
276
4.05M
                        allocated = new_allocated;
277
4.05M
                        {
278
4.05M
                          int ret =
279
4.05M
                            U_UCTOMB (result + length, muc, allocated - length);
280
4.05M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
4.05M
                          if (ret < 0)
286
0
                            abort ();
287
4.05M
                          length += ret;
288
4.05M
                          goto done_appending;
289
4.05M
                        }
290
4.05M
                      }
291
4.05M
                    }
292
18.4M
                   done_appending: ;
293
18.4M
                  }
294
295
                /* sortbuf is now empty.  */
296
21.1M
                sortbuf_count = 0;
297
21.1M
              }
298
299
23.9M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
4.03M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
19.8M
            if (sortbuf_count == sortbuf_allocated)
305
1.05k
              {
306
1.05k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.05k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.05k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.05k
                new_sortbuf =
312
1.05k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.05k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.05k
                memcpy (new_sortbuf, sortbuf,
319
1.05k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.05k
                if (sortbuf != sortbuf_preallocated)
321
1.05k
                  free (sortbuf);
322
1.05k
                sortbuf = new_sortbuf;
323
1.05k
              }
324
19.8M
            sortbuf[sortbuf_count].code = uc;
325
19.8M
            sortbuf[sortbuf_count].ccc = ccc;
326
19.8M
            sortbuf_count++;
327
328
19.8M
            i++;
329
19.8M
          }
330
331
18.3M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
4.03M
          break;
334
335
14.2M
        s += count;
336
14.2M
      }
337
4.03M
  }
338
339
4.03M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
4.03M
  else if (result != resultbuf && length < allocated)
353
4.03M
    {
354
      /* Shrink the allocated memory if possible.  */
355
4.03M
      UNIT *memory;
356
357
4.03M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
4.03M
      if (memory != NULL)
359
4.03M
        result = memory;
360
4.03M
    }
361
362
4.03M
  if (sortbuf_count > 0)
363
0
    abort ();
364
4.03M
  if (sortbuf != sortbuf_preallocated)
365
4.03M
    free (sortbuf);
366
367
4.03M
  *lengthp = length;
368
4.03M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
4.03M
}
u32_normalize
Line
Count
Source
21
8.41M
{
22
8.41M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
8.41M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
8.41M
  UNIT *result;
27
8.41M
  size_t length;
28
8.41M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
8.41M
  #define SORTBUF_PREALLOCATED 64
31
8.41M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
8.41M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
8.41M
  size_t sortbuf_allocated;
34
8.41M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
8.41M
  if (resultbuf == NULL)
38
8.41M
    {
39
8.41M
      result = NULL;
40
8.41M
      allocated = 0;
41
8.41M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
8.41M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
8.41M
  sortbuf = sortbuf_preallocated;
51
8.41M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
8.41M
  sortbuf_count = 0;
53
54
8.41M
  {
55
8.41M
    const UNIT *s_end = s + n;
56
57
8.41M
    for (;;)
58
20.6M
      {
59
20.6M
        int count;
60
20.6M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
20.6M
        int decomposed_count;
62
20.6M
        int i;
63
64
20.6M
        if (s < s_end)
65
12.1M
          {
66
            /* Fetch the next character.  */
67
12.1M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
12.1M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
12.1M
            {
77
12.1M
              int curr;
78
79
24.4M
              for (curr = 0; curr < decomposed_count; )
80
12.2M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
12.2M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
12.2M
                  int curr_decomposed_count;
85
86
12.2M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
12.2M
                  if (curr_decomposed_count >= 0)
88
33.9k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
33.9k
                      int shift = curr_decomposed_count - 1;
93
94
33.9k
                      if (shift < 0)
95
0
                        abort ();
96
33.9k
                      if (shift > 0)
97
33.5k
                        {
98
33.5k
                          int j;
99
100
33.5k
                          decomposed_count += shift;
101
33.5k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
43.2k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
9.67k
                            decomposed[j + shift] = decomposed[j];
105
33.5k
                        }
106
101k
                      for (; shift >= 0; shift--)
107
67.4k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
33.9k
                    }
109
12.2M
                  else
110
12.2M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
12.2M
                      curr++;
113
12.2M
                    }
114
12.2M
                }
115
12.1M
            }
116
12.1M
          }
117
8.41M
        else
118
8.41M
          {
119
8.41M
            count = 0;
120
8.41M
            decomposed_count = 0;
121
8.41M
          }
122
123
20.6M
        i = 0;
124
20.6M
        for (;;)
125
32.8M
          {
126
32.8M
            ucs4_t uc;
127
32.8M
            int ccc;
128
129
32.8M
            if (s < s_end)
130
24.4M
              {
131
                /* Fetch the next character from the decomposition.  */
132
24.4M
                if (i == decomposed_count)
133
12.1M
                  break;
134
12.2M
                uc = decomposed[i];
135
12.2M
                ccc = uc_combining_class (uc);
136
12.2M
              }
137
8.41M
            else
138
8.41M
              {
139
                /* End of string reached.  */
140
8.41M
                uc = 0;
141
8.41M
                ccc = 0;
142
8.41M
              }
143
144
20.6M
            if (ccc == 0)
145
20.5M
              {
146
20.5M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
20.5M
                if (sortbuf_count > 1)
151
26.0k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
26.0k
                                                           sortbuf + sortbuf_count);
153
154
20.5M
                if (composer != NULL)
155
20.5M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
20.5M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
12.1M
                      {
178
12.2M
                        for (j = 1; j < sortbuf_count; )
179
99.7k
                          {
180
99.7k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
37.1k
                              {
182
37.1k
                                ucs4_t combined =
183
37.1k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
37.1k
                                if (combined)
185
22.5k
                                  {
186
22.5k
                                    size_t k;
187
188
22.5k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
36.0k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
13.5k
                                      sortbuf[k - 1] = sortbuf[k];
192
22.5k
                                    sortbuf_count--;
193
22.5k
                                    continue;
194
22.5k
                                  }
195
37.1k
                              }
196
77.2k
                            j++;
197
77.2k
                          }
198
12.1M
                        if (s < s_end && sortbuf_count == 1)
199
3.80M
                          {
200
3.80M
                            ucs4_t combined =
201
3.80M
                              composer (sortbuf[0].code, uc);
202
3.80M
                            if (combined)
203
9.16k
                              {
204
9.16k
                                uc = combined;
205
9.16k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
9.16k
                                sortbuf_count = 0;
210
9.16k
                              }
211
3.80M
                          }
212
12.1M
                      }
213
20.5M
                  }
214
215
32.7M
                for (j = 0; j < sortbuf_count; j++)
216
12.1M
                  {
217
12.1M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
12.1M
                    if (length < allocated)
221
3.88M
                      {
222
3.88M
                        int ret =
223
3.88M
                          U_UCTOMB (result + length, muc, allocated - length);
224
3.88M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
3.88M
                        if (ret >= 0)
230
3.88M
                          {
231
3.88M
                            length += ret;
232
3.88M
                            goto done_appending;
233
3.88M
                          }
234
3.88M
                      }
235
8.30M
                    {
236
8.30M
                      size_t old_allocated = allocated;
237
8.30M
                      size_t new_allocated = 2 * old_allocated;
238
8.30M
                      if (new_allocated < 64)
239
8.30M
                        new_allocated = 64;
240
8.30M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
8.30M
                      {
243
8.30M
                        UNIT *larger_result;
244
8.30M
                        if (result == NULL)
245
8.30M
                          {
246
8.30M
                            larger_result =
247
8.30M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
8.30M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
8.30M
                          }
254
2.97k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
2.97k
                        else
266
2.97k
                          {
267
2.97k
                            larger_result =
268
2.97k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
2.97k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
2.97k
                          }
275
8.30M
                        result = larger_result;
276
8.30M
                        allocated = new_allocated;
277
8.30M
                        {
278
8.30M
                          int ret =
279
8.30M
                            U_UCTOMB (result + length, muc, allocated - length);
280
8.30M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
8.30M
                          if (ret < 0)
286
0
                            abort ();
287
8.30M
                          length += ret;
288
8.30M
                          goto done_appending;
289
8.30M
                        }
290
8.30M
                      }
291
8.30M
                    }
292
12.1M
                   done_appending: ;
293
12.1M
                  }
294
295
                /* sortbuf is now empty.  */
296
20.5M
                sortbuf_count = 0;
297
20.5M
              }
298
299
20.6M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
8.41M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
12.2M
            if (sortbuf_count == sortbuf_allocated)
305
825
              {
306
825
                struct ucs4_with_ccc *new_sortbuf;
307
308
825
                sortbuf_allocated = 2 * sortbuf_allocated;
309
825
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
825
                new_sortbuf =
312
825
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
825
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
825
                memcpy (new_sortbuf, sortbuf,
319
825
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
825
                if (sortbuf != sortbuf_preallocated)
321
825
                  free (sortbuf);
322
825
                sortbuf = new_sortbuf;
323
825
              }
324
12.2M
            sortbuf[sortbuf_count].code = uc;
325
12.2M
            sortbuf[sortbuf_count].ccc = ccc;
326
12.2M
            sortbuf_count++;
327
328
12.2M
            i++;
329
12.2M
          }
330
331
20.6M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
8.41M
          break;
334
335
12.1M
        s += count;
336
12.1M
      }
337
8.41M
  }
338
339
8.41M
  if (length == 0)
340
112k
    {
341
112k
      if (result == NULL)
342
112k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
112k
          result = (UNIT *) malloc (1);
345
112k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
112k
        }
351
112k
    }
352
8.30M
  else if (result != resultbuf && length < allocated)
353
8.30M
    {
354
      /* Shrink the allocated memory if possible.  */
355
8.30M
      UNIT *memory;
356
357
8.30M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
8.30M
      if (memory != NULL)
359
8.30M
        result = memory;
360
8.30M
    }
361
362
8.41M
  if (sortbuf_count > 0)
363
0
    abort ();
364
8.41M
  if (sortbuf != sortbuf_preallocated)
365
8.41M
    free (sortbuf);
366
367
8.41M
  *lengthp = length;
368
8.41M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
8.41M
}