Coverage Report

Created: 2025-10-12 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
14.2M
{
22
14.2M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
14.2M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
14.2M
  UNIT *result;
27
14.2M
  size_t length;
28
14.2M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
14.2M
  #define SORTBUF_PREALLOCATED 64
31
14.2M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
14.2M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
14.2M
  size_t sortbuf_allocated;
34
14.2M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
14.2M
  if (resultbuf == NULL)
38
14.2M
    {
39
14.2M
      result = NULL;
40
14.2M
      allocated = 0;
41
14.2M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
14.2M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
14.2M
  sortbuf = sortbuf_preallocated;
51
14.2M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
14.2M
  sortbuf_count = 0;
53
54
14.2M
  {
55
14.2M
    const UNIT *s_end = s + n;
56
57
14.2M
    for (;;)
58
43.7M
      {
59
43.7M
        int count;
60
43.7M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
43.7M
        int decomposed_count;
62
43.7M
        int i;
63
64
43.7M
        if (s < s_end)
65
29.4M
          {
66
            /* Fetch the next character.  */
67
29.4M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
29.4M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
29.4M
            {
77
29.4M
              int curr;
78
79
67.3M
              for (curr = 0; curr < decomposed_count; )
80
37.9M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
37.9M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
37.9M
                  int curr_decomposed_count;
85
86
37.9M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
37.9M
                  if (curr_decomposed_count >= 0)
88
3.03M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.03M
                      int shift = curr_decomposed_count - 1;
93
94
3.03M
                      if (shift < 0)
95
0
                        abort ();
96
3.03M
                      if (shift > 0)
97
2.65M
                        {
98
2.65M
                          int j;
99
100
2.65M
                          decomposed_count += shift;
101
2.65M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.68M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
28.4k
                            decomposed[j + shift] = decomposed[j];
105
2.65M
                        }
106
11.5M
                      for (; shift >= 0; shift--)
107
8.52M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.03M
                    }
109
34.8M
                  else
110
34.8M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
34.8M
                      curr++;
113
34.8M
                    }
114
37.9M
                }
115
29.4M
            }
116
29.4M
          }
117
14.2M
        else
118
14.2M
          {
119
14.2M
            count = 0;
120
14.2M
            decomposed_count = 0;
121
14.2M
          }
122
123
43.7M
        i = 0;
124
43.7M
        for (;;)
125
78.6M
          {
126
78.6M
            ucs4_t uc;
127
78.6M
            int ccc;
128
129
78.6M
            if (s < s_end)
130
64.3M
              {
131
                /* Fetch the next character from the decomposition.  */
132
64.3M
                if (i == decomposed_count)
133
29.4M
                  break;
134
34.8M
                uc = decomposed[i];
135
34.8M
                ccc = uc_combining_class (uc);
136
34.8M
              }
137
14.2M
            else
138
14.2M
              {
139
                /* End of string reached.  */
140
14.2M
                uc = 0;
141
14.2M
                ccc = 0;
142
14.2M
              }
143
144
49.1M
            if (ccc == 0)
145
46.2M
              {
146
46.2M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
46.2M
                if (sortbuf_count > 1)
151
1.51M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.51M
                                                           sortbuf + sortbuf_count);
153
154
46.2M
                if (composer != NULL)
155
46.2M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
46.2M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
31.9M
                      {
178
34.8M
                        for (j = 1; j < sortbuf_count; )
179
2.95M
                          {
180
2.95M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.55M
                              {
182
1.55M
                                ucs4_t combined =
183
1.55M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.55M
                                if (combined)
185
1.49M
                                  {
186
1.49M
                                    size_t k;
187
188
1.49M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.29M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.80M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.49M
                                    sortbuf_count--;
193
1.49M
                                    continue;
194
1.49M
                                  }
195
1.55M
                              }
196
1.46M
                            j++;
197
1.46M
                          }
198
31.9M
                        if (s < s_end && sortbuf_count == 1)
199
17.6M
                          {
200
17.6M
                            ucs4_t combined =
201
17.6M
                              composer (sortbuf[0].code, uc);
202
17.6M
                            if (combined)
203
18.9k
                              {
204
18.9k
                                uc = combined;
205
18.9k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
18.9k
                                sortbuf_count = 0;
210
18.9k
                              }
211
17.6M
                          }
212
31.9M
                      }
213
46.2M
                  }
214
215
79.5M
                for (j = 0; j < sortbuf_count; j++)
216
33.3M
                  {
217
33.3M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
33.3M
                    if (length < allocated)
221
19.1M
                      {
222
19.1M
                        int ret =
223
19.1M
                          U_UCTOMB (result + length, muc, allocated - length);
224
19.1M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
19.1M
                        if (ret >= 0)
230
19.1M
                          {
231
19.1M
                            length += ret;
232
19.1M
                            goto done_appending;
233
19.1M
                          }
234
19.1M
                      }
235
14.2M
                    {
236
14.2M
                      size_t old_allocated = allocated;
237
14.2M
                      size_t new_allocated = 2 * old_allocated;
238
14.2M
                      if (new_allocated < 64)
239
14.1M
                        new_allocated = 64;
240
14.2M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
14.2M
                      {
243
14.2M
                        UNIT *larger_result;
244
14.2M
                        if (result == NULL)
245
14.1M
                          {
246
14.1M
                            larger_result =
247
14.1M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
14.1M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
14.1M
                          }
254
15.3k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
15.3k
                        else
266
15.3k
                          {
267
15.3k
                            larger_result =
268
15.3k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
15.3k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
15.3k
                          }
275
14.2M
                        result = larger_result;
276
14.2M
                        allocated = new_allocated;
277
14.2M
                        {
278
14.2M
                          int ret =
279
14.2M
                            U_UCTOMB (result + length, muc, allocated - length);
280
14.2M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
14.2M
                          if (ret < 0)
286
0
                            abort ();
287
14.2M
                          length += ret;
288
14.2M
                          goto done_appending;
289
14.2M
                        }
290
14.2M
                      }
291
14.2M
                    }
292
33.3M
                   done_appending: ;
293
33.3M
                  }
294
295
                /* sortbuf is now empty.  */
296
46.2M
                sortbuf_count = 0;
297
46.2M
              }
298
299
49.1M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
14.2M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
34.8M
            if (sortbuf_count == sortbuf_allocated)
305
1.91k
              {
306
1.91k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.91k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.91k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.91k
                new_sortbuf =
312
1.91k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.91k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.91k
                memcpy (new_sortbuf, sortbuf,
319
1.91k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.91k
                if (sortbuf != sortbuf_preallocated)
321
1.91k
                  free (sortbuf);
322
1.91k
                sortbuf = new_sortbuf;
323
1.91k
              }
324
34.8M
            sortbuf[sortbuf_count].code = uc;
325
34.8M
            sortbuf[sortbuf_count].ccc = ccc;
326
34.8M
            sortbuf_count++;
327
328
34.8M
            i++;
329
34.8M
          }
330
331
43.7M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
14.2M
          break;
334
335
29.4M
        s += count;
336
29.4M
      }
337
14.2M
  }
338
339
14.2M
  if (length == 0)
340
107k
    {
341
107k
      if (result == NULL)
342
107k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
107k
          result = (UNIT *) malloc (1);
345
107k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
107k
        }
351
107k
    }
352
14.1M
  else if (result != resultbuf && length < allocated)
353
14.1M
    {
354
      /* Shrink the allocated memory if possible.  */
355
14.1M
      UNIT *memory;
356
357
14.1M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
14.1M
      if (memory != NULL)
359
14.1M
        result = memory;
360
14.1M
    }
361
362
14.2M
  if (sortbuf_count > 0)
363
0
    abort ();
364
14.2M
  if (sortbuf != sortbuf_preallocated)
365
14.2M
    free (sortbuf);
366
367
14.2M
  *lengthp = length;
368
14.2M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
14.2M
}
u8_normalize
Line
Count
Source
21
4.63M
{
22
4.63M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
4.63M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
4.63M
  UNIT *result;
27
4.63M
  size_t length;
28
4.63M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
4.63M
  #define SORTBUF_PREALLOCATED 64
31
4.63M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
4.63M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
4.63M
  size_t sortbuf_allocated;
34
4.63M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
4.63M
  if (resultbuf == NULL)
38
4.63M
    {
39
4.63M
      result = NULL;
40
4.63M
      allocated = 0;
41
4.63M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
4.63M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
4.63M
  sortbuf = sortbuf_preallocated;
51
4.63M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
4.63M
  sortbuf_count = 0;
53
54
4.63M
  {
55
4.63M
    const UNIT *s_end = s + n;
56
57
4.63M
    for (;;)
58
20.1M
      {
59
20.1M
        int count;
60
20.1M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
20.1M
        int decomposed_count;
62
20.1M
        int i;
63
64
20.1M
        if (s < s_end)
65
15.5M
          {
66
            /* Fetch the next character.  */
67
15.5M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
15.5M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
15.5M
            {
77
15.5M
              int curr;
78
79
39.5M
              for (curr = 0; curr < decomposed_count; )
80
24.0M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
24.0M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
24.0M
                  int curr_decomposed_count;
85
86
24.0M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
24.0M
                  if (curr_decomposed_count >= 0)
88
2.99M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
2.99M
                      int shift = curr_decomposed_count - 1;
93
94
2.99M
                      if (shift < 0)
95
0
                        abort ();
96
2.99M
                      if (shift > 0)
97
2.61M
                        {
98
2.61M
                          int j;
99
100
2.61M
                          decomposed_count += shift;
101
2.61M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.63M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
18.2k
                            decomposed[j + shift] = decomposed[j];
105
2.61M
                        }
106
11.4M
                      for (; shift >= 0; shift--)
107
8.45M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
2.99M
                    }
109
21.0M
                  else
110
21.0M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
21.0M
                      curr++;
113
21.0M
                    }
114
24.0M
                }
115
15.5M
            }
116
15.5M
          }
117
4.63M
        else
118
4.63M
          {
119
4.63M
            count = 0;
120
4.63M
            decomposed_count = 0;
121
4.63M
          }
122
123
20.1M
        i = 0;
124
20.1M
        for (;;)
125
41.2M
          {
126
41.2M
            ucs4_t uc;
127
41.2M
            int ccc;
128
129
41.2M
            if (s < s_end)
130
36.5M
              {
131
                /* Fetch the next character from the decomposition.  */
132
36.5M
                if (i == decomposed_count)
133
15.5M
                  break;
134
21.0M
                uc = decomposed[i];
135
21.0M
                ccc = uc_combining_class (uc);
136
21.0M
              }
137
4.63M
            else
138
4.63M
              {
139
                /* End of string reached.  */
140
4.63M
                uc = 0;
141
4.63M
                ccc = 0;
142
4.63M
              }
143
144
25.6M
            if (ccc == 0)
145
22.7M
              {
146
22.7M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
22.7M
                if (sortbuf_count > 1)
151
1.48M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.48M
                                                           sortbuf + sortbuf_count);
153
154
22.7M
                if (composer != NULL)
155
22.7M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
22.7M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
18.1M
                      {
178
20.9M
                        for (j = 1; j < sortbuf_count; )
179
2.85M
                          {
180
2.85M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.51M
                              {
182
1.51M
                                ucs4_t combined =
183
1.51M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.51M
                                if (combined)
185
1.47M
                                  {
186
1.47M
                                    size_t k;
187
188
1.47M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.26M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.79M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.47M
                                    sortbuf_count--;
193
1.47M
                                    continue;
194
1.47M
                                  }
195
1.51M
                              }
196
1.38M
                            j++;
197
1.38M
                          }
198
18.1M
                        if (s < s_end && sortbuf_count == 1)
199
13.4M
                          {
200
13.4M
                            ucs4_t combined =
201
13.4M
                              composer (sortbuf[0].code, uc);
202
13.4M
                            if (combined)
203
7.86k
                              {
204
7.86k
                                uc = combined;
205
7.86k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
7.86k
                                sortbuf_count = 0;
210
7.86k
                              }
211
13.4M
                          }
212
18.1M
                      }
213
22.7M
                  }
214
215
42.3M
                for (j = 0; j < sortbuf_count; j++)
216
19.5M
                  {
217
19.5M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
19.5M
                    if (length < allocated)
221
14.8M
                      {
222
14.8M
                        int ret =
223
14.8M
                          U_UCTOMB (result + length, muc, allocated - length);
224
14.8M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
14.8M
                        if (ret >= 0)
230
14.8M
                          {
231
14.8M
                            length += ret;
232
14.8M
                            goto done_appending;
233
14.8M
                          }
234
14.8M
                      }
235
4.65M
                    {
236
4.65M
                      size_t old_allocated = allocated;
237
4.65M
                      size_t new_allocated = 2 * old_allocated;
238
4.65M
                      if (new_allocated < 64)
239
4.63M
                        new_allocated = 64;
240
4.65M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
4.65M
                      {
243
4.65M
                        UNIT *larger_result;
244
4.65M
                        if (result == NULL)
245
4.63M
                          {
246
4.63M
                            larger_result =
247
4.63M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
4.63M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
4.63M
                          }
254
12.5k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
12.5k
                        else
266
12.5k
                          {
267
12.5k
                            larger_result =
268
12.5k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
12.5k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
12.5k
                          }
275
4.65M
                        result = larger_result;
276
4.65M
                        allocated = new_allocated;
277
4.65M
                        {
278
4.65M
                          int ret =
279
4.65M
                            U_UCTOMB (result + length, muc, allocated - length);
280
4.65M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
4.65M
                          if (ret < 0)
286
0
                            abort ();
287
4.65M
                          length += ret;
288
4.65M
                          goto done_appending;
289
4.65M
                        }
290
4.65M
                      }
291
4.65M
                    }
292
19.5M
                   done_appending: ;
293
19.5M
                  }
294
295
                /* sortbuf is now empty.  */
296
22.7M
                sortbuf_count = 0;
297
22.7M
              }
298
299
25.6M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
4.63M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
21.0M
            if (sortbuf_count == sortbuf_allocated)
305
1.08k
              {
306
1.08k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.08k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.08k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.08k
                new_sortbuf =
312
1.08k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.08k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.08k
                memcpy (new_sortbuf, sortbuf,
319
1.08k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.08k
                if (sortbuf != sortbuf_preallocated)
321
1.08k
                  free (sortbuf);
322
1.08k
                sortbuf = new_sortbuf;
323
1.08k
              }
324
21.0M
            sortbuf[sortbuf_count].code = uc;
325
21.0M
            sortbuf[sortbuf_count].ccc = ccc;
326
21.0M
            sortbuf_count++;
327
328
21.0M
            i++;
329
21.0M
          }
330
331
20.1M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
4.63M
          break;
334
335
15.5M
        s += count;
336
15.5M
      }
337
4.63M
  }
338
339
4.63M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
4.63M
  else if (result != resultbuf && length < allocated)
353
4.63M
    {
354
      /* Shrink the allocated memory if possible.  */
355
4.63M
      UNIT *memory;
356
357
4.63M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
4.63M
      if (memory != NULL)
359
4.63M
        result = memory;
360
4.63M
    }
361
362
4.63M
  if (sortbuf_count > 0)
363
0
    abort ();
364
4.63M
  if (sortbuf != sortbuf_preallocated)
365
4.63M
    free (sortbuf);
366
367
4.63M
  *lengthp = length;
368
4.63M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
4.63M
}
u32_normalize
Line
Count
Source
21
9.65M
{
22
9.65M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
9.65M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
9.65M
  UNIT *result;
27
9.65M
  size_t length;
28
9.65M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
9.65M
  #define SORTBUF_PREALLOCATED 64
31
9.65M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
9.65M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
9.65M
  size_t sortbuf_allocated;
34
9.65M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
9.65M
  if (resultbuf == NULL)
38
9.65M
    {
39
9.65M
      result = NULL;
40
9.65M
      allocated = 0;
41
9.65M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
9.65M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
9.65M
  sortbuf = sortbuf_preallocated;
51
9.65M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
9.65M
  sortbuf_count = 0;
53
54
9.65M
  {
55
9.65M
    const UNIT *s_end = s + n;
56
57
9.65M
    for (;;)
58
23.5M
      {
59
23.5M
        int count;
60
23.5M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
23.5M
        int decomposed_count;
62
23.5M
        int i;
63
64
23.5M
        if (s < s_end)
65
13.8M
          {
66
            /* Fetch the next character.  */
67
13.8M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
13.8M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
13.8M
            {
77
13.8M
              int curr;
78
79
27.7M
              for (curr = 0; curr < decomposed_count; )
80
13.9M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
13.9M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
13.9M
                  int curr_decomposed_count;
85
86
13.9M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
13.9M
                  if (curr_decomposed_count >= 0)
88
35.7k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
35.7k
                      int shift = curr_decomposed_count - 1;
93
94
35.7k
                      if (shift < 0)
95
0
                        abort ();
96
35.7k
                      if (shift > 0)
97
35.3k
                        {
98
35.3k
                          int j;
99
100
35.3k
                          decomposed_count += shift;
101
35.3k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
45.5k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
10.2k
                            decomposed[j + shift] = decomposed[j];
105
35.3k
                        }
106
106k
                      for (; shift >= 0; shift--)
107
71.0k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
35.7k
                    }
109
13.8M
                  else
110
13.8M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
13.8M
                      curr++;
113
13.8M
                    }
114
13.9M
                }
115
13.8M
            }
116
13.8M
          }
117
9.65M
        else
118
9.65M
          {
119
9.65M
            count = 0;
120
9.65M
            decomposed_count = 0;
121
9.65M
          }
122
123
23.5M
        i = 0;
124
23.5M
        for (;;)
125
37.3M
          {
126
37.3M
            ucs4_t uc;
127
37.3M
            int ccc;
128
129
37.3M
            if (s < s_end)
130
27.7M
              {
131
                /* Fetch the next character from the decomposition.  */
132
27.7M
                if (i == decomposed_count)
133
13.8M
                  break;
134
13.8M
                uc = decomposed[i];
135
13.8M
                ccc = uc_combining_class (uc);
136
13.8M
              }
137
9.65M
            else
138
9.65M
              {
139
                /* End of string reached.  */
140
9.65M
                uc = 0;
141
9.65M
                ccc = 0;
142
9.65M
              }
143
144
23.5M
            if (ccc == 0)
145
23.4M
              {
146
23.4M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
23.4M
                if (sortbuf_count > 1)
151
26.1k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
26.1k
                                                           sortbuf + sortbuf_count);
153
154
23.4M
                if (composer != NULL)
155
23.4M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
23.4M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
13.7M
                      {
178
13.8M
                        for (j = 1; j < sortbuf_count; )
179
100k
                          {
180
100k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
36.8k
                              {
182
36.8k
                                ucs4_t combined =
183
36.8k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
36.8k
                                if (combined)
185
22.2k
                                  {
186
22.2k
                                    size_t k;
187
188
22.2k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
34.4k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
12.1k
                                      sortbuf[k - 1] = sortbuf[k];
192
22.2k
                                    sortbuf_count--;
193
22.2k
                                    continue;
194
22.2k
                                  }
195
36.8k
                              }
196
78.5k
                            j++;
197
78.5k
                          }
198
13.7M
                        if (s < s_end && sortbuf_count == 1)
199
4.21M
                          {
200
4.21M
                            ucs4_t combined =
201
4.21M
                              composer (sortbuf[0].code, uc);
202
4.21M
                            if (combined)
203
11.1k
                              {
204
11.1k
                                uc = combined;
205
11.1k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
11.1k
                                sortbuf_count = 0;
210
11.1k
                              }
211
4.21M
                          }
212
13.7M
                      }
213
23.4M
                  }
214
215
37.2M
                for (j = 0; j < sortbuf_count; j++)
216
13.8M
                  {
217
13.8M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
13.8M
                    if (length < allocated)
221
4.29M
                      {
222
4.29M
                        int ret =
223
4.29M
                          U_UCTOMB (result + length, muc, allocated - length);
224
4.29M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
4.29M
                        if (ret >= 0)
230
4.29M
                          {
231
4.29M
                            length += ret;
232
4.29M
                            goto done_appending;
233
4.29M
                          }
234
4.29M
                      }
235
9.55M
                    {
236
9.55M
                      size_t old_allocated = allocated;
237
9.55M
                      size_t new_allocated = 2 * old_allocated;
238
9.55M
                      if (new_allocated < 64)
239
9.54M
                        new_allocated = 64;
240
9.55M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
9.55M
                      {
243
9.55M
                        UNIT *larger_result;
244
9.55M
                        if (result == NULL)
245
9.54M
                          {
246
9.54M
                            larger_result =
247
9.54M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
9.54M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
9.54M
                          }
254
2.80k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
2.80k
                        else
266
2.80k
                          {
267
2.80k
                            larger_result =
268
2.80k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
2.80k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
2.80k
                          }
275
9.55M
                        result = larger_result;
276
9.55M
                        allocated = new_allocated;
277
9.55M
                        {
278
9.55M
                          int ret =
279
9.55M
                            U_UCTOMB (result + length, muc, allocated - length);
280
9.55M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
9.55M
                          if (ret < 0)
286
0
                            abort ();
287
9.55M
                          length += ret;
288
9.55M
                          goto done_appending;
289
9.55M
                        }
290
9.55M
                      }
291
9.55M
                    }
292
13.8M
                   done_appending: ;
293
13.8M
                  }
294
295
                /* sortbuf is now empty.  */
296
23.4M
                sortbuf_count = 0;
297
23.4M
              }
298
299
23.5M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
9.65M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
13.8M
            if (sortbuf_count == sortbuf_allocated)
305
830
              {
306
830
                struct ucs4_with_ccc *new_sortbuf;
307
308
830
                sortbuf_allocated = 2 * sortbuf_allocated;
309
830
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
830
                new_sortbuf =
312
830
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
830
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
830
                memcpy (new_sortbuf, sortbuf,
319
830
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
830
                if (sortbuf != sortbuf_preallocated)
321
830
                  free (sortbuf);
322
830
                sortbuf = new_sortbuf;
323
830
              }
324
13.8M
            sortbuf[sortbuf_count].code = uc;
325
13.8M
            sortbuf[sortbuf_count].ccc = ccc;
326
13.8M
            sortbuf_count++;
327
328
13.8M
            i++;
329
13.8M
          }
330
331
23.5M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
9.65M
          break;
334
335
13.8M
        s += count;
336
13.8M
      }
337
9.65M
  }
338
339
9.65M
  if (length == 0)
340
107k
    {
341
107k
      if (result == NULL)
342
107k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
107k
          result = (UNIT *) malloc (1);
345
107k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
107k
        }
351
107k
    }
352
9.54M
  else if (result != resultbuf && length < allocated)
353
9.54M
    {
354
      /* Shrink the allocated memory if possible.  */
355
9.54M
      UNIT *memory;
356
357
9.54M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
9.54M
      if (memory != NULL)
359
9.54M
        result = memory;
360
9.54M
    }
361
362
9.65M
  if (sortbuf_count > 0)
363
0
    abort ();
364
9.65M
  if (sortbuf != sortbuf_preallocated)
365
9.65M
    free (sortbuf);
366
367
9.65M
  *lengthp = length;
368
9.65M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
9.65M
}