Coverage Report

Created: 2025-11-02 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
13.5M
{
22
13.5M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
13.5M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
13.5M
  UNIT *result;
27
13.5M
  size_t length;
28
13.5M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
13.5M
  #define SORTBUF_PREALLOCATED 64
31
13.5M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
13.5M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
13.5M
  size_t sortbuf_allocated;
34
13.5M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
13.5M
  if (resultbuf == NULL)
38
13.5M
    {
39
13.5M
      result = NULL;
40
13.5M
      allocated = 0;
41
13.5M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
13.5M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
13.5M
  sortbuf = sortbuf_preallocated;
51
13.5M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
13.5M
  sortbuf_count = 0;
53
54
13.5M
  {
55
13.5M
    const UNIT *s_end = s + n;
56
57
13.5M
    for (;;)
58
41.9M
      {
59
41.9M
        int count;
60
41.9M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
41.9M
        int decomposed_count;
62
41.9M
        int i;
63
64
41.9M
        if (s < s_end)
65
28.3M
          {
66
            /* Fetch the next character.  */
67
28.3M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
28.3M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
28.3M
            {
77
28.3M
              int curr;
78
79
65.3M
              for (curr = 0; curr < decomposed_count; )
80
37.0M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
37.0M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
37.0M
                  int curr_decomposed_count;
85
86
37.0M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
37.0M
                  if (curr_decomposed_count >= 0)
88
3.01M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.01M
                      int shift = curr_decomposed_count - 1;
93
94
3.01M
                      if (shift < 0)
95
0
                        abort ();
96
3.01M
                      if (shift > 0)
97
2.61M
                        {
98
2.61M
                          int j;
99
100
2.61M
                          decomposed_count += shift;
101
2.61M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.64M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
29.1k
                            decomposed[j + shift] = decomposed[j];
105
2.61M
                        }
106
11.7M
                      for (; shift >= 0; shift--)
107
8.72M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.01M
                    }
109
34.0M
                  else
110
34.0M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
34.0M
                      curr++;
113
34.0M
                    }
114
37.0M
                }
115
28.3M
            }
116
28.3M
          }
117
13.5M
        else
118
13.5M
          {
119
13.5M
            count = 0;
120
13.5M
            decomposed_count = 0;
121
13.5M
          }
122
123
41.9M
        i = 0;
124
41.9M
        for (;;)
125
75.9M
          {
126
75.9M
            ucs4_t uc;
127
75.9M
            int ccc;
128
129
75.9M
            if (s < s_end)
130
62.3M
              {
131
                /* Fetch the next character from the decomposition.  */
132
62.3M
                if (i == decomposed_count)
133
28.3M
                  break;
134
34.0M
                uc = decomposed[i];
135
34.0M
                ccc = uc_combining_class (uc);
136
34.0M
              }
137
13.5M
            else
138
13.5M
              {
139
                /* End of string reached.  */
140
13.5M
                uc = 0;
141
13.5M
                ccc = 0;
142
13.5M
              }
143
144
47.6M
            if (ccc == 0)
145
44.6M
              {
146
44.6M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
44.6M
                if (sortbuf_count > 1)
151
1.52M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.52M
                                                           sortbuf + sortbuf_count);
153
154
44.6M
                if (composer != NULL)
155
44.6M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
44.6M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
31.1M
                      {
178
34.0M
                        for (j = 1; j < sortbuf_count; )
179
2.89M
                          {
180
2.89M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.56M
                              {
182
1.56M
                                ucs4_t combined =
183
1.56M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.56M
                                if (combined)
185
1.51M
                                  {
186
1.51M
                                    size_t k;
187
188
1.51M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.26M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.74M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.51M
                                    sortbuf_count--;
193
1.51M
                                    continue;
194
1.51M
                                  }
195
1.56M
                              }
196
1.38M
                            j++;
197
1.38M
                          }
198
31.1M
                        if (s < s_end && sortbuf_count == 1)
199
17.6M
                          {
200
17.6M
                            ucs4_t combined =
201
17.6M
                              composer (sortbuf[0].code, uc);
202
17.6M
                            if (combined)
203
16.3k
                              {
204
16.3k
                                uc = combined;
205
16.3k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
16.3k
                                sortbuf_count = 0;
210
16.3k
                              }
211
17.6M
                          }
212
31.1M
                      }
213
44.6M
                  }
214
215
77.2M
                for (j = 0; j < sortbuf_count; j++)
216
32.5M
                  {
217
32.5M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
32.5M
                    if (length < allocated)
221
19.0M
                      {
222
19.0M
                        int ret =
223
19.0M
                          U_UCTOMB (result + length, muc, allocated - length);
224
19.0M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
19.0M
                        if (ret >= 0)
230
19.0M
                          {
231
19.0M
                            length += ret;
232
19.0M
                            goto done_appending;
233
19.0M
                          }
234
19.0M
                      }
235
13.4M
                    {
236
13.4M
                      size_t old_allocated = allocated;
237
13.4M
                      size_t new_allocated = 2 * old_allocated;
238
13.4M
                      if (new_allocated < 64)
239
13.4M
                        new_allocated = 64;
240
13.4M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
13.4M
                      {
243
13.4M
                        UNIT *larger_result;
244
13.4M
                        if (result == NULL)
245
13.4M
                          {
246
13.4M
                            larger_result =
247
13.4M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
13.4M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
13.4M
                          }
254
15.6k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
15.6k
                        else
266
15.6k
                          {
267
15.6k
                            larger_result =
268
15.6k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
15.6k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
15.6k
                          }
275
13.4M
                        result = larger_result;
276
13.4M
                        allocated = new_allocated;
277
13.4M
                        {
278
13.4M
                          int ret =
279
13.4M
                            U_UCTOMB (result + length, muc, allocated - length);
280
13.4M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
13.4M
                          if (ret < 0)
286
0
                            abort ();
287
13.4M
                          length += ret;
288
13.4M
                          goto done_appending;
289
13.4M
                        }
290
13.4M
                      }
291
13.4M
                    }
292
32.5M
                   done_appending: ;
293
32.5M
                  }
294
295
                /* sortbuf is now empty.  */
296
44.6M
                sortbuf_count = 0;
297
44.6M
              }
298
299
47.6M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
13.5M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
34.0M
            if (sortbuf_count == sortbuf_allocated)
305
1.86k
              {
306
1.86k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.86k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.86k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.86k
                new_sortbuf =
312
1.86k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.86k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.86k
                memcpy (new_sortbuf, sortbuf,
319
1.86k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.86k
                if (sortbuf != sortbuf_preallocated)
321
1.86k
                  free (sortbuf);
322
1.86k
                sortbuf = new_sortbuf;
323
1.86k
              }
324
34.0M
            sortbuf[sortbuf_count].code = uc;
325
34.0M
            sortbuf[sortbuf_count].ccc = ccc;
326
34.0M
            sortbuf_count++;
327
328
34.0M
            i++;
329
34.0M
          }
330
331
41.9M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
13.5M
          break;
334
335
28.3M
        s += count;
336
28.3M
      }
337
13.5M
  }
338
339
13.5M
  if (length == 0)
340
112k
    {
341
112k
      if (result == NULL)
342
112k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
112k
          result = (UNIT *) malloc (1);
345
112k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
112k
        }
351
112k
    }
352
13.4M
  else if (result != resultbuf && length < allocated)
353
13.4M
    {
354
      /* Shrink the allocated memory if possible.  */
355
13.4M
      UNIT *memory;
356
357
13.4M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
13.4M
      if (memory != NULL)
359
13.4M
        result = memory;
360
13.4M
    }
361
362
13.5M
  if (sortbuf_count > 0)
363
0
    abort ();
364
13.5M
  if (sortbuf != sortbuf_preallocated)
365
13.5M
    free (sortbuf);
366
367
13.5M
  *lengthp = length;
368
13.5M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
13.5M
}
u8_normalize
Line
Count
Source
21
4.42M
{
22
4.42M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
4.42M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
4.42M
  UNIT *result;
27
4.42M
  size_t length;
28
4.42M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
4.42M
  #define SORTBUF_PREALLOCATED 64
31
4.42M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
4.42M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
4.42M
  size_t sortbuf_allocated;
34
4.42M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
4.42M
  if (resultbuf == NULL)
38
4.42M
    {
39
4.42M
      result = NULL;
40
4.42M
      allocated = 0;
41
4.42M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
4.42M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
4.42M
  sortbuf = sortbuf_preallocated;
51
4.42M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
4.42M
  sortbuf_count = 0;
53
54
4.42M
  {
55
4.42M
    const UNIT *s_end = s + n;
56
57
4.42M
    for (;;)
58
19.6M
      {
59
19.6M
        int count;
60
19.6M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
19.6M
        int decomposed_count;
62
19.6M
        int i;
63
64
19.6M
        if (s < s_end)
65
15.1M
          {
66
            /* Fetch the next character.  */
67
15.1M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
15.1M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
15.1M
            {
77
15.1M
              int curr;
78
79
39.0M
              for (curr = 0; curr < decomposed_count; )
80
23.8M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
23.8M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
23.8M
                  int curr_decomposed_count;
85
86
23.8M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
23.8M
                  if (curr_decomposed_count >= 0)
88
2.98M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
2.98M
                      int shift = curr_decomposed_count - 1;
93
94
2.98M
                      if (shift < 0)
95
0
                        abort ();
96
2.98M
                      if (shift > 0)
97
2.57M
                        {
98
2.57M
                          int j;
99
100
2.57M
                          decomposed_count += shift;
101
2.57M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.59M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
19.2k
                            decomposed[j + shift] = decomposed[j];
105
2.57M
                        }
106
11.6M
                      for (; shift >= 0; shift--)
107
8.65M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
2.98M
                    }
109
20.8M
                  else
110
20.8M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
20.8M
                      curr++;
113
20.8M
                    }
114
23.8M
                }
115
15.1M
            }
116
15.1M
          }
117
4.42M
        else
118
4.42M
          {
119
4.42M
            count = 0;
120
4.42M
            decomposed_count = 0;
121
4.42M
          }
122
123
19.6M
        i = 0;
124
19.6M
        for (;;)
125
40.4M
          {
126
40.4M
            ucs4_t uc;
127
40.4M
            int ccc;
128
129
40.4M
            if (s < s_end)
130
36.0M
              {
131
                /* Fetch the next character from the decomposition.  */
132
36.0M
                if (i == decomposed_count)
133
15.1M
                  break;
134
20.8M
                uc = decomposed[i];
135
20.8M
                ccc = uc_combining_class (uc);
136
20.8M
              }
137
4.42M
            else
138
4.42M
              {
139
                /* End of string reached.  */
140
4.42M
                uc = 0;
141
4.42M
                ccc = 0;
142
4.42M
              }
143
144
25.2M
            if (ccc == 0)
145
22.4M
              {
146
22.4M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
22.4M
                if (sortbuf_count > 1)
151
1.50M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.50M
                                                           sortbuf + sortbuf_count);
153
154
22.4M
                if (composer != NULL)
155
22.4M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
22.4M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
18.0M
                      {
178
20.8M
                        for (j = 1; j < sortbuf_count; )
179
2.79M
                          {
180
2.79M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.52M
                              {
182
1.52M
                                ucs4_t combined =
183
1.52M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.52M
                                if (combined)
185
1.48M
                                  {
186
1.48M
                                    size_t k;
187
188
1.48M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.22M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.73M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.48M
                                    sortbuf_count--;
193
1.48M
                                    continue;
194
1.48M
                                  }
195
1.52M
                              }
196
1.30M
                            j++;
197
1.30M
                          }
198
18.0M
                        if (s < s_end && sortbuf_count == 1)
199
13.5M
                          {
200
13.5M
                            ucs4_t combined =
201
13.5M
                              composer (sortbuf[0].code, uc);
202
13.5M
                            if (combined)
203
7.34k
                              {
204
7.34k
                                uc = combined;
205
7.34k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
7.34k
                                sortbuf_count = 0;
210
7.34k
                              }
211
13.5M
                          }
212
18.0M
                      }
213
22.4M
                  }
214
215
41.8M
                for (j = 0; j < sortbuf_count; j++)
216
19.3M
                  {
217
19.3M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
19.3M
                    if (length < allocated)
221
14.9M
                      {
222
14.9M
                        int ret =
223
14.9M
                          U_UCTOMB (result + length, muc, allocated - length);
224
14.9M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
14.9M
                        if (ret >= 0)
230
14.9M
                          {
231
14.9M
                            length += ret;
232
14.9M
                            goto done_appending;
233
14.9M
                          }
234
14.9M
                      }
235
4.43M
                    {
236
4.43M
                      size_t old_allocated = allocated;
237
4.43M
                      size_t new_allocated = 2 * old_allocated;
238
4.43M
                      if (new_allocated < 64)
239
4.42M
                        new_allocated = 64;
240
4.43M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
4.43M
                      {
243
4.43M
                        UNIT *larger_result;
244
4.43M
                        if (result == NULL)
245
4.42M
                          {
246
4.42M
                            larger_result =
247
4.42M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
4.42M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
4.42M
                          }
254
12.6k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
12.6k
                        else
266
12.6k
                          {
267
12.6k
                            larger_result =
268
12.6k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
12.6k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
12.6k
                          }
275
4.43M
                        result = larger_result;
276
4.43M
                        allocated = new_allocated;
277
4.43M
                        {
278
4.43M
                          int ret =
279
4.43M
                            U_UCTOMB (result + length, muc, allocated - length);
280
4.43M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
4.43M
                          if (ret < 0)
286
0
                            abort ();
287
4.43M
                          length += ret;
288
4.43M
                          goto done_appending;
289
4.43M
                        }
290
4.43M
                      }
291
4.43M
                    }
292
19.3M
                   done_appending: ;
293
19.3M
                  }
294
295
                /* sortbuf is now empty.  */
296
22.4M
                sortbuf_count = 0;
297
22.4M
              }
298
299
25.2M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
4.42M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
20.8M
            if (sortbuf_count == sortbuf_allocated)
305
1.03k
              {
306
1.03k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.03k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.03k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.03k
                new_sortbuf =
312
1.03k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.03k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.03k
                memcpy (new_sortbuf, sortbuf,
319
1.03k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.03k
                if (sortbuf != sortbuf_preallocated)
321
1.03k
                  free (sortbuf);
322
1.03k
                sortbuf = new_sortbuf;
323
1.03k
              }
324
20.8M
            sortbuf[sortbuf_count].code = uc;
325
20.8M
            sortbuf[sortbuf_count].ccc = ccc;
326
20.8M
            sortbuf_count++;
327
328
20.8M
            i++;
329
20.8M
          }
330
331
19.6M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
4.42M
          break;
334
335
15.1M
        s += count;
336
15.1M
      }
337
4.42M
  }
338
339
4.42M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
4.42M
  else if (result != resultbuf && length < allocated)
353
4.42M
    {
354
      /* Shrink the allocated memory if possible.  */
355
4.42M
      UNIT *memory;
356
357
4.42M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
4.42M
      if (memory != NULL)
359
4.42M
        result = memory;
360
4.42M
    }
361
362
4.42M
  if (sortbuf_count > 0)
363
0
    abort ();
364
4.42M
  if (sortbuf != sortbuf_preallocated)
365
4.42M
    free (sortbuf);
366
367
4.42M
  *lengthp = length;
368
4.42M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
4.42M
}
u32_normalize
Line
Count
Source
21
9.15M
{
22
9.15M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
9.15M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
9.15M
  UNIT *result;
27
9.15M
  size_t length;
28
9.15M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
9.15M
  #define SORTBUF_PREALLOCATED 64
31
9.15M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
9.15M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
9.15M
  size_t sortbuf_allocated;
34
9.15M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
9.15M
  if (resultbuf == NULL)
38
9.15M
    {
39
9.15M
      result = NULL;
40
9.15M
      allocated = 0;
41
9.15M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
9.15M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
9.15M
  sortbuf = sortbuf_preallocated;
51
9.15M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
9.15M
  sortbuf_count = 0;
53
54
9.15M
  {
55
9.15M
    const UNIT *s_end = s + n;
56
57
9.15M
    for (;;)
58
22.3M
      {
59
22.3M
        int count;
60
22.3M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
22.3M
        int decomposed_count;
62
22.3M
        int i;
63
64
22.3M
        if (s < s_end)
65
13.1M
          {
66
            /* Fetch the next character.  */
67
13.1M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
13.1M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
13.1M
            {
77
13.1M
              int curr;
78
79
26.3M
              for (curr = 0; curr < decomposed_count; )
80
13.2M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
13.2M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
13.2M
                  int curr_decomposed_count;
85
86
13.2M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
13.2M
                  if (curr_decomposed_count >= 0)
88
34.0k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
34.0k
                      int shift = curr_decomposed_count - 1;
93
94
34.0k
                      if (shift < 0)
95
0
                        abort ();
96
34.0k
                      if (shift > 0)
97
33.6k
                        {
98
33.6k
                          int j;
99
100
33.6k
                          decomposed_count += shift;
101
33.6k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
43.5k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
9.87k
                            decomposed[j + shift] = decomposed[j];
105
33.6k
                        }
106
101k
                      for (; shift >= 0; shift--)
107
67.6k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
34.0k
                    }
109
13.1M
                  else
110
13.1M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
13.1M
                      curr++;
113
13.1M
                    }
114
13.2M
                }
115
13.1M
            }
116
13.1M
          }
117
9.15M
        else
118
9.15M
          {
119
9.15M
            count = 0;
120
9.15M
            decomposed_count = 0;
121
9.15M
          }
122
123
22.3M
        i = 0;
124
22.3M
        for (;;)
125
35.4M
          {
126
35.4M
            ucs4_t uc;
127
35.4M
            int ccc;
128
129
35.4M
            if (s < s_end)
130
26.3M
              {
131
                /* Fetch the next character from the decomposition.  */
132
26.3M
                if (i == decomposed_count)
133
13.1M
                  break;
134
13.1M
                uc = decomposed[i];
135
13.1M
                ccc = uc_combining_class (uc);
136
13.1M
              }
137
9.15M
            else
138
9.15M
              {
139
                /* End of string reached.  */
140
9.15M
                uc = 0;
141
9.15M
                ccc = 0;
142
9.15M
              }
143
144
22.3M
            if (ccc == 0)
145
22.2M
              {
146
22.2M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
22.2M
                if (sortbuf_count > 1)
151
26.3k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
26.3k
                                                           sortbuf + sortbuf_count);
153
154
22.2M
                if (composer != NULL)
155
22.2M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
22.2M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
13.0M
                      {
178
13.1M
                        for (j = 1; j < sortbuf_count; )
179
100k
                          {
180
100k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
37.6k
                              {
182
37.6k
                                ucs4_t combined =
183
37.6k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
37.6k
                                if (combined)
185
22.8k
                                  {
186
22.8k
                                    size_t k;
187
188
22.8k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
36.4k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
13.6k
                                      sortbuf[k - 1] = sortbuf[k];
192
22.8k
                                    sortbuf_count--;
193
22.8k
                                    continue;
194
22.8k
                                  }
195
37.6k
                              }
196
77.5k
                            j++;
197
77.5k
                          }
198
13.0M
                        if (s < s_end && sortbuf_count == 1)
199
4.02M
                          {
200
4.02M
                            ucs4_t combined =
201
4.02M
                              composer (sortbuf[0].code, uc);
202
4.02M
                            if (combined)
203
9.00k
                              {
204
9.00k
                                uc = combined;
205
9.00k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
9.00k
                                sortbuf_count = 0;
210
9.00k
                              }
211
4.02M
                          }
212
13.0M
                      }
213
22.2M
                  }
214
215
35.3M
                for (j = 0; j < sortbuf_count; j++)
216
13.1M
                  {
217
13.1M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
13.1M
                    if (length < allocated)
221
4.10M
                      {
222
4.10M
                        int ret =
223
4.10M
                          U_UCTOMB (result + length, muc, allocated - length);
224
4.10M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
4.10M
                        if (ret >= 0)
230
4.10M
                          {
231
4.10M
                            length += ret;
232
4.10M
                            goto done_appending;
233
4.10M
                          }
234
4.10M
                      }
235
9.04M
                    {
236
9.04M
                      size_t old_allocated = allocated;
237
9.04M
                      size_t new_allocated = 2 * old_allocated;
238
9.04M
                      if (new_allocated < 64)
239
9.04M
                        new_allocated = 64;
240
9.04M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
9.04M
                      {
243
9.04M
                        UNIT *larger_result;
244
9.04M
                        if (result == NULL)
245
9.04M
                          {
246
9.04M
                            larger_result =
247
9.04M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
9.04M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
9.04M
                          }
254
2.96k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
2.96k
                        else
266
2.96k
                          {
267
2.96k
                            larger_result =
268
2.96k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
2.96k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
2.96k
                          }
275
9.04M
                        result = larger_result;
276
9.04M
                        allocated = new_allocated;
277
9.04M
                        {
278
9.04M
                          int ret =
279
9.04M
                            U_UCTOMB (result + length, muc, allocated - length);
280
9.04M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
9.04M
                          if (ret < 0)
286
0
                            abort ();
287
9.04M
                          length += ret;
288
9.04M
                          goto done_appending;
289
9.04M
                        }
290
9.04M
                      }
291
9.04M
                    }
292
13.1M
                   done_appending: ;
293
13.1M
                  }
294
295
                /* sortbuf is now empty.  */
296
22.2M
                sortbuf_count = 0;
297
22.2M
              }
298
299
22.3M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
9.15M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
13.1M
            if (sortbuf_count == sortbuf_allocated)
305
825
              {
306
825
                struct ucs4_with_ccc *new_sortbuf;
307
308
825
                sortbuf_allocated = 2 * sortbuf_allocated;
309
825
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
825
                new_sortbuf =
312
825
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
825
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
825
                memcpy (new_sortbuf, sortbuf,
319
825
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
825
                if (sortbuf != sortbuf_preallocated)
321
825
                  free (sortbuf);
322
825
                sortbuf = new_sortbuf;
323
825
              }
324
13.1M
            sortbuf[sortbuf_count].code = uc;
325
13.1M
            sortbuf[sortbuf_count].ccc = ccc;
326
13.1M
            sortbuf_count++;
327
328
13.1M
            i++;
329
13.1M
          }
330
331
22.3M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
9.15M
          break;
334
335
13.1M
        s += count;
336
13.1M
      }
337
9.15M
  }
338
339
9.15M
  if (length == 0)
340
112k
    {
341
112k
      if (result == NULL)
342
112k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
112k
          result = (UNIT *) malloc (1);
345
112k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
112k
        }
351
112k
    }
352
9.04M
  else if (result != resultbuf && length < allocated)
353
9.04M
    {
354
      /* Shrink the allocated memory if possible.  */
355
9.04M
      UNIT *memory;
356
357
9.04M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
9.04M
      if (memory != NULL)
359
9.04M
        result = memory;
360
9.04M
    }
361
362
9.15M
  if (sortbuf_count > 0)
363
0
    abort ();
364
9.15M
  if (sortbuf != sortbuf_preallocated)
365
9.15M
    free (sortbuf);
366
367
9.15M
  *lengthp = length;
368
9.15M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
9.15M
}