Coverage Report

Created: 2025-10-26 06:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
11.9M
{
22
11.9M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
11.9M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
11.9M
  UNIT *result;
27
11.9M
  size_t length;
28
11.9M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
11.9M
  #define SORTBUF_PREALLOCATED 64
31
11.9M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
11.9M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
11.9M
  size_t sortbuf_allocated;
34
11.9M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
11.9M
  if (resultbuf == NULL)
38
11.9M
    {
39
11.9M
      result = NULL;
40
11.9M
      allocated = 0;
41
11.9M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
11.9M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
11.9M
  sortbuf = sortbuf_preallocated;
51
11.9M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
11.9M
  sortbuf_count = 0;
53
54
11.9M
  {
55
11.9M
    const UNIT *s_end = s + n;
56
57
11.9M
    for (;;)
58
37.5M
      {
59
37.5M
        int count;
60
37.5M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
37.5M
        int decomposed_count;
62
37.5M
        int i;
63
64
37.5M
        if (s < s_end)
65
25.6M
          {
66
            /* Fetch the next character.  */
67
25.6M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
25.6M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
25.6M
            {
77
25.6M
              int curr;
78
79
59.7M
              for (curr = 0; curr < decomposed_count; )
80
34.1M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
34.1M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
34.1M
                  int curr_decomposed_count;
85
86
34.1M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
34.1M
                  if (curr_decomposed_count >= 0)
88
2.96M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
2.96M
                      int shift = curr_decomposed_count - 1;
93
94
2.96M
                      if (shift < 0)
95
0
                        abort ();
96
2.96M
                      if (shift > 0)
97
2.53M
                        {
98
2.53M
                          int j;
99
100
2.53M
                          decomposed_count += shift;
101
2.53M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.56M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
27.7k
                            decomposed[j + shift] = decomposed[j];
105
2.53M
                        }
106
11.5M
                      for (; shift >= 0; shift--)
107
8.55M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
2.96M
                    }
109
31.1M
                  else
110
31.1M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
31.1M
                      curr++;
113
31.1M
                    }
114
34.1M
                }
115
25.6M
            }
116
25.6M
          }
117
11.9M
        else
118
11.9M
          {
119
11.9M
            count = 0;
120
11.9M
            decomposed_count = 0;
121
11.9M
          }
122
123
37.5M
        i = 0;
124
37.5M
        for (;;)
125
68.7M
          {
126
68.7M
            ucs4_t uc;
127
68.7M
            int ccc;
128
129
68.7M
            if (s < s_end)
130
56.7M
              {
131
                /* Fetch the next character from the decomposition.  */
132
56.7M
                if (i == decomposed_count)
133
25.6M
                  break;
134
31.1M
                uc = decomposed[i];
135
31.1M
                ccc = uc_combining_class (uc);
136
31.1M
              }
137
11.9M
            else
138
11.9M
              {
139
                /* End of string reached.  */
140
11.9M
                uc = 0;
141
11.9M
                ccc = 0;
142
11.9M
              }
143
144
43.1M
            if (ccc == 0)
145
40.2M
              {
146
40.2M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
40.2M
                if (sortbuf_count > 1)
151
1.43M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.43M
                                                           sortbuf + sortbuf_count);
153
154
40.2M
                if (composer != NULL)
155
40.2M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
40.2M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
28.3M
                      {
178
31.1M
                        for (j = 1; j < sortbuf_count; )
179
2.83M
                          {
180
2.83M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.46M
                              {
182
1.46M
                                ucs4_t combined =
183
1.46M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.46M
                                if (combined)
185
1.41M
                                  {
186
1.41M
                                    size_t k;
187
188
1.41M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.18M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.77M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.41M
                                    sortbuf_count--;
193
1.41M
                                    continue;
194
1.41M
                                  }
195
1.46M
                              }
196
1.42M
                            j++;
197
1.42M
                          }
198
28.3M
                        if (s < s_end && sortbuf_count == 1)
199
16.4M
                          {
200
16.4M
                            ucs4_t combined =
201
16.4M
                              composer (sortbuf[0].code, uc);
202
16.4M
                            if (combined)
203
16.2k
                              {
204
16.2k
                                uc = combined;
205
16.2k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
16.2k
                                sortbuf_count = 0;
210
16.2k
                              }
211
16.4M
                          }
212
28.3M
                      }
213
40.2M
                  }
214
215
70.0M
                for (j = 0; j < sortbuf_count; j++)
216
29.7M
                  {
217
29.7M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
29.7M
                    if (length < allocated)
221
17.9M
                      {
222
17.9M
                        int ret =
223
17.9M
                          U_UCTOMB (result + length, muc, allocated - length);
224
17.9M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
17.9M
                        if (ret >= 0)
230
17.9M
                          {
231
17.9M
                            length += ret;
232
17.9M
                            goto done_appending;
233
17.9M
                          }
234
17.9M
                      }
235
11.8M
                    {
236
11.8M
                      size_t old_allocated = allocated;
237
11.8M
                      size_t new_allocated = 2 * old_allocated;
238
11.8M
                      if (new_allocated < 64)
239
11.8M
                        new_allocated = 64;
240
11.8M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
11.8M
                      {
243
11.8M
                        UNIT *larger_result;
244
11.8M
                        if (result == NULL)
245
11.8M
                          {
246
11.8M
                            larger_result =
247
11.8M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
11.8M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
11.8M
                          }
254
15.4k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
15.4k
                        else
266
15.4k
                          {
267
15.4k
                            larger_result =
268
15.4k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
15.4k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
15.4k
                          }
275
11.8M
                        result = larger_result;
276
11.8M
                        allocated = new_allocated;
277
11.8M
                        {
278
11.8M
                          int ret =
279
11.8M
                            U_UCTOMB (result + length, muc, allocated - length);
280
11.8M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
11.8M
                          if (ret < 0)
286
0
                            abort ();
287
11.8M
                          length += ret;
288
11.8M
                          goto done_appending;
289
11.8M
                        }
290
11.8M
                      }
291
11.8M
                    }
292
29.7M
                   done_appending: ;
293
29.7M
                  }
294
295
                /* sortbuf is now empty.  */
296
40.2M
                sortbuf_count = 0;
297
40.2M
              }
298
299
43.1M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
11.9M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
31.1M
            if (sortbuf_count == sortbuf_allocated)
305
1.87k
              {
306
1.87k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.87k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.87k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.87k
                new_sortbuf =
312
1.87k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.87k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.87k
                memcpy (new_sortbuf, sortbuf,
319
1.87k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.87k
                if (sortbuf != sortbuf_preallocated)
321
1.87k
                  free (sortbuf);
322
1.87k
                sortbuf = new_sortbuf;
323
1.87k
              }
324
31.1M
            sortbuf[sortbuf_count].code = uc;
325
31.1M
            sortbuf[sortbuf_count].ccc = ccc;
326
31.1M
            sortbuf_count++;
327
328
31.1M
            i++;
329
31.1M
          }
330
331
37.5M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
11.9M
          break;
334
335
25.6M
        s += count;
336
25.6M
      }
337
11.9M
  }
338
339
11.9M
  if (length == 0)
340
112k
    {
341
112k
      if (result == NULL)
342
112k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
112k
          result = (UNIT *) malloc (1);
345
112k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
112k
        }
351
112k
    }
352
11.8M
  else if (result != resultbuf && length < allocated)
353
11.8M
    {
354
      /* Shrink the allocated memory if possible.  */
355
11.8M
      UNIT *memory;
356
357
11.8M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
11.8M
      if (memory != NULL)
359
11.8M
        result = memory;
360
11.8M
    }
361
362
11.9M
  if (sortbuf_count > 0)
363
0
    abort ();
364
11.9M
  if (sortbuf != sortbuf_preallocated)
365
11.9M
    free (sortbuf);
366
367
11.9M
  *lengthp = length;
368
11.9M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
11.9M
}
u8_normalize
Line
Count
Source
21
3.87M
{
22
3.87M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
3.87M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
3.87M
  UNIT *result;
27
3.87M
  size_t length;
28
3.87M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
3.87M
  #define SORTBUF_PREALLOCATED 64
31
3.87M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
3.87M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
3.87M
  size_t sortbuf_allocated;
34
3.87M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
3.87M
  if (resultbuf == NULL)
38
3.87M
    {
39
3.87M
      result = NULL;
40
3.87M
      allocated = 0;
41
3.87M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
3.87M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
3.87M
  sortbuf = sortbuf_preallocated;
51
3.87M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
3.87M
  sortbuf_count = 0;
53
54
3.87M
  {
55
3.87M
    const UNIT *s_end = s + n;
56
57
3.87M
    for (;;)
58
17.7M
      {
59
17.7M
        int count;
60
17.7M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
17.7M
        int decomposed_count;
62
17.7M
        int i;
63
64
17.7M
        if (s < s_end)
65
13.8M
          {
66
            /* Fetch the next character.  */
67
13.8M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
13.8M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
13.8M
            {
77
13.8M
              int curr;
78
79
36.2M
              for (curr = 0; curr < decomposed_count; )
80
22.3M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
22.3M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
22.3M
                  int curr_decomposed_count;
85
86
22.3M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
22.3M
                  if (curr_decomposed_count >= 0)
88
2.92M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
2.92M
                      int shift = curr_decomposed_count - 1;
93
94
2.92M
                      if (shift < 0)
95
0
                        abort ();
96
2.92M
                      if (shift > 0)
97
2.50M
                        {
98
2.50M
                          int j;
99
100
2.50M
                          decomposed_count += shift;
101
2.50M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.52M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
18.1k
                            decomposed[j + shift] = decomposed[j];
105
2.50M
                        }
106
11.4M
                      for (; shift >= 0; shift--)
107
8.49M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
2.92M
                    }
109
19.4M
                  else
110
19.4M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
19.4M
                      curr++;
113
19.4M
                    }
114
22.3M
                }
115
13.8M
            }
116
13.8M
          }
117
3.87M
        else
118
3.87M
          {
119
3.87M
            count = 0;
120
3.87M
            decomposed_count = 0;
121
3.87M
          }
122
123
17.7M
        i = 0;
124
17.7M
        for (;;)
125
37.1M
          {
126
37.1M
            ucs4_t uc;
127
37.1M
            int ccc;
128
129
37.1M
            if (s < s_end)
130
33.2M
              {
131
                /* Fetch the next character from the decomposition.  */
132
33.2M
                if (i == decomposed_count)
133
13.8M
                  break;
134
19.4M
                uc = decomposed[i];
135
19.4M
                ccc = uc_combining_class (uc);
136
19.4M
              }
137
3.87M
            else
138
3.87M
              {
139
                /* End of string reached.  */
140
3.87M
                uc = 0;
141
3.87M
                ccc = 0;
142
3.87M
              }
143
144
23.2M
            if (ccc == 0)
145
20.5M
              {
146
20.5M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
20.5M
                if (sortbuf_count > 1)
151
1.40M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.40M
                                                           sortbuf + sortbuf_count);
153
154
20.5M
                if (composer != NULL)
155
20.5M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
20.5M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
16.6M
                      {
178
19.3M
                        for (j = 1; j < sortbuf_count; )
179
2.73M
                          {
180
2.73M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.43M
                              {
182
1.43M
                                ucs4_t combined =
183
1.43M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.43M
                                if (combined)
185
1.39M
                                  {
186
1.39M
                                    size_t k;
187
188
1.39M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.15M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.76M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.39M
                                    sortbuf_count--;
193
1.39M
                                    continue;
194
1.39M
                                  }
195
1.43M
                              }
196
1.34M
                            j++;
197
1.34M
                          }
198
16.6M
                        if (s < s_end && sortbuf_count == 1)
199
12.7M
                          {
200
12.7M
                            ucs4_t combined =
201
12.7M
                              composer (sortbuf[0].code, uc);
202
12.7M
                            if (combined)
203
7.27k
                              {
204
7.27k
                                uc = combined;
205
7.27k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
7.27k
                                sortbuf_count = 0;
210
7.27k
                              }
211
12.7M
                          }
212
16.6M
                      }
213
20.5M
                  }
214
215
38.5M
                for (j = 0; j < sortbuf_count; j++)
216
18.0M
                  {
217
18.0M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
18.0M
                    if (length < allocated)
221
14.1M
                      {
222
14.1M
                        int ret =
223
14.1M
                          U_UCTOMB (result + length, muc, allocated - length);
224
14.1M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
14.1M
                        if (ret >= 0)
230
14.1M
                          {
231
14.1M
                            length += ret;
232
14.1M
                            goto done_appending;
233
14.1M
                          }
234
14.1M
                      }
235
3.88M
                    {
236
3.88M
                      size_t old_allocated = allocated;
237
3.88M
                      size_t new_allocated = 2 * old_allocated;
238
3.88M
                      if (new_allocated < 64)
239
3.87M
                        new_allocated = 64;
240
3.88M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
3.88M
                      {
243
3.88M
                        UNIT *larger_result;
244
3.88M
                        if (result == NULL)
245
3.87M
                          {
246
3.87M
                            larger_result =
247
3.87M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
3.87M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
3.87M
                          }
254
12.4k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
12.4k
                        else
266
12.4k
                          {
267
12.4k
                            larger_result =
268
12.4k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
12.4k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
12.4k
                          }
275
3.88M
                        result = larger_result;
276
3.88M
                        allocated = new_allocated;
277
3.88M
                        {
278
3.88M
                          int ret =
279
3.88M
                            U_UCTOMB (result + length, muc, allocated - length);
280
3.88M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
3.88M
                          if (ret < 0)
286
0
                            abort ();
287
3.88M
                          length += ret;
288
3.88M
                          goto done_appending;
289
3.88M
                        }
290
3.88M
                      }
291
3.88M
                    }
292
18.0M
                   done_appending: ;
293
18.0M
                  }
294
295
                /* sortbuf is now empty.  */
296
20.5M
                sortbuf_count = 0;
297
20.5M
              }
298
299
23.2M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
3.87M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
19.4M
            if (sortbuf_count == sortbuf_allocated)
305
1.04k
              {
306
1.04k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.04k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.04k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.04k
                new_sortbuf =
312
1.04k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.04k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.04k
                memcpy (new_sortbuf, sortbuf,
319
1.04k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.04k
                if (sortbuf != sortbuf_preallocated)
321
1.04k
                  free (sortbuf);
322
1.04k
                sortbuf = new_sortbuf;
323
1.04k
              }
324
19.4M
            sortbuf[sortbuf_count].code = uc;
325
19.4M
            sortbuf[sortbuf_count].ccc = ccc;
326
19.4M
            sortbuf_count++;
327
328
19.4M
            i++;
329
19.4M
          }
330
331
17.7M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
3.87M
          break;
334
335
13.8M
        s += count;
336
13.8M
      }
337
3.87M
  }
338
339
3.87M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
3.87M
  else if (result != resultbuf && length < allocated)
353
3.87M
    {
354
      /* Shrink the allocated memory if possible.  */
355
3.87M
      UNIT *memory;
356
357
3.87M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
3.87M
      if (memory != NULL)
359
3.87M
        result = memory;
360
3.87M
    }
361
362
3.87M
  if (sortbuf_count > 0)
363
0
    abort ();
364
3.87M
  if (sortbuf != sortbuf_preallocated)
365
3.87M
    free (sortbuf);
366
367
3.87M
  *lengthp = length;
368
3.87M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
3.87M
}
u32_normalize
Line
Count
Source
21
8.08M
{
22
8.08M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
8.08M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
8.08M
  UNIT *result;
27
8.08M
  size_t length;
28
8.08M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
8.08M
  #define SORTBUF_PREALLOCATED 64
31
8.08M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
8.08M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
8.08M
  size_t sortbuf_allocated;
34
8.08M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
8.08M
  if (resultbuf == NULL)
38
8.08M
    {
39
8.08M
      result = NULL;
40
8.08M
      allocated = 0;
41
8.08M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
8.08M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
8.08M
  sortbuf = sortbuf_preallocated;
51
8.08M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
8.08M
  sortbuf_count = 0;
53
54
8.08M
  {
55
8.08M
    const UNIT *s_end = s + n;
56
57
8.08M
    for (;;)
58
19.8M
      {
59
19.8M
        int count;
60
19.8M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
19.8M
        int decomposed_count;
62
19.8M
        int i;
63
64
19.8M
        if (s < s_end)
65
11.7M
          {
66
            /* Fetch the next character.  */
67
11.7M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
11.7M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
11.7M
            {
77
11.7M
              int curr;
78
79
23.5M
              for (curr = 0; curr < decomposed_count; )
80
11.8M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
11.8M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
11.8M
                  int curr_decomposed_count;
85
86
11.8M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
11.8M
                  if (curr_decomposed_count >= 0)
88
33.6k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
33.6k
                      int shift = curr_decomposed_count - 1;
93
94
33.6k
                      if (shift < 0)
95
0
                        abort ();
96
33.6k
                      if (shift > 0)
97
33.2k
                        {
98
33.2k
                          int j;
99
100
33.2k
                          decomposed_count += shift;
101
33.2k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
42.8k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
9.62k
                            decomposed[j + shift] = decomposed[j];
105
33.2k
                        }
106
100k
                      for (; shift >= 0; shift--)
107
66.8k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
33.6k
                    }
109
11.7M
                  else
110
11.7M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
11.7M
                      curr++;
113
11.7M
                    }
114
11.8M
                }
115
11.7M
            }
116
11.7M
          }
117
8.08M
        else
118
8.08M
          {
119
8.08M
            count = 0;
120
8.08M
            decomposed_count = 0;
121
8.08M
          }
122
123
19.8M
        i = 0;
124
19.8M
        for (;;)
125
31.6M
          {
126
31.6M
            ucs4_t uc;
127
31.6M
            int ccc;
128
129
31.6M
            if (s < s_end)
130
23.5M
              {
131
                /* Fetch the next character from the decomposition.  */
132
23.5M
                if (i == decomposed_count)
133
11.7M
                  break;
134
11.7M
                uc = decomposed[i];
135
11.7M
                ccc = uc_combining_class (uc);
136
11.7M
              }
137
8.08M
            else
138
8.08M
              {
139
                /* End of string reached.  */
140
8.08M
                uc = 0;
141
8.08M
                ccc = 0;
142
8.08M
              }
143
144
19.8M
            if (ccc == 0)
145
19.7M
              {
146
19.7M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
19.7M
                if (sortbuf_count > 1)
151
25.9k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
25.9k
                                                           sortbuf + sortbuf_count);
153
154
19.7M
                if (composer != NULL)
155
19.7M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
19.7M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
11.6M
                      {
178
11.7M
                        for (j = 1; j < sortbuf_count; )
179
100k
                          {
180
100k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
37.0k
                              {
182
37.0k
                                ucs4_t combined =
183
37.0k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
37.0k
                                if (combined)
185
22.4k
                                  {
186
22.4k
                                    size_t k;
187
188
22.4k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
35.8k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
13.4k
                                      sortbuf[k - 1] = sortbuf[k];
192
22.4k
                                    sortbuf_count--;
193
22.4k
                                    continue;
194
22.4k
                                  }
195
37.0k
                              }
196
78.0k
                            j++;
197
78.0k
                          }
198
11.6M
                        if (s < s_end && sortbuf_count == 1)
199
3.68M
                          {
200
3.68M
                            ucs4_t combined =
201
3.68M
                              composer (sortbuf[0].code, uc);
202
3.68M
                            if (combined)
203
8.96k
                              {
204
8.96k
                                uc = combined;
205
8.96k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
8.96k
                                sortbuf_count = 0;
210
8.96k
                              }
211
3.68M
                          }
212
11.6M
                      }
213
19.7M
                  }
214
215
31.5M
                for (j = 0; j < sortbuf_count; j++)
216
11.7M
                  {
217
11.7M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
11.7M
                    if (length < allocated)
221
3.77M
                      {
222
3.77M
                        int ret =
223
3.77M
                          U_UCTOMB (result + length, muc, allocated - length);
224
3.77M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
3.77M
                        if (ret >= 0)
230
3.77M
                          {
231
3.77M
                            length += ret;
232
3.77M
                            goto done_appending;
233
3.77M
                          }
234
3.77M
                      }
235
7.97M
                    {
236
7.97M
                      size_t old_allocated = allocated;
237
7.97M
                      size_t new_allocated = 2 * old_allocated;
238
7.97M
                      if (new_allocated < 64)
239
7.97M
                        new_allocated = 64;
240
7.97M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
7.97M
                      {
243
7.97M
                        UNIT *larger_result;
244
7.97M
                        if (result == NULL)
245
7.97M
                          {
246
7.97M
                            larger_result =
247
7.97M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
7.97M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
7.97M
                          }
254
2.96k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
2.96k
                        else
266
2.96k
                          {
267
2.96k
                            larger_result =
268
2.96k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
2.96k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
2.96k
                          }
275
7.97M
                        result = larger_result;
276
7.97M
                        allocated = new_allocated;
277
7.97M
                        {
278
7.97M
                          int ret =
279
7.97M
                            U_UCTOMB (result + length, muc, allocated - length);
280
7.97M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
7.97M
                          if (ret < 0)
286
0
                            abort ();
287
7.97M
                          length += ret;
288
7.97M
                          goto done_appending;
289
7.97M
                        }
290
7.97M
                      }
291
7.97M
                    }
292
11.7M
                   done_appending: ;
293
11.7M
                  }
294
295
                /* sortbuf is now empty.  */
296
19.7M
                sortbuf_count = 0;
297
19.7M
              }
298
299
19.8M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
8.08M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
11.7M
            if (sortbuf_count == sortbuf_allocated)
305
829
              {
306
829
                struct ucs4_with_ccc *new_sortbuf;
307
308
829
                sortbuf_allocated = 2 * sortbuf_allocated;
309
829
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
829
                new_sortbuf =
312
829
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
829
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
829
                memcpy (new_sortbuf, sortbuf,
319
829
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
829
                if (sortbuf != sortbuf_preallocated)
321
829
                  free (sortbuf);
322
829
                sortbuf = new_sortbuf;
323
829
              }
324
11.7M
            sortbuf[sortbuf_count].code = uc;
325
11.7M
            sortbuf[sortbuf_count].ccc = ccc;
326
11.7M
            sortbuf_count++;
327
328
11.7M
            i++;
329
11.7M
          }
330
331
19.8M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
8.08M
          break;
334
335
11.7M
        s += count;
336
11.7M
      }
337
8.08M
  }
338
339
8.08M
  if (length == 0)
340
112k
    {
341
112k
      if (result == NULL)
342
112k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
112k
          result = (UNIT *) malloc (1);
345
112k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
112k
        }
351
112k
    }
352
7.97M
  else if (result != resultbuf && length < allocated)
353
7.97M
    {
354
      /* Shrink the allocated memory if possible.  */
355
7.97M
      UNIT *memory;
356
357
7.97M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
7.97M
      if (memory != NULL)
359
7.97M
        result = memory;
360
7.97M
    }
361
362
8.08M
  if (sortbuf_count > 0)
363
0
    abort ();
364
8.08M
  if (sortbuf != sortbuf_preallocated)
365
8.08M
    free (sortbuf);
366
367
8.08M
  *lengthp = length;
368
8.08M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
8.08M
}