Coverage Report

Created: 2025-07-11 06:23

/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source (jump to first uncovered line)
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
13.8M
{
22
13.8M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
13.8M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
13.8M
  UNIT *result;
27
13.8M
  size_t length;
28
13.8M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
13.8M
  #define SORTBUF_PREALLOCATED 64
31
13.8M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
13.8M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
13.8M
  size_t sortbuf_allocated;
34
13.8M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
13.8M
  if (resultbuf == NULL)
38
13.8M
    {
39
13.8M
      result = NULL;
40
13.8M
      allocated = 0;
41
13.8M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
13.8M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
13.8M
  sortbuf = sortbuf_preallocated;
51
13.8M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
13.8M
  sortbuf_count = 0;
53
54
13.8M
  {
55
13.8M
    const UNIT *s_end = s + n;
56
57
13.8M
    for (;;)
58
43.3M
      {
59
43.3M
        int count;
60
43.3M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
43.3M
        int decomposed_count;
62
43.3M
        int i;
63
64
43.3M
        if (s < s_end)
65
29.5M
          {
66
            /* Fetch the next character.  */
67
29.5M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
29.5M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
29.5M
            {
77
29.5M
              int curr;
78
79
67.2M
              for (curr = 0; curr < decomposed_count; )
80
37.7M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
37.7M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
37.7M
                  int curr_decomposed_count;
85
86
37.7M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
37.7M
                  if (curr_decomposed_count >= 0)
88
2.90M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
2.90M
                      int shift = curr_decomposed_count - 1;
93
94
2.90M
                      if (shift < 0)
95
0
                        abort ();
96
2.90M
                      if (shift > 0)
97
2.39M
                        {
98
2.39M
                          int j;
99
100
2.39M
                          decomposed_count += shift;
101
2.39M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.42M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
28.5k
                            decomposed[j + shift] = decomposed[j];
105
2.39M
                        }
106
11.0M
                      for (; shift >= 0; shift--)
107
8.17M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
2.90M
                    }
109
34.8M
                  else
110
34.8M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
34.8M
                      curr++;
113
34.8M
                    }
114
37.7M
                }
115
29.5M
            }
116
29.5M
          }
117
13.8M
        else
118
13.8M
          {
119
13.8M
            count = 0;
120
13.8M
            decomposed_count = 0;
121
13.8M
          }
122
123
43.3M
        i = 0;
124
43.3M
        for (;;)
125
78.1M
          {
126
78.1M
            ucs4_t uc;
127
78.1M
            int ccc;
128
129
78.1M
            if (s < s_end)
130
64.3M
              {
131
                /* Fetch the next character from the decomposition.  */
132
64.3M
                if (i == decomposed_count)
133
29.5M
                  break;
134
34.8M
                uc = decomposed[i];
135
34.8M
                ccc = uc_combining_class (uc);
136
34.8M
              }
137
13.8M
            else
138
13.8M
              {
139
                /* End of string reached.  */
140
13.8M
                uc = 0;
141
13.8M
                ccc = 0;
142
13.8M
              }
143
144
48.6M
            if (ccc == 0)
145
45.9M
              {
146
45.9M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
45.9M
                if (sortbuf_count > 1)
151
1.32M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.32M
                                                           sortbuf + sortbuf_count);
153
154
45.9M
                if (composer != NULL)
155
45.9M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
45.9M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
32.1M
                      {
178
34.7M
                        for (j = 1; j < sortbuf_count; )
179
2.59M
                          {
180
2.59M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.36M
                              {
182
1.36M
                                ucs4_t combined =
183
1.36M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.36M
                                if (combined)
185
1.31M
                                  {
186
1.31M
                                    size_t k;
187
188
1.31M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
2.98M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.66M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.31M
                                    sortbuf_count--;
193
1.31M
                                    continue;
194
1.31M
                                  }
195
1.36M
                              }
196
1.27M
                            j++;
197
1.27M
                          }
198
32.1M
                        if (s < s_end && sortbuf_count == 1)
199
18.5M
                          {
200
18.5M
                            ucs4_t combined =
201
18.5M
                              composer (sortbuf[0].code, uc);
202
18.5M
                            if (combined)
203
17.0k
                              {
204
17.0k
                                uc = combined;
205
17.0k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
17.0k
                                sortbuf_count = 0;
210
17.0k
                              }
211
18.5M
                          }
212
32.1M
                      }
213
45.9M
                  }
214
215
79.4M
                for (j = 0; j < sortbuf_count; j++)
216
33.4M
                  {
217
33.4M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
33.4M
                    if (length < allocated)
221
19.8M
                      {
222
19.8M
                        int ret =
223
19.8M
                          U_UCTOMB (result + length, muc, allocated - length);
224
19.8M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
19.8M
                        if (ret >= 0)
230
19.8M
                          {
231
19.8M
                            length += ret;
232
19.8M
                            goto done_appending;
233
19.8M
                          }
234
19.8M
                      }
235
13.6M
                    {
236
13.6M
                      size_t old_allocated = allocated;
237
13.6M
                      size_t new_allocated = 2 * old_allocated;
238
13.6M
                      if (new_allocated < 64)
239
13.6M
                        new_allocated = 64;
240
13.6M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
13.6M
                      {
243
13.6M
                        UNIT *larger_result;
244
13.6M
                        if (result == NULL)
245
13.6M
                          {
246
13.6M
                            larger_result =
247
13.6M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
13.6M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
13.6M
                          }
254
18.9k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
18.9k
                        else
266
18.9k
                          {
267
18.9k
                            larger_result =
268
18.9k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
18.9k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
18.9k
                          }
275
13.6M
                        result = larger_result;
276
13.6M
                        allocated = new_allocated;
277
13.6M
                        {
278
13.6M
                          int ret =
279
13.6M
                            U_UCTOMB (result + length, muc, allocated - length);
280
13.6M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
13.6M
                          if (ret < 0)
286
0
                            abort ();
287
13.6M
                          length += ret;
288
13.6M
                          goto done_appending;
289
13.6M
                        }
290
13.6M
                      }
291
13.6M
                    }
292
33.4M
                   done_appending: ;
293
33.4M
                  }
294
295
                /* sortbuf is now empty.  */
296
45.9M
                sortbuf_count = 0;
297
45.9M
              }
298
299
48.6M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
13.8M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
34.8M
            if (sortbuf_count == sortbuf_allocated)
305
1.87k
              {
306
1.87k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.87k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.87k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.87k
                new_sortbuf =
312
1.87k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.87k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.87k
                memcpy (new_sortbuf, sortbuf,
319
1.87k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.87k
                if (sortbuf != sortbuf_preallocated)
321
606
                  free (sortbuf);
322
1.87k
                sortbuf = new_sortbuf;
323
1.87k
              }
324
34.8M
            sortbuf[sortbuf_count].code = uc;
325
34.8M
            sortbuf[sortbuf_count].ccc = ccc;
326
34.8M
            sortbuf_count++;
327
328
34.8M
            i++;
329
34.8M
          }
330
331
43.3M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
13.8M
          break;
334
335
29.5M
        s += count;
336
29.5M
      }
337
13.8M
  }
338
339
13.8M
  if (length == 0)
340
186k
    {
341
186k
      if (result == NULL)
342
186k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
186k
          result = (UNIT *) malloc (1);
345
186k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
186k
        }
351
186k
    }
352
13.6M
  else if (result != resultbuf && length < allocated)
353
13.6M
    {
354
      /* Shrink the allocated memory if possible.  */
355
13.6M
      UNIT *memory;
356
357
13.6M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
13.6M
      if (memory != NULL)
359
13.6M
        result = memory;
360
13.6M
    }
361
362
13.8M
  if (sortbuf_count > 0)
363
0
    abort ();
364
13.8M
  if (sortbuf != sortbuf_preallocated)
365
1.26k
    free (sortbuf);
366
367
13.8M
  *lengthp = length;
368
13.8M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
13.8M
}
u8_normalize
Line
Count
Source
21
4.42M
{
22
4.42M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
4.42M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
4.42M
  UNIT *result;
27
4.42M
  size_t length;
28
4.42M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
4.42M
  #define SORTBUF_PREALLOCATED 64
31
4.42M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
4.42M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
4.42M
  size_t sortbuf_allocated;
34
4.42M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
4.42M
  if (resultbuf == NULL)
38
4.42M
    {
39
4.42M
      result = NULL;
40
4.42M
      allocated = 0;
41
4.42M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
4.42M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
4.42M
  sortbuf = sortbuf_preallocated;
51
4.42M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
4.42M
  sortbuf_count = 0;
53
54
4.42M
  {
55
4.42M
    const UNIT *s_end = s + n;
56
57
4.42M
    for (;;)
58
19.9M
      {
59
19.9M
        int count;
60
19.9M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
19.9M
        int decomposed_count;
62
19.9M
        int i;
63
64
19.9M
        if (s < s_end)
65
15.5M
          {
66
            /* Fetch the next character.  */
67
15.5M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
15.5M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
15.5M
            {
77
15.5M
              int curr;
78
79
39.1M
              for (curr = 0; curr < decomposed_count; )
80
23.6M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
23.6M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
23.6M
                  int curr_decomposed_count;
85
86
23.6M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
23.6M
                  if (curr_decomposed_count >= 0)
88
2.87M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
2.87M
                      int shift = curr_decomposed_count - 1;
93
94
2.87M
                      if (shift < 0)
95
0
                        abort ();
96
2.87M
                      if (shift > 0)
97
2.36M
                        {
98
2.36M
                          int j;
99
100
2.36M
                          decomposed_count += shift;
101
2.36M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.37M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
17.5k
                            decomposed[j + shift] = decomposed[j];
105
2.36M
                        }
106
10.9M
                      for (; shift >= 0; shift--)
107
8.09M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
2.87M
                    }
109
20.7M
                  else
110
20.7M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
20.7M
                      curr++;
113
20.7M
                    }
114
23.6M
                }
115
15.5M
            }
116
15.5M
          }
117
4.42M
        else
118
4.42M
          {
119
4.42M
            count = 0;
120
4.42M
            decomposed_count = 0;
121
4.42M
          }
122
123
19.9M
        i = 0;
124
19.9M
        for (;;)
125
40.6M
          {
126
40.6M
            ucs4_t uc;
127
40.6M
            int ccc;
128
129
40.6M
            if (s < s_end)
130
36.2M
              {
131
                /* Fetch the next character from the decomposition.  */
132
36.2M
                if (i == decomposed_count)
133
15.5M
                  break;
134
20.7M
                uc = decomposed[i];
135
20.7M
                ccc = uc_combining_class (uc);
136
20.7M
              }
137
4.42M
            else
138
4.42M
              {
139
                /* End of string reached.  */
140
4.42M
                uc = 0;
141
4.42M
                ccc = 0;
142
4.42M
              }
143
144
25.1M
            if (ccc == 0)
145
22.6M
              {
146
22.6M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
22.6M
                if (sortbuf_count > 1)
151
1.29M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.29M
                                                           sortbuf + sortbuf_count);
153
154
22.6M
                if (composer != NULL)
155
22.6M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
22.6M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
18.2M
                      {
178
20.7M
                        for (j = 1; j < sortbuf_count; )
179
2.49M
                          {
180
2.49M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.32M
                              {
182
1.32M
                                ucs4_t combined =
183
1.32M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.32M
                                if (combined)
185
1.29M
                                  {
186
1.29M
                                    size_t k;
187
188
1.29M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
2.94M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.65M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.29M
                                    sortbuf_count--;
193
1.29M
                                    continue;
194
1.29M
                                  }
195
1.32M
                              }
196
1.19M
                            j++;
197
1.19M
                          }
198
18.2M
                        if (s < s_end && sortbuf_count == 1)
199
13.7M
                          {
200
13.7M
                            ucs4_t combined =
201
13.7M
                              composer (sortbuf[0].code, uc);
202
13.7M
                            if (combined)
203
7.80k
                              {
204
7.80k
                                uc = combined;
205
7.80k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
7.80k
                                sortbuf_count = 0;
210
7.80k
                              }
211
13.7M
                          }
212
18.2M
                      }
213
22.6M
                  }
214
215
42.1M
                for (j = 0; j < sortbuf_count; j++)
216
19.4M
                  {
217
19.4M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
19.4M
                    if (length < allocated)
221
15.0M
                      {
222
15.0M
                        int ret =
223
15.0M
                          U_UCTOMB (result + length, muc, allocated - length);
224
15.0M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
15.0M
                        if (ret >= 0)
230
15.0M
                          {
231
15.0M
                            length += ret;
232
15.0M
                            goto done_appending;
233
15.0M
                          }
234
15.0M
                      }
235
4.43M
                    {
236
4.43M
                      size_t old_allocated = allocated;
237
4.43M
                      size_t new_allocated = 2 * old_allocated;
238
4.43M
                      if (new_allocated < 64)
239
4.42M
                        new_allocated = 64;
240
4.43M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
4.43M
                      {
243
4.43M
                        UNIT *larger_result;
244
4.43M
                        if (result == NULL)
245
4.42M
                          {
246
4.42M
                            larger_result =
247
4.42M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
4.42M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
4.42M
                          }
254
15.1k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
15.1k
                        else
266
15.1k
                          {
267
15.1k
                            larger_result =
268
15.1k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
15.1k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
15.1k
                          }
275
4.43M
                        result = larger_result;
276
4.43M
                        allocated = new_allocated;
277
4.43M
                        {
278
4.43M
                          int ret =
279
4.43M
                            U_UCTOMB (result + length, muc, allocated - length);
280
4.43M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
4.43M
                          if (ret < 0)
286
0
                            abort ();
287
4.43M
                          length += ret;
288
4.43M
                          goto done_appending;
289
4.43M
                        }
290
4.43M
                      }
291
4.43M
                    }
292
19.4M
                   done_appending: ;
293
19.4M
                  }
294
295
                /* sortbuf is now empty.  */
296
22.6M
                sortbuf_count = 0;
297
22.6M
              }
298
299
25.1M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
4.42M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
20.7M
            if (sortbuf_count == sortbuf_allocated)
305
1.06k
              {
306
1.06k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.06k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.06k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.06k
                new_sortbuf =
312
1.06k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.06k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.06k
                memcpy (new_sortbuf, sortbuf,
319
1.06k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.06k
                if (sortbuf != sortbuf_preallocated)
321
606
                  free (sortbuf);
322
1.06k
                sortbuf = new_sortbuf;
323
1.06k
              }
324
20.7M
            sortbuf[sortbuf_count].code = uc;
325
20.7M
            sortbuf[sortbuf_count].ccc = ccc;
326
20.7M
            sortbuf_count++;
327
328
20.7M
            i++;
329
20.7M
          }
330
331
19.9M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
4.42M
          break;
334
335
15.5M
        s += count;
336
15.5M
      }
337
4.42M
  }
338
339
4.42M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
4.42M
  else if (result != resultbuf && length < allocated)
353
4.42M
    {
354
      /* Shrink the allocated memory if possible.  */
355
4.42M
      UNIT *memory;
356
357
4.42M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
4.42M
      if (memory != NULL)
359
4.42M
        result = memory;
360
4.42M
    }
361
362
4.42M
  if (sortbuf_count > 0)
363
0
    abort ();
364
4.42M
  if (sortbuf != sortbuf_preallocated)
365
460
    free (sortbuf);
366
367
4.42M
  *lengthp = length;
368
4.42M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
4.42M
}
u32_normalize
Line
Count
Source
21
9.38M
{
22
9.38M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
9.38M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
9.38M
  UNIT *result;
27
9.38M
  size_t length;
28
9.38M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
9.38M
  #define SORTBUF_PREALLOCATED 64
31
9.38M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
9.38M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
9.38M
  size_t sortbuf_allocated;
34
9.38M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
9.38M
  if (resultbuf == NULL)
38
9.38M
    {
39
9.38M
      result = NULL;
40
9.38M
      allocated = 0;
41
9.38M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
9.38M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
9.38M
  sortbuf = sortbuf_preallocated;
51
9.38M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
9.38M
  sortbuf_count = 0;
53
54
9.38M
  {
55
9.38M
    const UNIT *s_end = s + n;
56
57
9.38M
    for (;;)
58
23.4M
      {
59
23.4M
        int count;
60
23.4M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
23.4M
        int decomposed_count;
62
23.4M
        int i;
63
64
23.4M
        if (s < s_end)
65
14.0M
          {
66
            /* Fetch the next character.  */
67
14.0M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
14.0M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
14.0M
            {
77
14.0M
              int curr;
78
79
28.1M
              for (curr = 0; curr < decomposed_count; )
80
14.0M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
14.0M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
14.0M
                  int curr_decomposed_count;
85
86
14.0M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
14.0M
                  if (curr_decomposed_count >= 0)
88
35.3k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
35.3k
                      int shift = curr_decomposed_count - 1;
93
94
35.3k
                      if (shift < 0)
95
0
                        abort ();
96
35.3k
                      if (shift > 0)
97
35.0k
                        {
98
35.0k
                          int j;
99
100
35.0k
                          decomposed_count += shift;
101
35.0k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
46.0k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
11.0k
                            decomposed[j + shift] = decomposed[j];
105
35.0k
                        }
106
105k
                      for (; shift >= 0; shift--)
107
70.3k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
35.3k
                    }
109
14.0M
                  else
110
14.0M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
14.0M
                      curr++;
113
14.0M
                    }
114
14.0M
                }
115
14.0M
            }
116
14.0M
          }
117
9.38M
        else
118
9.38M
          {
119
9.38M
            count = 0;
120
9.38M
            decomposed_count = 0;
121
9.38M
          }
122
123
23.4M
        i = 0;
124
23.4M
        for (;;)
125
37.4M
          {
126
37.4M
            ucs4_t uc;
127
37.4M
            int ccc;
128
129
37.4M
            if (s < s_end)
130
28.0M
              {
131
                /* Fetch the next character from the decomposition.  */
132
28.0M
                if (i == decomposed_count)
133
14.0M
                  break;
134
14.0M
                uc = decomposed[i];
135
14.0M
                ccc = uc_combining_class (uc);
136
14.0M
              }
137
9.38M
            else
138
9.38M
              {
139
                /* End of string reached.  */
140
9.38M
                uc = 0;
141
9.38M
                ccc = 0;
142
9.38M
              }
143
144
23.4M
            if (ccc == 0)
145
23.3M
              {
146
23.3M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
23.3M
                if (sortbuf_count > 1)
151
27.9k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
27.9k
                                                           sortbuf + sortbuf_count);
153
154
23.3M
                if (composer != NULL)
155
23.3M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
23.3M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
13.9M
                      {
178
14.0M
                        for (j = 1; j < sortbuf_count; )
179
103k
                          {
180
103k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
41.0k
                              {
182
41.0k
                                ucs4_t combined =
183
41.0k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
41.0k
                                if (combined)
185
24.0k
                                  {
186
24.0k
                                    size_t k;
187
188
24.0k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
38.6k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
14.6k
                                      sortbuf[k - 1] = sortbuf[k];
192
24.0k
                                    sortbuf_count--;
193
24.0k
                                    continue;
194
24.0k
                                  }
195
41.0k
                              }
196
79.9k
                            j++;
197
79.9k
                          }
198
13.9M
                        if (s < s_end && sortbuf_count == 1)
199
4.72M
                          {
200
4.72M
                            ucs4_t combined =
201
4.72M
                              composer (sortbuf[0].code, uc);
202
4.72M
                            if (combined)
203
9.29k
                              {
204
9.29k
                                uc = combined;
205
9.29k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
9.29k
                                sortbuf_count = 0;
210
9.29k
                              }
211
4.72M
                          }
212
13.9M
                      }
213
23.3M
                  }
214
215
37.3M
                for (j = 0; j < sortbuf_count; j++)
216
14.0M
                  {
217
14.0M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
14.0M
                    if (length < allocated)
221
4.81M
                      {
222
4.81M
                        int ret =
223
4.81M
                          U_UCTOMB (result + length, muc, allocated - length);
224
4.81M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
4.81M
                        if (ret >= 0)
230
4.81M
                          {
231
4.81M
                            length += ret;
232
4.81M
                            goto done_appending;
233
4.81M
                          }
234
4.81M
                      }
235
9.20M
                    {
236
9.20M
                      size_t old_allocated = allocated;
237
9.20M
                      size_t new_allocated = 2 * old_allocated;
238
9.20M
                      if (new_allocated < 64)
239
9.20M
                        new_allocated = 64;
240
9.20M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
9.20M
                      {
243
9.20M
                        UNIT *larger_result;
244
9.20M
                        if (result == NULL)
245
9.20M
                          {
246
9.20M
                            larger_result =
247
9.20M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
9.20M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
9.20M
                          }
254
3.72k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
3.72k
                        else
266
3.72k
                          {
267
3.72k
                            larger_result =
268
3.72k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
3.72k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
3.72k
                          }
275
9.20M
                        result = larger_result;
276
9.20M
                        allocated = new_allocated;
277
9.20M
                        {
278
9.20M
                          int ret =
279
9.20M
                            U_UCTOMB (result + length, muc, allocated - length);
280
9.20M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
9.20M
                          if (ret < 0)
286
0
                            abort ();
287
9.20M
                          length += ret;
288
9.20M
                          goto done_appending;
289
9.20M
                        }
290
9.20M
                      }
291
9.20M
                    }
292
14.0M
                   done_appending: ;
293
14.0M
                  }
294
295
                /* sortbuf is now empty.  */
296
23.3M
                sortbuf_count = 0;
297
23.3M
              }
298
299
23.4M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
9.38M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
14.0M
            if (sortbuf_count == sortbuf_allocated)
305
807
              {
306
807
                struct ucs4_with_ccc *new_sortbuf;
307
308
807
                sortbuf_allocated = 2 * sortbuf_allocated;
309
807
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
807
                new_sortbuf =
312
807
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
807
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
807
                memcpy (new_sortbuf, sortbuf,
319
807
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
807
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
807
                sortbuf = new_sortbuf;
323
807
              }
324
14.0M
            sortbuf[sortbuf_count].code = uc;
325
14.0M
            sortbuf[sortbuf_count].ccc = ccc;
326
14.0M
            sortbuf_count++;
327
328
14.0M
            i++;
329
14.0M
          }
330
331
23.4M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
9.38M
          break;
334
335
14.0M
        s += count;
336
14.0M
      }
337
9.38M
  }
338
339
9.38M
  if (length == 0)
340
186k
    {
341
186k
      if (result == NULL)
342
186k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
186k
          result = (UNIT *) malloc (1);
345
186k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
186k
        }
351
186k
    }
352
9.20M
  else if (result != resultbuf && length < allocated)
353
9.20M
    {
354
      /* Shrink the allocated memory if possible.  */
355
9.20M
      UNIT *memory;
356
357
9.20M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
9.20M
      if (memory != NULL)
359
9.20M
        result = memory;
360
9.20M
    }
361
362
9.38M
  if (sortbuf_count > 0)
363
0
    abort ();
364
9.38M
  if (sortbuf != sortbuf_preallocated)
365
807
    free (sortbuf);
366
367
9.38M
  *lengthp = length;
368
9.38M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
9.38M
}