Coverage Report

Created: 2025-11-09 06:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
15.0M
{
22
15.0M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
15.0M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
15.0M
  UNIT *result;
27
15.0M
  size_t length;
28
15.0M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
15.0M
  #define SORTBUF_PREALLOCATED 64
31
15.0M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
15.0M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
15.0M
  size_t sortbuf_allocated;
34
15.0M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
15.0M
  if (resultbuf == NULL)
38
15.0M
    {
39
15.0M
      result = NULL;
40
15.0M
      allocated = 0;
41
15.0M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
15.0M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
15.0M
  sortbuf = sortbuf_preallocated;
51
15.0M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
15.0M
  sortbuf_count = 0;
53
54
15.0M
  {
55
15.0M
    const UNIT *s_end = s + n;
56
57
15.0M
    for (;;)
58
45.6M
      {
59
45.6M
        int count;
60
45.6M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
45.6M
        int decomposed_count;
62
45.6M
        int i;
63
64
45.6M
        if (s < s_end)
65
30.5M
          {
66
            /* Fetch the next character.  */
67
30.5M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
30.5M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
30.5M
            {
77
30.5M
              int curr;
78
79
69.9M
              for (curr = 0; curr < decomposed_count; )
80
39.3M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
39.3M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
39.3M
                  int curr_decomposed_count;
85
86
39.3M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
39.3M
                  if (curr_decomposed_count >= 0)
88
3.04M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.04M
                      int shift = curr_decomposed_count - 1;
93
94
3.04M
                      if (shift < 0)
95
0
                        abort ();
96
3.04M
                      if (shift > 0)
97
2.64M
                        {
98
2.64M
                          int j;
99
100
2.64M
                          decomposed_count += shift;
101
2.64M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.67M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
27.5k
                            decomposed[j + shift] = decomposed[j];
105
2.64M
                        }
106
11.8M
                      for (; shift >= 0; shift--)
107
8.79M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.04M
                    }
109
36.3M
                  else
110
36.3M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
36.3M
                      curr++;
113
36.3M
                    }
114
39.3M
                }
115
30.5M
            }
116
30.5M
          }
117
15.0M
        else
118
15.0M
          {
119
15.0M
            count = 0;
120
15.0M
            decomposed_count = 0;
121
15.0M
          }
122
123
45.6M
        i = 0;
124
45.6M
        for (;;)
125
81.9M
          {
126
81.9M
            ucs4_t uc;
127
81.9M
            int ccc;
128
129
81.9M
            if (s < s_end)
130
66.8M
              {
131
                /* Fetch the next character from the decomposition.  */
132
66.8M
                if (i == decomposed_count)
133
30.5M
                  break;
134
36.3M
                uc = decomposed[i];
135
36.3M
                ccc = uc_combining_class (uc);
136
36.3M
              }
137
15.0M
            else
138
15.0M
              {
139
                /* End of string reached.  */
140
15.0M
                uc = 0;
141
15.0M
                ccc = 0;
142
15.0M
              }
143
144
51.3M
            if (ccc == 0)
145
48.3M
              {
146
48.3M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
48.3M
                if (sortbuf_count > 1)
151
1.49M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.49M
                                                           sortbuf + sortbuf_count);
153
154
48.3M
                if (composer != NULL)
155
48.3M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
48.3M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
33.3M
                      {
178
36.2M
                        for (j = 1; j < sortbuf_count; )
179
2.94M
                          {
180
2.94M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.52M
                              {
182
1.52M
                                ucs4_t combined =
183
1.52M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.52M
                                if (combined)
185
1.47M
                                  {
186
1.47M
                                    size_t k;
187
188
1.47M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.28M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.80M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.47M
                                    sortbuf_count--;
193
1.47M
                                    continue;
194
1.47M
                                  }
195
1.52M
                              }
196
1.46M
                            j++;
197
1.46M
                          }
198
33.3M
                        if (s < s_end && sortbuf_count == 1)
199
18.3M
                          {
200
18.3M
                            ucs4_t combined =
201
18.3M
                              composer (sortbuf[0].code, uc);
202
18.3M
                            if (combined)
203
16.2k
                              {
204
16.2k
                                uc = combined;
205
16.2k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
16.2k
                                sortbuf_count = 0;
210
16.2k
                              }
211
18.3M
                          }
212
33.3M
                      }
213
48.3M
                  }
214
215
83.2M
                for (j = 0; j < sortbuf_count; j++)
216
34.8M
                  {
217
34.8M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
34.8M
                    if (length < allocated)
221
19.8M
                      {
222
19.8M
                        int ret =
223
19.8M
                          U_UCTOMB (result + length, muc, allocated - length);
224
19.8M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
19.8M
                        if (ret >= 0)
230
19.8M
                          {
231
19.8M
                            length += ret;
232
19.8M
                            goto done_appending;
233
19.8M
                          }
234
19.8M
                      }
235
14.9M
                    {
236
14.9M
                      size_t old_allocated = allocated;
237
14.9M
                      size_t new_allocated = 2 * old_allocated;
238
14.9M
                      if (new_allocated < 64)
239
14.9M
                        new_allocated = 64;
240
14.9M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
14.9M
                      {
243
14.9M
                        UNIT *larger_result;
244
14.9M
                        if (result == NULL)
245
14.9M
                          {
246
14.9M
                            larger_result =
247
14.9M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
14.9M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
14.9M
                          }
254
15.6k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
15.6k
                        else
266
15.6k
                          {
267
15.6k
                            larger_result =
268
15.6k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
15.6k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
15.6k
                          }
275
14.9M
                        result = larger_result;
276
14.9M
                        allocated = new_allocated;
277
14.9M
                        {
278
14.9M
                          int ret =
279
14.9M
                            U_UCTOMB (result + length, muc, allocated - length);
280
14.9M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
14.9M
                          if (ret < 0)
286
0
                            abort ();
287
14.9M
                          length += ret;
288
14.9M
                          goto done_appending;
289
14.9M
                        }
290
14.9M
                      }
291
14.9M
                    }
292
34.8M
                   done_appending: ;
293
34.8M
                  }
294
295
                /* sortbuf is now empty.  */
296
48.3M
                sortbuf_count = 0;
297
48.3M
              }
298
299
51.3M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
15.0M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
36.3M
            if (sortbuf_count == sortbuf_allocated)
305
1.89k
              {
306
1.89k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.89k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.89k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.89k
                new_sortbuf =
312
1.89k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.89k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.89k
                memcpy (new_sortbuf, sortbuf,
319
1.89k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.89k
                if (sortbuf != sortbuf_preallocated)
321
1.89k
                  free (sortbuf);
322
1.89k
                sortbuf = new_sortbuf;
323
1.89k
              }
324
36.3M
            sortbuf[sortbuf_count].code = uc;
325
36.3M
            sortbuf[sortbuf_count].ccc = ccc;
326
36.3M
            sortbuf_count++;
327
328
36.3M
            i++;
329
36.3M
          }
330
331
45.6M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
15.0M
          break;
334
335
30.5M
        s += count;
336
30.5M
      }
337
15.0M
  }
338
339
15.0M
  if (length == 0)
340
112k
    {
341
112k
      if (result == NULL)
342
112k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
112k
          result = (UNIT *) malloc (1);
345
112k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
112k
        }
351
112k
    }
352
14.9M
  else if (result != resultbuf && length < allocated)
353
14.9M
    {
354
      /* Shrink the allocated memory if possible.  */
355
14.9M
      UNIT *memory;
356
357
14.9M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
14.9M
      if (memory != NULL)
359
14.9M
        result = memory;
360
14.9M
    }
361
362
15.0M
  if (sortbuf_count > 0)
363
0
    abort ();
364
15.0M
  if (sortbuf != sortbuf_preallocated)
365
15.0M
    free (sortbuf);
366
367
15.0M
  *lengthp = length;
368
15.0M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
15.0M
}
u8_normalize
Line
Count
Source
21
4.92M
{
22
4.92M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
4.92M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
4.92M
  UNIT *result;
27
4.92M
  size_t length;
28
4.92M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
4.92M
  #define SORTBUF_PREALLOCATED 64
31
4.92M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
4.92M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
4.92M
  size_t sortbuf_allocated;
34
4.92M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
4.92M
  if (resultbuf == NULL)
38
4.92M
    {
39
4.92M
      result = NULL;
40
4.92M
      allocated = 0;
41
4.92M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
4.92M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
4.92M
  sortbuf = sortbuf_preallocated;
51
4.92M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
4.92M
  sortbuf_count = 0;
53
54
4.92M
  {
55
4.92M
    const UNIT *s_end = s + n;
56
57
4.92M
    for (;;)
58
21.0M
      {
59
21.0M
        int count;
60
21.0M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
21.0M
        int decomposed_count;
62
21.0M
        int i;
63
64
21.0M
        if (s < s_end)
65
16.1M
          {
66
            /* Fetch the next character.  */
67
16.1M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
16.1M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
16.1M
            {
77
16.1M
              int curr;
78
79
41.0M
              for (curr = 0; curr < decomposed_count; )
80
24.8M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
24.8M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
24.8M
                  int curr_decomposed_count;
85
86
24.8M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
24.8M
                  if (curr_decomposed_count >= 0)
88
3.01M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.01M
                      int shift = curr_decomposed_count - 1;
93
94
3.01M
                      if (shift < 0)
95
0
                        abort ();
96
3.01M
                      if (shift > 0)
97
2.61M
                        {
98
2.61M
                          int j;
99
100
2.61M
                          decomposed_count += shift;
101
2.61M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.63M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
17.8k
                            decomposed[j + shift] = decomposed[j];
105
2.61M
                        }
106
11.7M
                      for (; shift >= 0; shift--)
107
8.72M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.01M
                    }
109
21.8M
                  else
110
21.8M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
21.8M
                      curr++;
113
21.8M
                    }
114
24.8M
                }
115
16.1M
            }
116
16.1M
          }
117
4.92M
        else
118
4.92M
          {
119
4.92M
            count = 0;
120
4.92M
            decomposed_count = 0;
121
4.92M
          }
122
123
21.0M
        i = 0;
124
21.0M
        for (;;)
125
42.9M
          {
126
42.9M
            ucs4_t uc;
127
42.9M
            int ccc;
128
129
42.9M
            if (s < s_end)
130
38.0M
              {
131
                /* Fetch the next character from the decomposition.  */
132
38.0M
                if (i == decomposed_count)
133
16.1M
                  break;
134
21.8M
                uc = decomposed[i];
135
21.8M
                ccc = uc_combining_class (uc);
136
21.8M
              }
137
4.92M
            else
138
4.92M
              {
139
                /* End of string reached.  */
140
4.92M
                uc = 0;
141
4.92M
                ccc = 0;
142
4.92M
              }
143
144
26.7M
            if (ccc == 0)
145
23.9M
              {
146
23.9M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
23.9M
                if (sortbuf_count > 1)
151
1.46M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.46M
                                                           sortbuf + sortbuf_count);
153
154
23.9M
                if (composer != NULL)
155
23.9M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
23.9M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
18.9M
                      {
178
21.8M
                        for (j = 1; j < sortbuf_count; )
179
2.84M
                          {
180
2.84M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.48M
                              {
182
1.48M
                                ucs4_t combined =
183
1.48M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.48M
                                if (combined)
185
1.45M
                                  {
186
1.45M
                                    size_t k;
187
188
1.45M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.24M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.79M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.45M
                                    sortbuf_count--;
193
1.45M
                                    continue;
194
1.45M
                                  }
195
1.48M
                              }
196
1.38M
                            j++;
197
1.38M
                          }
198
18.9M
                        if (s < s_end && sortbuf_count == 1)
199
14.0M
                          {
200
14.0M
                            ucs4_t combined =
201
14.0M
                              composer (sortbuf[0].code, uc);
202
14.0M
                            if (combined)
203
7.16k
                              {
204
7.16k
                                uc = combined;
205
7.16k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
7.16k
                                sortbuf_count = 0;
210
7.16k
                              }
211
14.0M
                          }
212
18.9M
                      }
213
23.9M
                  }
214
215
44.3M
                for (j = 0; j < sortbuf_count; j++)
216
20.3M
                  {
217
20.3M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
20.3M
                    if (length < allocated)
221
15.4M
                      {
222
15.4M
                        int ret =
223
15.4M
                          U_UCTOMB (result + length, muc, allocated - length);
224
15.4M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
15.4M
                        if (ret >= 0)
230
15.4M
                          {
231
15.4M
                            length += ret;
232
15.4M
                            goto done_appending;
233
15.4M
                          }
234
15.4M
                      }
235
4.93M
                    {
236
4.93M
                      size_t old_allocated = allocated;
237
4.93M
                      size_t new_allocated = 2 * old_allocated;
238
4.93M
                      if (new_allocated < 64)
239
4.92M
                        new_allocated = 64;
240
4.93M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
4.93M
                      {
243
4.93M
                        UNIT *larger_result;
244
4.93M
                        if (result == NULL)
245
4.92M
                          {
246
4.92M
                            larger_result =
247
4.92M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
4.92M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
4.92M
                          }
254
12.7k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
12.7k
                        else
266
12.7k
                          {
267
12.7k
                            larger_result =
268
12.7k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
12.7k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
12.7k
                          }
275
4.93M
                        result = larger_result;
276
4.93M
                        allocated = new_allocated;
277
4.93M
                        {
278
4.93M
                          int ret =
279
4.93M
                            U_UCTOMB (result + length, muc, allocated - length);
280
4.93M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
4.93M
                          if (ret < 0)
286
0
                            abort ();
287
4.93M
                          length += ret;
288
4.93M
                          goto done_appending;
289
4.93M
                        }
290
4.93M
                      }
291
4.93M
                    }
292
20.3M
                   done_appending: ;
293
20.3M
                  }
294
295
                /* sortbuf is now empty.  */
296
23.9M
                sortbuf_count = 0;
297
23.9M
              }
298
299
26.7M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
4.92M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
21.8M
            if (sortbuf_count == sortbuf_allocated)
305
1.07k
              {
306
1.07k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.07k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.07k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.07k
                new_sortbuf =
312
1.07k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.07k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.07k
                memcpy (new_sortbuf, sortbuf,
319
1.07k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.07k
                if (sortbuf != sortbuf_preallocated)
321
1.07k
                  free (sortbuf);
322
1.07k
                sortbuf = new_sortbuf;
323
1.07k
              }
324
21.8M
            sortbuf[sortbuf_count].code = uc;
325
21.8M
            sortbuf[sortbuf_count].ccc = ccc;
326
21.8M
            sortbuf_count++;
327
328
21.8M
            i++;
329
21.8M
          }
330
331
21.0M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
4.92M
          break;
334
335
16.1M
        s += count;
336
16.1M
      }
337
4.92M
  }
338
339
4.92M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
4.92M
  else if (result != resultbuf && length < allocated)
353
4.92M
    {
354
      /* Shrink the allocated memory if possible.  */
355
4.92M
      UNIT *memory;
356
357
4.92M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
4.92M
      if (memory != NULL)
359
4.92M
        result = memory;
360
4.92M
    }
361
362
4.92M
  if (sortbuf_count > 0)
363
0
    abort ();
364
4.92M
  if (sortbuf != sortbuf_preallocated)
365
4.92M
    free (sortbuf);
366
367
4.92M
  *lengthp = length;
368
4.92M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
4.92M
}
u32_normalize
Line
Count
Source
21
10.1M
{
22
10.1M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
10.1M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
10.1M
  UNIT *result;
27
10.1M
  size_t length;
28
10.1M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
10.1M
  #define SORTBUF_PREALLOCATED 64
31
10.1M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
10.1M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
10.1M
  size_t sortbuf_allocated;
34
10.1M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
10.1M
  if (resultbuf == NULL)
38
10.1M
    {
39
10.1M
      result = NULL;
40
10.1M
      allocated = 0;
41
10.1M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
10.1M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
10.1M
  sortbuf = sortbuf_preallocated;
51
10.1M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
10.1M
  sortbuf_count = 0;
53
54
10.1M
  {
55
10.1M
    const UNIT *s_end = s + n;
56
57
10.1M
    for (;;)
58
24.5M
      {
59
24.5M
        int count;
60
24.5M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
24.5M
        int decomposed_count;
62
24.5M
        int i;
63
64
24.5M
        if (s < s_end)
65
14.4M
          {
66
            /* Fetch the next character.  */
67
14.4M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
14.4M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
14.4M
            {
77
14.4M
              int curr;
78
79
28.9M
              for (curr = 0; curr < decomposed_count; )
80
14.4M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
14.4M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
14.4M
                  int curr_decomposed_count;
85
86
14.4M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
14.4M
                  if (curr_decomposed_count >= 0)
88
34.4k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
34.4k
                      int shift = curr_decomposed_count - 1;
93
94
34.4k
                      if (shift < 0)
95
0
                        abort ();
96
34.4k
                      if (shift > 0)
97
34.0k
                        {
98
34.0k
                          int j;
99
100
34.0k
                          decomposed_count += shift;
101
34.0k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
43.7k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
9.69k
                            decomposed[j + shift] = decomposed[j];
105
34.0k
                        }
106
102k
                      for (; shift >= 0; shift--)
107
68.4k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
34.4k
                    }
109
14.4M
                  else
110
14.4M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
14.4M
                      curr++;
113
14.4M
                    }
114
14.4M
                }
115
14.4M
            }
116
14.4M
          }
117
10.1M
        else
118
10.1M
          {
119
10.1M
            count = 0;
120
10.1M
            decomposed_count = 0;
121
10.1M
          }
122
123
24.5M
        i = 0;
124
24.5M
        for (;;)
125
39.0M
          {
126
39.0M
            ucs4_t uc;
127
39.0M
            int ccc;
128
129
39.0M
            if (s < s_end)
130
28.8M
              {
131
                /* Fetch the next character from the decomposition.  */
132
28.8M
                if (i == decomposed_count)
133
14.4M
                  break;
134
14.4M
                uc = decomposed[i];
135
14.4M
                ccc = uc_combining_class (uc);
136
14.4M
              }
137
10.1M
            else
138
10.1M
              {
139
                /* End of string reached.  */
140
10.1M
                uc = 0;
141
10.1M
                ccc = 0;
142
10.1M
              }
143
144
24.5M
            if (ccc == 0)
145
24.4M
              {
146
24.4M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
24.4M
                if (sortbuf_count > 1)
151
27.0k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
27.0k
                                                           sortbuf + sortbuf_count);
153
154
24.4M
                if (composer != NULL)
155
24.4M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
24.4M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
14.3M
                      {
178
14.4M
                        for (j = 1; j < sortbuf_count; )
179
100k
                          {
180
100k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
38.2k
                              {
182
38.2k
                                ucs4_t combined =
183
38.2k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
38.2k
                                if (combined)
185
23.1k
                                  {
186
23.1k
                                    size_t k;
187
188
23.1k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
36.6k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
13.5k
                                      sortbuf[k - 1] = sortbuf[k];
192
23.1k
                                    sortbuf_count--;
193
23.1k
                                    continue;
194
23.1k
                                  }
195
38.2k
                              }
196
77.7k
                            j++;
197
77.7k
                          }
198
14.3M
                        if (s < s_end && sortbuf_count == 1)
199
4.32M
                          {
200
4.32M
                            ucs4_t combined =
201
4.32M
                              composer (sortbuf[0].code, uc);
202
4.32M
                            if (combined)
203
9.10k
                              {
204
9.10k
                                uc = combined;
205
9.10k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
9.10k
                                sortbuf_count = 0;
210
9.10k
                              }
211
4.32M
                          }
212
14.3M
                      }
213
24.4M
                  }
214
215
38.9M
                for (j = 0; j < sortbuf_count; j++)
216
14.4M
                  {
217
14.4M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
14.4M
                    if (length < allocated)
221
4.40M
                      {
222
4.40M
                        int ret =
223
4.40M
                          U_UCTOMB (result + length, muc, allocated - length);
224
4.40M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
4.40M
                        if (ret >= 0)
230
4.40M
                          {
231
4.40M
                            length += ret;
232
4.40M
                            goto done_appending;
233
4.40M
                          }
234
4.40M
                      }
235
10.0M
                    {
236
10.0M
                      size_t old_allocated = allocated;
237
10.0M
                      size_t new_allocated = 2 * old_allocated;
238
10.0M
                      if (new_allocated < 64)
239
10.0M
                        new_allocated = 64;
240
10.0M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
10.0M
                      {
243
10.0M
                        UNIT *larger_result;
244
10.0M
                        if (result == NULL)
245
10.0M
                          {
246
10.0M
                            larger_result =
247
10.0M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
10.0M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
10.0M
                          }
254
2.98k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
2.98k
                        else
266
2.98k
                          {
267
2.98k
                            larger_result =
268
2.98k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
2.98k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
2.98k
                          }
275
10.0M
                        result = larger_result;
276
10.0M
                        allocated = new_allocated;
277
10.0M
                        {
278
10.0M
                          int ret =
279
10.0M
                            U_UCTOMB (result + length, muc, allocated - length);
280
10.0M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
10.0M
                          if (ret < 0)
286
0
                            abort ();
287
10.0M
                          length += ret;
288
10.0M
                          goto done_appending;
289
10.0M
                        }
290
10.0M
                      }
291
10.0M
                    }
292
14.4M
                   done_appending: ;
293
14.4M
                  }
294
295
                /* sortbuf is now empty.  */
296
24.4M
                sortbuf_count = 0;
297
24.4M
              }
298
299
24.5M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
10.1M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
14.4M
            if (sortbuf_count == sortbuf_allocated)
305
825
              {
306
825
                struct ucs4_with_ccc *new_sortbuf;
307
308
825
                sortbuf_allocated = 2 * sortbuf_allocated;
309
825
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
825
                new_sortbuf =
312
825
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
825
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
825
                memcpy (new_sortbuf, sortbuf,
319
825
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
825
                if (sortbuf != sortbuf_preallocated)
321
825
                  free (sortbuf);
322
825
                sortbuf = new_sortbuf;
323
825
              }
324
14.4M
            sortbuf[sortbuf_count].code = uc;
325
14.4M
            sortbuf[sortbuf_count].ccc = ccc;
326
14.4M
            sortbuf_count++;
327
328
14.4M
            i++;
329
14.4M
          }
330
331
24.5M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
10.1M
          break;
334
335
14.4M
        s += count;
336
14.4M
      }
337
10.1M
  }
338
339
10.1M
  if (length == 0)
340
112k
    {
341
112k
      if (result == NULL)
342
112k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
112k
          result = (UNIT *) malloc (1);
345
112k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
112k
        }
351
112k
    }
352
10.0M
  else if (result != resultbuf && length < allocated)
353
10.0M
    {
354
      /* Shrink the allocated memory if possible.  */
355
10.0M
      UNIT *memory;
356
357
10.0M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
10.0M
      if (memory != NULL)
359
10.0M
        result = memory;
360
10.0M
    }
361
362
10.1M
  if (sortbuf_count > 0)
363
0
    abort ();
364
10.1M
  if (sortbuf != sortbuf_preallocated)
365
10.1M
    free (sortbuf);
366
367
10.1M
  *lengthp = length;
368
10.1M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
10.1M
}