Coverage Report

Created: 2025-11-08 07:07

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
10.6M
{
22
10.6M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
10.6M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
10.6M
  UNIT *result;
27
10.6M
  size_t length;
28
10.6M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
10.6M
  #define SORTBUF_PREALLOCATED 64
31
10.6M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
10.6M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
10.6M
  size_t sortbuf_allocated;
34
10.6M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
10.6M
  if (resultbuf == NULL)
38
10.6M
    {
39
10.6M
      result = NULL;
40
10.6M
      allocated = 0;
41
10.6M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
10.6M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
10.6M
  sortbuf = sortbuf_preallocated;
51
10.6M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
10.6M
  sortbuf_count = 0;
53
54
10.6M
  {
55
10.6M
    const UNIT *s_end = s + n;
56
57
10.6M
    for (;;)
58
34.3M
      {
59
34.3M
        int count;
60
34.3M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
34.3M
        int decomposed_count;
62
34.3M
        int i;
63
64
34.3M
        if (s < s_end)
65
23.7M
          {
66
            /* Fetch the next character.  */
67
23.7M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
23.7M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
23.7M
            {
77
23.7M
              int curr;
78
79
56.0M
              for (curr = 0; curr < decomposed_count; )
80
32.3M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
32.3M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
32.3M
                  int curr_decomposed_count;
85
86
32.3M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
32.3M
                  if (curr_decomposed_count >= 0)
88
2.98M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
2.98M
                      int shift = curr_decomposed_count - 1;
93
94
2.98M
                      if (shift < 0)
95
0
                        abort ();
96
2.98M
                      if (shift > 0)
97
2.58M
                        {
98
2.58M
                          int j;
99
100
2.58M
                          decomposed_count += shift;
101
2.58M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.61M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
27.6k
                            decomposed[j + shift] = decomposed[j];
105
2.58M
                        }
106
11.6M
                      for (; shift >= 0; shift--)
107
8.66M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
2.98M
                    }
109
29.3M
                  else
110
29.3M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
29.3M
                      curr++;
113
29.3M
                    }
114
32.3M
                }
115
23.7M
            }
116
23.7M
          }
117
10.6M
        else
118
10.6M
          {
119
10.6M
            count = 0;
120
10.6M
            decomposed_count = 0;
121
10.6M
          }
122
123
34.3M
        i = 0;
124
34.3M
        for (;;)
125
63.7M
          {
126
63.7M
            ucs4_t uc;
127
63.7M
            int ccc;
128
129
63.7M
            if (s < s_end)
130
53.0M
              {
131
                /* Fetch the next character from the decomposition.  */
132
53.0M
                if (i == decomposed_count)
133
23.7M
                  break;
134
29.3M
                uc = decomposed[i];
135
29.3M
                ccc = uc_combining_class (uc);
136
29.3M
              }
137
10.6M
            else
138
10.6M
              {
139
                /* End of string reached.  */
140
10.6M
                uc = 0;
141
10.6M
                ccc = 0;
142
10.6M
              }
143
144
40.0M
            if (ccc == 0)
145
37.1M
              {
146
37.1M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
37.1M
                if (sortbuf_count > 1)
151
1.51M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.51M
                                                           sortbuf + sortbuf_count);
153
154
37.1M
                if (composer != NULL)
155
37.1M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
37.1M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
26.4M
                      {
178
29.3M
                        for (j = 1; j < sortbuf_count; )
179
2.88M
                          {
180
2.88M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.54M
                              {
182
1.54M
                                ucs4_t combined =
183
1.54M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.54M
                                if (combined)
185
1.49M
                                  {
186
1.49M
                                    size_t k;
187
188
1.49M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.22M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.72M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.49M
                                    sortbuf_count--;
193
1.49M
                                    continue;
194
1.49M
                                  }
195
1.54M
                              }
196
1.38M
                            j++;
197
1.38M
                          }
198
26.4M
                        if (s < s_end && sortbuf_count == 1)
199
15.8M
                          {
200
15.8M
                            ucs4_t combined =
201
15.8M
                              composer (sortbuf[0].code, uc);
202
15.8M
                            if (combined)
203
16.2k
                              {
204
16.2k
                                uc = combined;
205
16.2k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
16.2k
                                sortbuf_count = 0;
210
16.2k
                              }
211
15.8M
                          }
212
26.4M
                      }
213
37.1M
                  }
214
215
65.0M
                for (j = 0; j < sortbuf_count; j++)
216
27.8M
                  {
217
27.8M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
27.8M
                    if (length < allocated)
221
17.3M
                      {
222
17.3M
                        int ret =
223
17.3M
                          U_UCTOMB (result + length, muc, allocated - length);
224
17.3M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
17.3M
                        if (ret >= 0)
230
17.3M
                          {
231
17.3M
                            length += ret;
232
17.3M
                            goto done_appending;
233
17.3M
                          }
234
17.3M
                      }
235
10.5M
                    {
236
10.5M
                      size_t old_allocated = allocated;
237
10.5M
                      size_t new_allocated = 2 * old_allocated;
238
10.5M
                      if (new_allocated < 64)
239
10.5M
                        new_allocated = 64;
240
10.5M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
10.5M
                      {
243
10.5M
                        UNIT *larger_result;
244
10.5M
                        if (result == NULL)
245
10.5M
                          {
246
10.5M
                            larger_result =
247
10.5M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
10.5M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
10.5M
                          }
254
15.4k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
15.4k
                        else
266
15.4k
                          {
267
15.4k
                            larger_result =
268
15.4k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
15.4k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
15.4k
                          }
275
10.5M
                        result = larger_result;
276
10.5M
                        allocated = new_allocated;
277
10.5M
                        {
278
10.5M
                          int ret =
279
10.5M
                            U_UCTOMB (result + length, muc, allocated - length);
280
10.5M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
10.5M
                          if (ret < 0)
286
0
                            abort ();
287
10.5M
                          length += ret;
288
10.5M
                          goto done_appending;
289
10.5M
                        }
290
10.5M
                      }
291
10.5M
                    }
292
27.8M
                   done_appending: ;
293
27.8M
                  }
294
295
                /* sortbuf is now empty.  */
296
37.1M
                sortbuf_count = 0;
297
37.1M
              }
298
299
40.0M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
10.6M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
29.3M
            if (sortbuf_count == sortbuf_allocated)
305
1.88k
              {
306
1.88k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.88k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.88k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.88k
                new_sortbuf =
312
1.88k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.88k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.88k
                memcpy (new_sortbuf, sortbuf,
319
1.88k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.88k
                if (sortbuf != sortbuf_preallocated)
321
1.88k
                  free (sortbuf);
322
1.88k
                sortbuf = new_sortbuf;
323
1.88k
              }
324
29.3M
            sortbuf[sortbuf_count].code = uc;
325
29.3M
            sortbuf[sortbuf_count].ccc = ccc;
326
29.3M
            sortbuf_count++;
327
328
29.3M
            i++;
329
29.3M
          }
330
331
34.3M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
10.6M
          break;
334
335
23.7M
        s += count;
336
23.7M
      }
337
10.6M
  }
338
339
10.6M
  if (length == 0)
340
112k
    {
341
112k
      if (result == NULL)
342
112k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
112k
          result = (UNIT *) malloc (1);
345
112k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
112k
        }
351
112k
    }
352
10.5M
  else if (result != resultbuf && length < allocated)
353
10.5M
    {
354
      /* Shrink the allocated memory if possible.  */
355
10.5M
      UNIT *memory;
356
357
10.5M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
10.5M
      if (memory != NULL)
359
10.5M
        result = memory;
360
10.5M
    }
361
362
10.6M
  if (sortbuf_count > 0)
363
0
    abort ();
364
10.6M
  if (sortbuf != sortbuf_preallocated)
365
10.6M
    free (sortbuf);
366
367
10.6M
  *lengthp = length;
368
10.6M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
10.6M
}
u8_normalize
Line
Count
Source
21
3.44M
{
22
3.44M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
3.44M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
3.44M
  UNIT *result;
27
3.44M
  size_t length;
28
3.44M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
3.44M
  #define SORTBUF_PREALLOCATED 64
31
3.44M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
3.44M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
3.44M
  size_t sortbuf_allocated;
34
3.44M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
3.44M
  if (resultbuf == NULL)
38
3.44M
    {
39
3.44M
      result = NULL;
40
3.44M
      allocated = 0;
41
3.44M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
3.44M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
3.44M
  sortbuf = sortbuf_preallocated;
51
3.44M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
3.44M
  sortbuf_count = 0;
53
54
3.44M
  {
55
3.44M
    const UNIT *s_end = s + n;
56
57
3.44M
    for (;;)
58
16.5M
      {
59
16.5M
        int count;
60
16.5M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
16.5M
        int decomposed_count;
62
16.5M
        int i;
63
64
16.5M
        if (s < s_end)
65
13.1M
          {
66
            /* Fetch the next character.  */
67
13.1M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
13.1M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
13.1M
            {
77
13.1M
              int curr;
78
79
34.8M
              for (curr = 0; curr < decomposed_count; )
80
21.7M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
21.7M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
21.7M
                  int curr_decomposed_count;
85
86
21.7M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
21.7M
                  if (curr_decomposed_count >= 0)
88
2.95M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
2.95M
                      int shift = curr_decomposed_count - 1;
93
94
2.95M
                      if (shift < 0)
95
0
                        abort ();
96
2.95M
                      if (shift > 0)
97
2.55M
                        {
98
2.55M
                          int j;
99
100
2.55M
                          decomposed_count += shift;
101
2.55M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.57M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
17.8k
                            decomposed[j + shift] = decomposed[j];
105
2.55M
                        }
106
11.5M
                      for (; shift >= 0; shift--)
107
8.60M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
2.95M
                    }
109
18.7M
                  else
110
18.7M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
18.7M
                      curr++;
113
18.7M
                    }
114
21.7M
                }
115
13.1M
            }
116
13.1M
          }
117
3.44M
        else
118
3.44M
          {
119
3.44M
            count = 0;
120
3.44M
            decomposed_count = 0;
121
3.44M
          }
122
123
16.5M
        i = 0;
124
16.5M
        for (;;)
125
35.3M
          {
126
35.3M
            ucs4_t uc;
127
35.3M
            int ccc;
128
129
35.3M
            if (s < s_end)
130
31.8M
              {
131
                /* Fetch the next character from the decomposition.  */
132
31.8M
                if (i == decomposed_count)
133
13.1M
                  break;
134
18.7M
                uc = decomposed[i];
135
18.7M
                ccc = uc_combining_class (uc);
136
18.7M
              }
137
3.44M
            else
138
3.44M
              {
139
                /* End of string reached.  */
140
3.44M
                uc = 0;
141
3.44M
                ccc = 0;
142
3.44M
              }
143
144
22.2M
            if (ccc == 0)
145
19.3M
              {
146
19.3M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
19.3M
                if (sortbuf_count > 1)
151
1.48M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.48M
                                                           sortbuf + sortbuf_count);
153
154
19.3M
                if (composer != NULL)
155
19.3M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
19.3M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
15.9M
                      {
178
18.7M
                        for (j = 1; j < sortbuf_count; )
179
2.78M
                          {
180
2.78M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.50M
                              {
182
1.50M
                                ucs4_t combined =
183
1.50M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.50M
                                if (combined)
185
1.47M
                                  {
186
1.47M
                                    size_t k;
187
188
1.47M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.18M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.70M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.47M
                                    sortbuf_count--;
193
1.47M
                                    continue;
194
1.47M
                                  }
195
1.50M
                              }
196
1.31M
                            j++;
197
1.31M
                          }
198
15.9M
                        if (s < s_end && sortbuf_count == 1)
199
12.4M
                          {
200
12.4M
                            ucs4_t combined =
201
12.4M
                              composer (sortbuf[0].code, uc);
202
12.4M
                            if (combined)
203
7.16k
                              {
204
7.16k
                                uc = combined;
205
7.16k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
7.16k
                                sortbuf_count = 0;
210
7.16k
                              }
211
12.4M
                          }
212
15.9M
                      }
213
19.3M
                  }
214
215
36.6M
                for (j = 0; j < sortbuf_count; j++)
216
17.2M
                  {
217
17.2M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
17.2M
                    if (length < allocated)
221
13.8M
                      {
222
13.8M
                        int ret =
223
13.8M
                          U_UCTOMB (result + length, muc, allocated - length);
224
13.8M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
13.8M
                        if (ret >= 0)
230
13.8M
                          {
231
13.8M
                            length += ret;
232
13.8M
                            goto done_appending;
233
13.8M
                          }
234
13.8M
                      }
235
3.45M
                    {
236
3.45M
                      size_t old_allocated = allocated;
237
3.45M
                      size_t new_allocated = 2 * old_allocated;
238
3.45M
                      if (new_allocated < 64)
239
3.44M
                        new_allocated = 64;
240
3.45M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
3.45M
                      {
243
3.45M
                        UNIT *larger_result;
244
3.45M
                        if (result == NULL)
245
3.44M
                          {
246
3.44M
                            larger_result =
247
3.44M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
3.44M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
3.44M
                          }
254
12.4k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
12.4k
                        else
266
12.4k
                          {
267
12.4k
                            larger_result =
268
12.4k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
12.4k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
12.4k
                          }
275
3.45M
                        result = larger_result;
276
3.45M
                        allocated = new_allocated;
277
3.45M
                        {
278
3.45M
                          int ret =
279
3.45M
                            U_UCTOMB (result + length, muc, allocated - length);
280
3.45M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
3.45M
                          if (ret < 0)
286
0
                            abort ();
287
3.45M
                          length += ret;
288
3.45M
                          goto done_appending;
289
3.45M
                        }
290
3.45M
                      }
291
3.45M
                    }
292
17.2M
                   done_appending: ;
293
17.2M
                  }
294
295
                /* sortbuf is now empty.  */
296
19.3M
                sortbuf_count = 0;
297
19.3M
              }
298
299
22.2M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
3.44M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
18.7M
            if (sortbuf_count == sortbuf_allocated)
305
1.06k
              {
306
1.06k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.06k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.06k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.06k
                new_sortbuf =
312
1.06k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.06k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.06k
                memcpy (new_sortbuf, sortbuf,
319
1.06k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.06k
                if (sortbuf != sortbuf_preallocated)
321
1.06k
                  free (sortbuf);
322
1.06k
                sortbuf = new_sortbuf;
323
1.06k
              }
324
18.7M
            sortbuf[sortbuf_count].code = uc;
325
18.7M
            sortbuf[sortbuf_count].ccc = ccc;
326
18.7M
            sortbuf_count++;
327
328
18.7M
            i++;
329
18.7M
          }
330
331
16.5M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
3.44M
          break;
334
335
13.1M
        s += count;
336
13.1M
      }
337
3.44M
  }
338
339
3.44M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
3.44M
  else if (result != resultbuf && length < allocated)
353
3.44M
    {
354
      /* Shrink the allocated memory if possible.  */
355
3.44M
      UNIT *memory;
356
357
3.44M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
3.44M
      if (memory != NULL)
359
3.44M
        result = memory;
360
3.44M
    }
361
362
3.44M
  if (sortbuf_count > 0)
363
0
    abort ();
364
3.44M
  if (sortbuf != sortbuf_preallocated)
365
3.44M
    free (sortbuf);
366
367
3.44M
  *lengthp = length;
368
3.44M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
3.44M
}
u32_normalize
Line
Count
Source
21
7.21M
{
22
7.21M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
7.21M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
7.21M
  UNIT *result;
27
7.21M
  size_t length;
28
7.21M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
7.21M
  #define SORTBUF_PREALLOCATED 64
31
7.21M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
7.21M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
7.21M
  size_t sortbuf_allocated;
34
7.21M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
7.21M
  if (resultbuf == NULL)
38
7.21M
    {
39
7.21M
      result = NULL;
40
7.21M
      allocated = 0;
41
7.21M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
7.21M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
7.21M
  sortbuf = sortbuf_preallocated;
51
7.21M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
7.21M
  sortbuf_count = 0;
53
54
7.21M
  {
55
7.21M
    const UNIT *s_end = s + n;
56
57
7.21M
    for (;;)
58
17.8M
      {
59
17.8M
        int count;
60
17.8M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
17.8M
        int decomposed_count;
62
17.8M
        int i;
63
64
17.8M
        if (s < s_end)
65
10.5M
          {
66
            /* Fetch the next character.  */
67
10.5M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
10.5M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
10.5M
            {
77
10.5M
              int curr;
78
79
21.2M
              for (curr = 0; curr < decomposed_count; )
80
10.6M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
10.6M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
10.6M
                  int curr_decomposed_count;
85
86
10.6M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
10.6M
                  if (curr_decomposed_count >= 0)
88
34.0k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
34.0k
                      int shift = curr_decomposed_count - 1;
93
94
34.0k
                      if (shift < 0)
95
0
                        abort ();
96
34.0k
                      if (shift > 0)
97
33.6k
                        {
98
33.6k
                          int j;
99
100
33.6k
                          decomposed_count += shift;
101
33.6k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
43.4k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
9.81k
                            decomposed[j + shift] = decomposed[j];
105
33.6k
                        }
106
101k
                      for (; shift >= 0; shift--)
107
67.6k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
34.0k
                    }
109
10.6M
                  else
110
10.6M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
10.6M
                      curr++;
113
10.6M
                    }
114
10.6M
                }
115
10.5M
            }
116
10.5M
          }
117
7.21M
        else
118
7.21M
          {
119
7.21M
            count = 0;
120
7.21M
            decomposed_count = 0;
121
7.21M
          }
122
123
17.8M
        i = 0;
124
17.8M
        for (;;)
125
28.4M
          {
126
28.4M
            ucs4_t uc;
127
28.4M
            int ccc;
128
129
28.4M
            if (s < s_end)
130
21.2M
              {
131
                /* Fetch the next character from the decomposition.  */
132
21.2M
                if (i == decomposed_count)
133
10.5M
                  break;
134
10.6M
                uc = decomposed[i];
135
10.6M
                ccc = uc_combining_class (uc);
136
10.6M
              }
137
7.21M
            else
138
7.21M
              {
139
                /* End of string reached.  */
140
7.21M
                uc = 0;
141
7.21M
                ccc = 0;
142
7.21M
              }
143
144
17.8M
            if (ccc == 0)
145
17.7M
              {
146
17.7M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
17.7M
                if (sortbuf_count > 1)
151
25.6k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
25.6k
                                                           sortbuf + sortbuf_count);
153
154
17.7M
                if (composer != NULL)
155
17.7M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
17.7M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
10.5M
                      {
178
10.6M
                        for (j = 1; j < sortbuf_count; )
179
99.8k
                          {
180
99.8k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
37.1k
                              {
182
37.1k
                                ucs4_t combined =
183
37.1k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
37.1k
                                if (combined)
185
22.6k
                                  {
186
22.6k
                                    size_t k;
187
188
22.6k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
36.3k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
13.6k
                                      sortbuf[k - 1] = sortbuf[k];
192
22.6k
                                    sortbuf_count--;
193
22.6k
                                    continue;
194
22.6k
                                  }
195
37.1k
                              }
196
77.1k
                            j++;
197
77.1k
                          }
198
10.5M
                        if (s < s_end && sortbuf_count == 1)
199
3.40M
                          {
200
3.40M
                            ucs4_t combined =
201
3.40M
                              composer (sortbuf[0].code, uc);
202
3.40M
                            if (combined)
203
9.08k
                              {
204
9.08k
                                uc = combined;
205
9.08k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
9.08k
                                sortbuf_count = 0;
210
9.08k
                              }
211
3.40M
                          }
212
10.5M
                      }
213
17.7M
                  }
214
215
28.3M
                for (j = 0; j < sortbuf_count; j++)
216
10.5M
                  {
217
10.5M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
10.5M
                    if (length < allocated)
221
3.48M
                      {
222
3.48M
                        int ret =
223
3.48M
                          U_UCTOMB (result + length, muc, allocated - length);
224
3.48M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
3.48M
                        if (ret >= 0)
230
3.48M
                          {
231
3.48M
                            length += ret;
232
3.48M
                            goto done_appending;
233
3.48M
                          }
234
3.48M
                      }
235
7.11M
                    {
236
7.11M
                      size_t old_allocated = allocated;
237
7.11M
                      size_t new_allocated = 2 * old_allocated;
238
7.11M
                      if (new_allocated < 64)
239
7.10M
                        new_allocated = 64;
240
7.11M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
7.11M
                      {
243
7.11M
                        UNIT *larger_result;
244
7.11M
                        if (result == NULL)
245
7.10M
                          {
246
7.10M
                            larger_result =
247
7.10M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
7.10M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
7.10M
                          }
254
2.95k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
2.95k
                        else
266
2.95k
                          {
267
2.95k
                            larger_result =
268
2.95k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
2.95k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
2.95k
                          }
275
7.11M
                        result = larger_result;
276
7.11M
                        allocated = new_allocated;
277
7.11M
                        {
278
7.11M
                          int ret =
279
7.11M
                            U_UCTOMB (result + length, muc, allocated - length);
280
7.11M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
7.11M
                          if (ret < 0)
286
0
                            abort ();
287
7.11M
                          length += ret;
288
7.11M
                          goto done_appending;
289
7.11M
                        }
290
7.11M
                      }
291
7.11M
                    }
292
10.5M
                   done_appending: ;
293
10.5M
                  }
294
295
                /* sortbuf is now empty.  */
296
17.7M
                sortbuf_count = 0;
297
17.7M
              }
298
299
17.8M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
7.21M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
10.6M
            if (sortbuf_count == sortbuf_allocated)
305
826
              {
306
826
                struct ucs4_with_ccc *new_sortbuf;
307
308
826
                sortbuf_allocated = 2 * sortbuf_allocated;
309
826
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
826
                new_sortbuf =
312
826
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
826
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
826
                memcpy (new_sortbuf, sortbuf,
319
826
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
826
                if (sortbuf != sortbuf_preallocated)
321
826
                  free (sortbuf);
322
826
                sortbuf = new_sortbuf;
323
826
              }
324
10.6M
            sortbuf[sortbuf_count].code = uc;
325
10.6M
            sortbuf[sortbuf_count].ccc = ccc;
326
10.6M
            sortbuf_count++;
327
328
10.6M
            i++;
329
10.6M
          }
330
331
17.8M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
7.21M
          break;
334
335
10.5M
        s += count;
336
10.5M
      }
337
7.21M
  }
338
339
7.21M
  if (length == 0)
340
112k
    {
341
112k
      if (result == NULL)
342
112k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
112k
          result = (UNIT *) malloc (1);
345
112k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
112k
        }
351
112k
    }
352
7.10M
  else if (result != resultbuf && length < allocated)
353
7.10M
    {
354
      /* Shrink the allocated memory if possible.  */
355
7.10M
      UNIT *memory;
356
357
7.10M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
7.10M
      if (memory != NULL)
359
7.10M
        result = memory;
360
7.10M
    }
361
362
7.21M
  if (sortbuf_count > 0)
363
0
    abort ();
364
7.21M
  if (sortbuf != sortbuf_preallocated)
365
7.21M
    free (sortbuf);
366
367
7.21M
  *lengthp = length;
368
7.21M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
7.21M
}