Coverage Report

Created: 2025-08-23 06:26

/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source (jump to first uncovered line)
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
19.3M
{
22
19.3M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
19.3M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
19.3M
  UNIT *result;
27
19.3M
  size_t length;
28
19.3M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
19.3M
  #define SORTBUF_PREALLOCATED 64
31
19.3M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
19.3M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
19.3M
  size_t sortbuf_allocated;
34
19.3M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
19.3M
  if (resultbuf == NULL)
38
19.3M
    {
39
19.3M
      result = NULL;
40
19.3M
      allocated = 0;
41
19.3M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
19.3M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
19.3M
  sortbuf = sortbuf_preallocated;
51
19.3M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
19.3M
  sortbuf_count = 0;
53
54
19.3M
  {
55
19.3M
    const UNIT *s_end = s + n;
56
57
19.3M
    for (;;)
58
56.7M
      {
59
56.7M
        int count;
60
56.7M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
56.7M
        int decomposed_count;
62
56.7M
        int i;
63
64
56.7M
        if (s < s_end)
65
37.4M
          {
66
            /* Fetch the next character.  */
67
37.4M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
37.4M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
37.4M
            {
77
37.4M
              int curr;
78
79
84.1M
              for (curr = 0; curr < decomposed_count; )
80
46.6M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
46.6M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
46.6M
                  int curr_decomposed_count;
85
86
46.6M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
46.6M
                  if (curr_decomposed_count >= 0)
88
3.09M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.09M
                      int shift = curr_decomposed_count - 1;
93
94
3.09M
                      if (shift < 0)
95
0
                        abort ();
96
3.09M
                      if (shift > 0)
97
2.49M
                        {
98
2.49M
                          int j;
99
100
2.49M
                          decomposed_count += shift;
101
2.49M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.51M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
28.7k
                            decomposed[j + shift] = decomposed[j];
105
2.49M
                        }
106
12.2M
                      for (; shift >= 0; shift--)
107
9.20M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.09M
                    }
109
43.5M
                  else
110
43.5M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
43.5M
                      curr++;
113
43.5M
                    }
114
46.6M
                }
115
37.4M
            }
116
37.4M
          }
117
19.3M
        else
118
19.3M
          {
119
19.3M
            count = 0;
120
19.3M
            decomposed_count = 0;
121
19.3M
          }
122
123
56.7M
        i = 0;
124
56.7M
        for (;;)
125
100M
          {
126
100M
            ucs4_t uc;
127
100M
            int ccc;
128
129
100M
            if (s < s_end)
130
81.0M
              {
131
                /* Fetch the next character from the decomposition.  */
132
81.0M
                if (i == decomposed_count)
133
37.4M
                  break;
134
43.5M
                uc = decomposed[i];
135
43.5M
                ccc = uc_combining_class (uc);
136
43.5M
              }
137
19.3M
            else
138
19.3M
              {
139
                /* End of string reached.  */
140
19.3M
                uc = 0;
141
19.3M
                ccc = 0;
142
19.3M
              }
143
144
62.9M
            if (ccc == 0)
145
60.2M
              {
146
60.2M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
60.2M
                if (sortbuf_count > 1)
151
1.36M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.36M
                                                           sortbuf + sortbuf_count);
153
154
60.2M
                if (composer != NULL)
155
60.2M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
60.2M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
40.9M
                      {
178
43.5M
                        for (j = 1; j < sortbuf_count; )
179
2.61M
                          {
180
2.61M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.39M
                              {
182
1.39M
                                ucs4_t combined =
183
1.39M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.39M
                                if (combined)
185
1.34M
                                  {
186
1.34M
                                    size_t k;
187
188
1.34M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
2.94M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.59M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.34M
                                    sortbuf_count--;
193
1.34M
                                    continue;
194
1.34M
                                  }
195
1.39M
                              }
196
1.26M
                            j++;
197
1.26M
                          }
198
40.9M
                        if (s < s_end && sortbuf_count == 1)
199
21.7M
                          {
200
21.7M
                            ucs4_t combined =
201
21.7M
                              composer (sortbuf[0].code, uc);
202
21.7M
                            if (combined)
203
18.3k
                              {
204
18.3k
                                uc = combined;
205
18.3k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
18.3k
                                sortbuf_count = 0;
210
18.3k
                              }
211
21.7M
                          }
212
40.9M
                      }
213
60.2M
                  }
214
215
102M
                for (j = 0; j < sortbuf_count; j++)
216
42.2M
                  {
217
42.2M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
42.2M
                    if (length < allocated)
221
23.0M
                      {
222
23.0M
                        int ret =
223
23.0M
                          U_UCTOMB (result + length, muc, allocated - length);
224
23.0M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
23.0M
                        if (ret >= 0)
230
23.0M
                          {
231
23.0M
                            length += ret;
232
23.0M
                            goto done_appending;
233
23.0M
                          }
234
23.0M
                      }
235
19.1M
                    {
236
19.1M
                      size_t old_allocated = allocated;
237
19.1M
                      size_t new_allocated = 2 * old_allocated;
238
19.1M
                      if (new_allocated < 64)
239
19.1M
                        new_allocated = 64;
240
19.1M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
19.1M
                      {
243
19.1M
                        UNIT *larger_result;
244
19.1M
                        if (result == NULL)
245
19.1M
                          {
246
19.1M
                            larger_result =
247
19.1M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
19.1M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
19.1M
                          }
254
17.5k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
17.5k
                        else
266
17.5k
                          {
267
17.5k
                            larger_result =
268
17.5k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
17.5k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
17.5k
                          }
275
19.1M
                        result = larger_result;
276
19.1M
                        allocated = new_allocated;
277
19.1M
                        {
278
19.1M
                          int ret =
279
19.1M
                            U_UCTOMB (result + length, muc, allocated - length);
280
19.1M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
19.1M
                          if (ret < 0)
286
0
                            abort ();
287
19.1M
                          length += ret;
288
19.1M
                          goto done_appending;
289
19.1M
                        }
290
19.1M
                      }
291
19.1M
                    }
292
42.2M
                   done_appending: ;
293
42.2M
                  }
294
295
                /* sortbuf is now empty.  */
296
60.2M
                sortbuf_count = 0;
297
60.2M
              }
298
299
62.9M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
19.3M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
43.5M
            if (sortbuf_count == sortbuf_allocated)
305
1.90k
              {
306
1.90k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.90k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.90k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.90k
                new_sortbuf =
312
1.90k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.90k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.90k
                memcpy (new_sortbuf, sortbuf,
319
1.90k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.90k
                if (sortbuf != sortbuf_preallocated)
321
611
                  free (sortbuf);
322
1.90k
                sortbuf = new_sortbuf;
323
1.90k
              }
324
43.5M
            sortbuf[sortbuf_count].code = uc;
325
43.5M
            sortbuf[sortbuf_count].ccc = ccc;
326
43.5M
            sortbuf_count++;
327
328
43.5M
            i++;
329
43.5M
          }
330
331
56.7M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
19.3M
          break;
334
335
37.4M
        s += count;
336
37.4M
      }
337
19.3M
  }
338
339
19.3M
  if (length == 0)
340
169k
    {
341
169k
      if (result == NULL)
342
169k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
169k
          result = (UNIT *) malloc (1);
345
169k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
169k
        }
351
169k
    }
352
19.1M
  else if (result != resultbuf && length < allocated)
353
19.1M
    {
354
      /* Shrink the allocated memory if possible.  */
355
19.1M
      UNIT *memory;
356
357
19.1M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
19.1M
      if (memory != NULL)
359
19.1M
        result = memory;
360
19.1M
    }
361
362
19.3M
  if (sortbuf_count > 0)
363
0
    abort ();
364
19.3M
  if (sortbuf != sortbuf_preallocated)
365
1.29k
    free (sortbuf);
366
367
19.3M
  *lengthp = length;
368
19.3M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
19.3M
}
u8_normalize
Line
Count
Source
21
6.26M
{
22
6.26M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
6.26M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
6.26M
  UNIT *result;
27
6.26M
  size_t length;
28
6.26M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
6.26M
  #define SORTBUF_PREALLOCATED 64
31
6.26M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
6.26M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
6.26M
  size_t sortbuf_allocated;
34
6.26M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
6.26M
  if (resultbuf == NULL)
38
6.26M
    {
39
6.26M
      result = NULL;
40
6.26M
      allocated = 0;
41
6.26M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
6.26M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
6.26M
  sortbuf = sortbuf_preallocated;
51
6.26M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
6.26M
  sortbuf_count = 0;
53
54
6.26M
  {
55
6.26M
    const UNIT *s_end = s + n;
56
57
6.26M
    for (;;)
58
25.2M
      {
59
25.2M
        int count;
60
25.2M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
25.2M
        int decomposed_count;
62
25.2M
        int i;
63
64
25.2M
        if (s < s_end)
65
18.9M
          {
66
            /* Fetch the next character.  */
67
18.9M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
18.9M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
18.9M
            {
77
18.9M
              int curr;
78
79
47.0M
              for (curr = 0; curr < decomposed_count; )
80
28.1M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
28.1M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
28.1M
                  int curr_decomposed_count;
85
86
28.1M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
28.1M
                  if (curr_decomposed_count >= 0)
88
3.05M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.05M
                      int shift = curr_decomposed_count - 1;
93
94
3.05M
                      if (shift < 0)
95
0
                        abort ();
96
3.05M
                      if (shift > 0)
97
2.45M
                        {
98
2.45M
                          int j;
99
100
2.45M
                          decomposed_count += shift;
101
2.45M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.47M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
17.6k
                            decomposed[j + shift] = decomposed[j];
105
2.45M
                        }
106
12.1M
                      for (; shift >= 0; shift--)
107
9.13M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.05M
                    }
109
25.0M
                  else
110
25.0M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
25.0M
                      curr++;
113
25.0M
                    }
114
28.1M
                }
115
18.9M
            }
116
18.9M
          }
117
6.26M
        else
118
6.26M
          {
119
6.26M
            count = 0;
120
6.26M
            decomposed_count = 0;
121
6.26M
          }
122
123
25.2M
        i = 0;
124
25.2M
        for (;;)
125
50.3M
          {
126
50.3M
            ucs4_t uc;
127
50.3M
            int ccc;
128
129
50.3M
            if (s < s_end)
130
44.0M
              {
131
                /* Fetch the next character from the decomposition.  */
132
44.0M
                if (i == decomposed_count)
133
18.9M
                  break;
134
25.0M
                uc = decomposed[i];
135
25.0M
                ccc = uc_combining_class (uc);
136
25.0M
              }
137
6.26M
            else
138
6.26M
              {
139
                /* End of string reached.  */
140
6.26M
                uc = 0;
141
6.26M
                ccc = 0;
142
6.26M
              }
143
144
31.3M
            if (ccc == 0)
145
28.7M
              {
146
28.7M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
28.7M
                if (sortbuf_count > 1)
151
1.33M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.33M
                                                           sortbuf + sortbuf_count);
153
154
28.7M
                if (composer != NULL)
155
28.7M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
28.7M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
22.5M
                      {
178
25.0M
                        for (j = 1; j < sortbuf_count; )
179
2.51M
                          {
180
2.51M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.35M
                              {
182
1.35M
                                ucs4_t combined =
183
1.35M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.35M
                                if (combined)
185
1.32M
                                  {
186
1.32M
                                    size_t k;
187
188
1.32M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
2.90M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.57M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.32M
                                    sortbuf_count--;
193
1.32M
                                    continue;
194
1.32M
                                  }
195
1.35M
                              }
196
1.18M
                            j++;
197
1.18M
                          }
198
22.5M
                        if (s < s_end && sortbuf_count == 1)
199
16.2M
                          {
200
16.2M
                            ucs4_t combined =
201
16.2M
                              composer (sortbuf[0].code, uc);
202
16.2M
                            if (combined)
203
8.01k
                              {
204
8.01k
                                uc = combined;
205
8.01k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
8.01k
                                sortbuf_count = 0;
210
8.01k
                              }
211
16.2M
                          }
212
22.5M
                      }
213
28.7M
                  }
214
215
52.5M
                for (j = 0; j < sortbuf_count; j++)
216
23.7M
                  {
217
23.7M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
23.7M
                    if (length < allocated)
221
17.4M
                      {
222
17.4M
                        int ret =
223
17.4M
                          U_UCTOMB (result + length, muc, allocated - length);
224
17.4M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
17.4M
                        if (ret >= 0)
230
17.4M
                          {
231
17.4M
                            length += ret;
232
17.4M
                            goto done_appending;
233
17.4M
                          }
234
17.4M
                      }
235
6.28M
                    {
236
6.28M
                      size_t old_allocated = allocated;
237
6.28M
                      size_t new_allocated = 2 * old_allocated;
238
6.28M
                      if (new_allocated < 64)
239
6.26M
                        new_allocated = 64;
240
6.28M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
6.28M
                      {
243
6.28M
                        UNIT *larger_result;
244
6.28M
                        if (result == NULL)
245
6.26M
                          {
246
6.26M
                            larger_result =
247
6.26M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
6.26M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
6.26M
                          }
254
14.6k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
14.6k
                        else
266
14.6k
                          {
267
14.6k
                            larger_result =
268
14.6k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
14.6k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
14.6k
                          }
275
6.28M
                        result = larger_result;
276
6.28M
                        allocated = new_allocated;
277
6.28M
                        {
278
6.28M
                          int ret =
279
6.28M
                            U_UCTOMB (result + length, muc, allocated - length);
280
6.28M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
6.28M
                          if (ret < 0)
286
0
                            abort ();
287
6.28M
                          length += ret;
288
6.28M
                          goto done_appending;
289
6.28M
                        }
290
6.28M
                      }
291
6.28M
                    }
292
23.7M
                   done_appending: ;
293
23.7M
                  }
294
295
                /* sortbuf is now empty.  */
296
28.7M
                sortbuf_count = 0;
297
28.7M
              }
298
299
31.3M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
6.26M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
25.0M
            if (sortbuf_count == sortbuf_allocated)
305
1.07k
              {
306
1.07k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.07k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.07k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.07k
                new_sortbuf =
312
1.07k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.07k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.07k
                memcpy (new_sortbuf, sortbuf,
319
1.07k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.07k
                if (sortbuf != sortbuf_preallocated)
321
611
                  free (sortbuf);
322
1.07k
                sortbuf = new_sortbuf;
323
1.07k
              }
324
25.0M
            sortbuf[sortbuf_count].code = uc;
325
25.0M
            sortbuf[sortbuf_count].ccc = ccc;
326
25.0M
            sortbuf_count++;
327
328
25.0M
            i++;
329
25.0M
          }
330
331
25.2M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
6.26M
          break;
334
335
18.9M
        s += count;
336
18.9M
      }
337
6.26M
  }
338
339
6.26M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
6.26M
  else if (result != resultbuf && length < allocated)
353
6.26M
    {
354
      /* Shrink the allocated memory if possible.  */
355
6.26M
      UNIT *memory;
356
357
6.26M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
6.26M
      if (memory != NULL)
359
6.26M
        result = memory;
360
6.26M
    }
361
362
6.26M
  if (sortbuf_count > 0)
363
0
    abort ();
364
6.26M
  if (sortbuf != sortbuf_preallocated)
365
467
    free (sortbuf);
366
367
6.26M
  *lengthp = length;
368
6.26M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
6.26M
}
u32_normalize
Line
Count
Source
21
13.0M
{
22
13.0M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
13.0M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
13.0M
  UNIT *result;
27
13.0M
  size_t length;
28
13.0M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
13.0M
  #define SORTBUF_PREALLOCATED 64
31
13.0M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
13.0M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
13.0M
  size_t sortbuf_allocated;
34
13.0M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
13.0M
  if (resultbuf == NULL)
38
13.0M
    {
39
13.0M
      result = NULL;
40
13.0M
      allocated = 0;
41
13.0M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
13.0M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
13.0M
  sortbuf = sortbuf_preallocated;
51
13.0M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
13.0M
  sortbuf_count = 0;
53
54
13.0M
  {
55
13.0M
    const UNIT *s_end = s + n;
56
57
13.0M
    for (;;)
58
31.5M
      {
59
31.5M
        int count;
60
31.5M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
31.5M
        int decomposed_count;
62
31.5M
        int i;
63
64
31.5M
        if (s < s_end)
65
18.4M
          {
66
            /* Fetch the next character.  */
67
18.4M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
18.4M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
18.4M
            {
77
18.4M
              int curr;
78
79
37.0M
              for (curr = 0; curr < decomposed_count; )
80
18.5M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
18.5M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
18.5M
                  int curr_decomposed_count;
85
86
18.5M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
18.5M
                  if (curr_decomposed_count >= 0)
88
36.1k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
36.1k
                      int shift = curr_decomposed_count - 1;
93
94
36.1k
                      if (shift < 0)
95
0
                        abort ();
96
36.1k
                      if (shift > 0)
97
35.7k
                        {
98
35.7k
                          int j;
99
100
35.7k
                          decomposed_count += shift;
101
35.7k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
46.8k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
11.1k
                            decomposed[j + shift] = decomposed[j];
105
35.7k
                        }
106
108k
                      for (; shift >= 0; shift--)
107
71.9k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
36.1k
                    }
109
18.5M
                  else
110
18.5M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
18.5M
                      curr++;
113
18.5M
                    }
114
18.5M
                }
115
18.4M
            }
116
18.4M
          }
117
13.0M
        else
118
13.0M
          {
119
13.0M
            count = 0;
120
13.0M
            decomposed_count = 0;
121
13.0M
          }
122
123
31.5M
        i = 0;
124
31.5M
        for (;;)
125
50.0M
          {
126
50.0M
            ucs4_t uc;
127
50.0M
            int ccc;
128
129
50.0M
            if (s < s_end)
130
37.0M
              {
131
                /* Fetch the next character from the decomposition.  */
132
37.0M
                if (i == decomposed_count)
133
18.4M
                  break;
134
18.5M
                uc = decomposed[i];
135
18.5M
                ccc = uc_combining_class (uc);
136
18.5M
              }
137
13.0M
            else
138
13.0M
              {
139
                /* End of string reached.  */
140
13.0M
                uc = 0;
141
13.0M
                ccc = 0;
142
13.0M
              }
143
144
31.5M
            if (ccc == 0)
145
31.4M
              {
146
31.4M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
31.4M
                if (sortbuf_count > 1)
151
27.2k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
27.2k
                                                           sortbuf + sortbuf_count);
153
154
31.4M
                if (composer != NULL)
155
31.4M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
31.4M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
18.4M
                      {
178
18.5M
                        for (j = 1; j < sortbuf_count; )
179
105k
                          {
180
105k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
40.6k
                              {
182
40.6k
                                ucs4_t combined =
183
40.6k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
40.6k
                                if (combined)
185
23.6k
                                  {
186
23.6k
                                    size_t k;
187
188
23.6k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
37.5k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
13.9k
                                      sortbuf[k - 1] = sortbuf[k];
192
23.6k
                                    sortbuf_count--;
193
23.6k
                                    continue;
194
23.6k
                                  }
195
40.6k
                              }
196
81.5k
                            j++;
197
81.5k
                          }
198
18.4M
                        if (s < s_end && sortbuf_count == 1)
199
5.51M
                          {
200
5.51M
                            ucs4_t combined =
201
5.51M
                              composer (sortbuf[0].code, uc);
202
5.51M
                            if (combined)
203
10.2k
                              {
204
10.2k
                                uc = combined;
205
10.2k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
10.2k
                                sortbuf_count = 0;
210
10.2k
                              }
211
5.51M
                          }
212
18.4M
                      }
213
31.4M
                  }
214
215
49.9M
                for (j = 0; j < sortbuf_count; j++)
216
18.4M
                  {
217
18.4M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
18.4M
                    if (length < allocated)
221
5.59M
                      {
222
5.59M
                        int ret =
223
5.59M
                          U_UCTOMB (result + length, muc, allocated - length);
224
5.59M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
5.59M
                        if (ret >= 0)
230
5.59M
                          {
231
5.59M
                            length += ret;
232
5.59M
                            goto done_appending;
233
5.59M
                          }
234
5.59M
                      }
235
12.8M
                    {
236
12.8M
                      size_t old_allocated = allocated;
237
12.8M
                      size_t new_allocated = 2 * old_allocated;
238
12.8M
                      if (new_allocated < 64)
239
12.8M
                        new_allocated = 64;
240
12.8M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
12.8M
                      {
243
12.8M
                        UNIT *larger_result;
244
12.8M
                        if (result == NULL)
245
12.8M
                          {
246
12.8M
                            larger_result =
247
12.8M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
12.8M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
12.8M
                          }
254
2.83k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
2.83k
                        else
266
2.83k
                          {
267
2.83k
                            larger_result =
268
2.83k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
2.83k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
2.83k
                          }
275
12.8M
                        result = larger_result;
276
12.8M
                        allocated = new_allocated;
277
12.8M
                        {
278
12.8M
                          int ret =
279
12.8M
                            U_UCTOMB (result + length, muc, allocated - length);
280
12.8M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
12.8M
                          if (ret < 0)
286
0
                            abort ();
287
12.8M
                          length += ret;
288
12.8M
                          goto done_appending;
289
12.8M
                        }
290
12.8M
                      }
291
12.8M
                    }
292
18.4M
                   done_appending: ;
293
18.4M
                  }
294
295
                /* sortbuf is now empty.  */
296
31.4M
                sortbuf_count = 0;
297
31.4M
              }
298
299
31.5M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
13.0M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
18.5M
            if (sortbuf_count == sortbuf_allocated)
305
830
              {
306
830
                struct ucs4_with_ccc *new_sortbuf;
307
308
830
                sortbuf_allocated = 2 * sortbuf_allocated;
309
830
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
830
                new_sortbuf =
312
830
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
830
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
830
                memcpy (new_sortbuf, sortbuf,
319
830
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
830
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
830
                sortbuf = new_sortbuf;
323
830
              }
324
18.5M
            sortbuf[sortbuf_count].code = uc;
325
18.5M
            sortbuf[sortbuf_count].ccc = ccc;
326
18.5M
            sortbuf_count++;
327
328
18.5M
            i++;
329
18.5M
          }
330
331
31.5M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
13.0M
          break;
334
335
18.4M
        s += count;
336
18.4M
      }
337
13.0M
  }
338
339
13.0M
  if (length == 0)
340
169k
    {
341
169k
      if (result == NULL)
342
169k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
169k
          result = (UNIT *) malloc (1);
345
169k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
169k
        }
351
169k
    }
352
12.8M
  else if (result != resultbuf && length < allocated)
353
12.8M
    {
354
      /* Shrink the allocated memory if possible.  */
355
12.8M
      UNIT *memory;
356
357
12.8M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
12.8M
      if (memory != NULL)
359
12.8M
        result = memory;
360
12.8M
    }
361
362
13.0M
  if (sortbuf_count > 0)
363
0
    abort ();
364
13.0M
  if (sortbuf != sortbuf_preallocated)
365
830
    free (sortbuf);
366
367
13.0M
  *lengthp = length;
368
13.0M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
13.0M
}