Coverage Report

Created: 2025-09-04 07:15

/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source (jump to first uncovered line)
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
19.1M
{
22
19.1M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
19.1M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
19.1M
  UNIT *result;
27
19.1M
  size_t length;
28
19.1M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
19.1M
  #define SORTBUF_PREALLOCATED 64
31
19.1M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
19.1M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
19.1M
  size_t sortbuf_allocated;
34
19.1M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
19.1M
  if (resultbuf == NULL)
38
19.1M
    {
39
19.1M
      result = NULL;
40
19.1M
      allocated = 0;
41
19.1M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
19.1M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
19.1M
  sortbuf = sortbuf_preallocated;
51
19.1M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
19.1M
  sortbuf_count = 0;
53
54
19.1M
  {
55
19.1M
    const UNIT *s_end = s + n;
56
57
19.1M
    for (;;)
58
56.6M
      {
59
56.6M
        int count;
60
56.6M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
56.6M
        int decomposed_count;
62
56.6M
        int i;
63
64
56.6M
        if (s < s_end)
65
37.5M
          {
66
            /* Fetch the next character.  */
67
37.5M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
37.5M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
37.5M
            {
77
37.5M
              int curr;
78
79
84.4M
              for (curr = 0; curr < decomposed_count; )
80
46.9M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
46.9M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
46.9M
                  int curr_decomposed_count;
85
86
46.9M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
46.9M
                  if (curr_decomposed_count >= 0)
88
3.26M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.26M
                      int shift = curr_decomposed_count - 1;
93
94
3.26M
                      if (shift < 0)
95
0
                        abort ();
96
3.26M
                      if (shift > 0)
97
2.61M
                        {
98
2.61M
                          int j;
99
100
2.61M
                          decomposed_count += shift;
101
2.61M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.63M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
28.6k
                            decomposed[j + shift] = decomposed[j];
105
2.61M
                        }
106
12.6M
                      for (; shift >= 0; shift--)
107
9.42M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.26M
                    }
109
43.6M
                  else
110
43.6M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
43.6M
                      curr++;
113
43.6M
                    }
114
46.9M
                }
115
37.5M
            }
116
37.5M
          }
117
19.1M
        else
118
19.1M
          {
119
19.1M
            count = 0;
120
19.1M
            decomposed_count = 0;
121
19.1M
          }
122
123
56.6M
        i = 0;
124
56.6M
        for (;;)
125
100M
          {
126
100M
            ucs4_t uc;
127
100M
            int ccc;
128
129
100M
            if (s < s_end)
130
81.2M
              {
131
                /* Fetch the next character from the decomposition.  */
132
81.2M
                if (i == decomposed_count)
133
37.5M
                  break;
134
43.6M
                uc = decomposed[i];
135
43.6M
                ccc = uc_combining_class (uc);
136
43.6M
              }
137
19.1M
            else
138
19.1M
              {
139
                /* End of string reached.  */
140
19.1M
                uc = 0;
141
19.1M
                ccc = 0;
142
19.1M
              }
143
144
62.8M
            if (ccc == 0)
145
59.9M
              {
146
59.9M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
59.9M
                if (sortbuf_count > 1)
151
1.41M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.41M
                                                           sortbuf + sortbuf_count);
153
154
59.9M
                if (composer != NULL)
155
59.9M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
59.9M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
40.8M
                      {
178
43.6M
                        for (j = 1; j < sortbuf_count; )
179
2.81M
                          {
180
2.81M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.44M
                              {
182
1.44M
                                ucs4_t combined =
183
1.44M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.44M
                                if (combined)
185
1.39M
                                  {
186
1.39M
                                    size_t k;
187
188
1.39M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.08M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.69M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.39M
                                    sortbuf_count--;
193
1.39M
                                    continue;
194
1.39M
                                  }
195
1.44M
                              }
196
1.41M
                            j++;
197
1.41M
                          }
198
40.8M
                        if (s < s_end && sortbuf_count == 1)
199
21.8M
                          {
200
21.8M
                            ucs4_t combined =
201
21.8M
                              composer (sortbuf[0].code, uc);
202
21.8M
                            if (combined)
203
17.0k
                              {
204
17.0k
                                uc = combined;
205
17.0k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
17.0k
                                sortbuf_count = 0;
210
17.0k
                              }
211
21.8M
                          }
212
40.8M
                      }
213
59.9M
                  }
214
215
102M
                for (j = 0; j < sortbuf_count; j++)
216
42.2M
                  {
217
42.2M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
42.2M
                    if (length < allocated)
221
23.3M
                      {
222
23.3M
                        int ret =
223
23.3M
                          U_UCTOMB (result + length, muc, allocated - length);
224
23.3M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
23.3M
                        if (ret >= 0)
230
23.2M
                          {
231
23.2M
                            length += ret;
232
23.2M
                            goto done_appending;
233
23.2M
                          }
234
23.3M
                      }
235
18.9M
                    {
236
18.9M
                      size_t old_allocated = allocated;
237
18.9M
                      size_t new_allocated = 2 * old_allocated;
238
18.9M
                      if (new_allocated < 64)
239
18.9M
                        new_allocated = 64;
240
18.9M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
18.9M
                      {
243
18.9M
                        UNIT *larger_result;
244
18.9M
                        if (result == NULL)
245
18.9M
                          {
246
18.9M
                            larger_result =
247
18.9M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
18.9M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
18.9M
                          }
254
16.6k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
16.6k
                        else
266
16.6k
                          {
267
16.6k
                            larger_result =
268
16.6k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
16.6k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
16.6k
                          }
275
18.9M
                        result = larger_result;
276
18.9M
                        allocated = new_allocated;
277
18.9M
                        {
278
18.9M
                          int ret =
279
18.9M
                            U_UCTOMB (result + length, muc, allocated - length);
280
18.9M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
18.9M
                          if (ret < 0)
286
0
                            abort ();
287
18.9M
                          length += ret;
288
18.9M
                          goto done_appending;
289
18.9M
                        }
290
18.9M
                      }
291
18.9M
                    }
292
42.2M
                   done_appending: ;
293
42.2M
                  }
294
295
                /* sortbuf is now empty.  */
296
59.9M
                sortbuf_count = 0;
297
59.9M
              }
298
299
62.8M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
19.1M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
43.6M
            if (sortbuf_count == sortbuf_allocated)
305
1.94k
              {
306
1.94k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.94k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.94k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.94k
                new_sortbuf =
312
1.94k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.94k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.94k
                memcpy (new_sortbuf, sortbuf,
319
1.94k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.94k
                if (sortbuf != sortbuf_preallocated)
321
634
                  free (sortbuf);
322
1.94k
                sortbuf = new_sortbuf;
323
1.94k
              }
324
43.6M
            sortbuf[sortbuf_count].code = uc;
325
43.6M
            sortbuf[sortbuf_count].ccc = ccc;
326
43.6M
            sortbuf_count++;
327
328
43.6M
            i++;
329
43.6M
          }
330
331
56.6M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
19.1M
          break;
334
335
37.5M
        s += count;
336
37.5M
      }
337
19.1M
  }
338
339
19.1M
  if (length == 0)
340
196k
    {
341
196k
      if (result == NULL)
342
196k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
196k
          result = (UNIT *) malloc (1);
345
196k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
196k
        }
351
196k
    }
352
18.9M
  else if (result != resultbuf && length < allocated)
353
18.9M
    {
354
      /* Shrink the allocated memory if possible.  */
355
18.9M
      UNIT *memory;
356
357
18.9M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
18.9M
      if (memory != NULL)
359
18.9M
        result = memory;
360
18.9M
    }
361
362
19.1M
  if (sortbuf_count > 0)
363
0
    abort ();
364
19.1M
  if (sortbuf != sortbuf_preallocated)
365
1.30k
    free (sortbuf);
366
367
19.1M
  *lengthp = length;
368
19.1M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
19.1M
}
u8_normalize
Line
Count
Source
21
6.13M
{
22
6.13M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
6.13M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
6.13M
  UNIT *result;
27
6.13M
  size_t length;
28
6.13M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
6.13M
  #define SORTBUF_PREALLOCATED 64
31
6.13M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
6.13M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
6.13M
  size_t sortbuf_allocated;
34
6.13M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
6.13M
  if (resultbuf == NULL)
38
6.13M
    {
39
6.13M
      result = NULL;
40
6.13M
      allocated = 0;
41
6.13M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
6.13M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
6.13M
  sortbuf = sortbuf_preallocated;
51
6.13M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
6.13M
  sortbuf_count = 0;
53
54
6.13M
  {
55
6.13M
    const UNIT *s_end = s + n;
56
57
6.13M
    for (;;)
58
25.0M
      {
59
25.0M
        int count;
60
25.0M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
25.0M
        int decomposed_count;
62
25.0M
        int i;
63
64
25.0M
        if (s < s_end)
65
18.9M
          {
66
            /* Fetch the next character.  */
67
18.9M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
18.9M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
18.9M
            {
77
18.9M
              int curr;
78
79
47.2M
              for (curr = 0; curr < decomposed_count; )
80
28.2M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
28.2M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
28.2M
                  int curr_decomposed_count;
85
86
28.2M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
28.2M
                  if (curr_decomposed_count >= 0)
88
3.23M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.23M
                      int shift = curr_decomposed_count - 1;
93
94
3.23M
                      if (shift < 0)
95
0
                        abort ();
96
3.23M
                      if (shift > 0)
97
2.57M
                        {
98
2.57M
                          int j;
99
100
2.57M
                          decomposed_count += shift;
101
2.57M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.59M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
18.4k
                            decomposed[j + shift] = decomposed[j];
105
2.57M
                        }
106
12.5M
                      for (; shift >= 0; shift--)
107
9.35M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.23M
                    }
109
25.0M
                  else
110
25.0M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
25.0M
                      curr++;
113
25.0M
                    }
114
28.2M
                }
115
18.9M
            }
116
18.9M
          }
117
6.13M
        else
118
6.13M
          {
119
6.13M
            count = 0;
120
6.13M
            decomposed_count = 0;
121
6.13M
          }
122
123
25.0M
        i = 0;
124
25.0M
        for (;;)
125
50.1M
          {
126
50.1M
            ucs4_t uc;
127
50.1M
            int ccc;
128
129
50.1M
            if (s < s_end)
130
43.9M
              {
131
                /* Fetch the next character from the decomposition.  */
132
43.9M
                if (i == decomposed_count)
133
18.9M
                  break;
134
25.0M
                uc = decomposed[i];
135
25.0M
                ccc = uc_combining_class (uc);
136
25.0M
              }
137
6.13M
            else
138
6.13M
              {
139
                /* End of string reached.  */
140
6.13M
                uc = 0;
141
6.13M
                ccc = 0;
142
6.13M
              }
143
144
31.1M
            if (ccc == 0)
145
28.4M
              {
146
28.4M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
28.4M
                if (sortbuf_count > 1)
151
1.38M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.38M
                                                           sortbuf + sortbuf_count);
153
154
28.4M
                if (composer != NULL)
155
28.4M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
28.4M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
22.3M
                      {
178
25.0M
                        for (j = 1; j < sortbuf_count; )
179
2.71M
                          {
180
2.71M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.40M
                              {
182
1.40M
                                ucs4_t combined =
183
1.40M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.40M
                                if (combined)
185
1.37M
                                  {
186
1.37M
                                    size_t k;
187
188
1.37M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.04M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.67M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.37M
                                    sortbuf_count--;
193
1.37M
                                    continue;
194
1.37M
                                  }
195
1.40M
                              }
196
1.33M
                            j++;
197
1.33M
                          }
198
22.3M
                        if (s < s_end && sortbuf_count == 1)
199
16.1M
                          {
200
16.1M
                            ucs4_t combined =
201
16.1M
                              composer (sortbuf[0].code, uc);
202
16.1M
                            if (combined)
203
7.38k
                              {
204
7.38k
                                uc = combined;
205
7.38k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
7.38k
                                sortbuf_count = 0;
210
7.38k
                              }
211
16.1M
                          }
212
22.3M
                      }
213
28.4M
                  }
214
215
52.1M
                for (j = 0; j < sortbuf_count; j++)
216
23.6M
                  {
217
23.6M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
23.6M
                    if (length < allocated)
221
17.5M
                      {
222
17.5M
                        int ret =
223
17.5M
                          U_UCTOMB (result + length, muc, allocated - length);
224
17.5M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
17.5M
                        if (ret >= 0)
230
17.5M
                          {
231
17.5M
                            length += ret;
232
17.5M
                            goto done_appending;
233
17.5M
                          }
234
17.5M
                      }
235
6.15M
                    {
236
6.15M
                      size_t old_allocated = allocated;
237
6.15M
                      size_t new_allocated = 2 * old_allocated;
238
6.15M
                      if (new_allocated < 64)
239
6.13M
                        new_allocated = 64;
240
6.15M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
6.15M
                      {
243
6.15M
                        UNIT *larger_result;
244
6.15M
                        if (result == NULL)
245
6.13M
                          {
246
6.13M
                            larger_result =
247
6.13M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
6.13M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
6.13M
                          }
254
13.8k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
13.8k
                        else
266
13.8k
                          {
267
13.8k
                            larger_result =
268
13.8k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
13.8k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
13.8k
                          }
275
6.15M
                        result = larger_result;
276
6.15M
                        allocated = new_allocated;
277
6.15M
                        {
278
6.15M
                          int ret =
279
6.15M
                            U_UCTOMB (result + length, muc, allocated - length);
280
6.15M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
6.15M
                          if (ret < 0)
286
0
                            abort ();
287
6.15M
                          length += ret;
288
6.15M
                          goto done_appending;
289
6.15M
                        }
290
6.15M
                      }
291
6.15M
                    }
292
23.6M
                   done_appending: ;
293
23.6M
                  }
294
295
                /* sortbuf is now empty.  */
296
28.4M
                sortbuf_count = 0;
297
28.4M
              }
298
299
31.1M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
6.13M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
25.0M
            if (sortbuf_count == sortbuf_allocated)
305
1.10k
              {
306
1.10k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.10k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.10k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.10k
                new_sortbuf =
312
1.10k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.10k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.10k
                memcpy (new_sortbuf, sortbuf,
319
1.10k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.10k
                if (sortbuf != sortbuf_preallocated)
321
634
                  free (sortbuf);
322
1.10k
                sortbuf = new_sortbuf;
323
1.10k
              }
324
25.0M
            sortbuf[sortbuf_count].code = uc;
325
25.0M
            sortbuf[sortbuf_count].ccc = ccc;
326
25.0M
            sortbuf_count++;
327
328
25.0M
            i++;
329
25.0M
          }
330
331
25.0M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
6.13M
          break;
334
335
18.9M
        s += count;
336
18.9M
      }
337
6.13M
  }
338
339
6.13M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
6.13M
  else if (result != resultbuf && length < allocated)
353
6.13M
    {
354
      /* Shrink the allocated memory if possible.  */
355
6.13M
      UNIT *memory;
356
357
6.13M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
6.13M
      if (memory != NULL)
359
6.13M
        result = memory;
360
6.13M
    }
361
362
6.13M
  if (sortbuf_count > 0)
363
0
    abort ();
364
6.13M
  if (sortbuf != sortbuf_preallocated)
365
467
    free (sortbuf);
366
367
6.13M
  *lengthp = length;
368
6.13M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
6.13M
}
u32_normalize
Line
Count
Source
21
13.0M
{
22
13.0M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
13.0M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
13.0M
  UNIT *result;
27
13.0M
  size_t length;
28
13.0M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
13.0M
  #define SORTBUF_PREALLOCATED 64
31
13.0M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
13.0M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
13.0M
  size_t sortbuf_allocated;
34
13.0M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
13.0M
  if (resultbuf == NULL)
38
13.0M
    {
39
13.0M
      result = NULL;
40
13.0M
      allocated = 0;
41
13.0M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
13.0M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
13.0M
  sortbuf = sortbuf_preallocated;
51
13.0M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
13.0M
  sortbuf_count = 0;
53
54
13.0M
  {
55
13.0M
    const UNIT *s_end = s + n;
56
57
13.0M
    for (;;)
58
31.6M
      {
59
31.6M
        int count;
60
31.6M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
31.6M
        int decomposed_count;
62
31.6M
        int i;
63
64
31.6M
        if (s < s_end)
65
18.6M
          {
66
            /* Fetch the next character.  */
67
18.6M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
18.6M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
18.6M
            {
77
18.6M
              int curr;
78
79
37.2M
              for (curr = 0; curr < decomposed_count; )
80
18.6M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
18.6M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
18.6M
                  int curr_decomposed_count;
85
86
18.6M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
18.6M
                  if (curr_decomposed_count >= 0)
88
36.0k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
36.0k
                      int shift = curr_decomposed_count - 1;
93
94
36.0k
                      if (shift < 0)
95
0
                        abort ();
96
36.0k
                      if (shift > 0)
97
35.6k
                        {
98
35.6k
                          int j;
99
100
35.6k
                          decomposed_count += shift;
101
35.6k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
45.9k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
10.2k
                            decomposed[j + shift] = decomposed[j];
105
35.6k
                        }
106
107k
                      for (; shift >= 0; shift--)
107
71.7k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
36.0k
                    }
109
18.6M
                  else
110
18.6M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
18.6M
                      curr++;
113
18.6M
                    }
114
18.6M
                }
115
18.6M
            }
116
18.6M
          }
117
13.0M
        else
118
13.0M
          {
119
13.0M
            count = 0;
120
13.0M
            decomposed_count = 0;
121
13.0M
          }
122
123
31.6M
        i = 0;
124
31.6M
        for (;;)
125
50.2M
          {
126
50.2M
            ucs4_t uc;
127
50.2M
            int ccc;
128
129
50.2M
            if (s < s_end)
130
37.2M
              {
131
                /* Fetch the next character from the decomposition.  */
132
37.2M
                if (i == decomposed_count)
133
18.6M
                  break;
134
18.6M
                uc = decomposed[i];
135
18.6M
                ccc = uc_combining_class (uc);
136
18.6M
              }
137
13.0M
            else
138
13.0M
              {
139
                /* End of string reached.  */
140
13.0M
                uc = 0;
141
13.0M
                ccc = 0;
142
13.0M
              }
143
144
31.6M
            if (ccc == 0)
145
31.5M
              {
146
31.5M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
31.5M
                if (sortbuf_count > 1)
151
27.7k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
27.7k
                                                           sortbuf + sortbuf_count);
153
154
31.5M
                if (composer != NULL)
155
31.5M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
31.5M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
18.5M
                      {
178
18.6M
                        for (j = 1; j < sortbuf_count; )
179
103k
                          {
180
103k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
39.4k
                              {
182
39.4k
                                ucs4_t combined =
183
39.4k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
39.4k
                                if (combined)
185
23.5k
                                  {
186
23.5k
                                    size_t k;
187
188
23.5k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
36.7k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
13.2k
                                      sortbuf[k - 1] = sortbuf[k];
192
23.5k
                                    sortbuf_count--;
193
23.5k
                                    continue;
194
23.5k
                                  }
195
39.4k
                              }
196
80.3k
                            j++;
197
80.3k
                          }
198
18.5M
                        if (s < s_end && sortbuf_count == 1)
199
5.69M
                          {
200
5.69M
                            ucs4_t combined =
201
5.69M
                              composer (sortbuf[0].code, uc);
202
5.69M
                            if (combined)
203
9.63k
                              {
204
9.63k
                                uc = combined;
205
9.63k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
9.63k
                                sortbuf_count = 0;
210
9.63k
                              }
211
5.69M
                          }
212
18.5M
                      }
213
31.5M
                  }
214
215
50.1M
                for (j = 0; j < sortbuf_count; j++)
216
18.6M
                  {
217
18.6M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
18.6M
                    if (length < allocated)
221
5.78M
                      {
222
5.78M
                        int ret =
223
5.78M
                          U_UCTOMB (result + length, muc, allocated - length);
224
5.78M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
5.78M
                        if (ret >= 0)
230
5.78M
                          {
231
5.78M
                            length += ret;
232
5.78M
                            goto done_appending;
233
5.78M
                          }
234
5.78M
                      }
235
12.8M
                    {
236
12.8M
                      size_t old_allocated = allocated;
237
12.8M
                      size_t new_allocated = 2 * old_allocated;
238
12.8M
                      if (new_allocated < 64)
239
12.8M
                        new_allocated = 64;
240
12.8M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
12.8M
                      {
243
12.8M
                        UNIT *larger_result;
244
12.8M
                        if (result == NULL)
245
12.8M
                          {
246
12.8M
                            larger_result =
247
12.8M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
12.8M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
12.8M
                          }
254
2.81k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
2.81k
                        else
266
2.81k
                          {
267
2.81k
                            larger_result =
268
2.81k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
2.81k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
2.81k
                          }
275
12.8M
                        result = larger_result;
276
12.8M
                        allocated = new_allocated;
277
12.8M
                        {
278
12.8M
                          int ret =
279
12.8M
                            U_UCTOMB (result + length, muc, allocated - length);
280
12.8M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
12.8M
                          if (ret < 0)
286
0
                            abort ();
287
12.8M
                          length += ret;
288
12.8M
                          goto done_appending;
289
12.8M
                        }
290
12.8M
                      }
291
12.8M
                    }
292
18.6M
                   done_appending: ;
293
18.6M
                  }
294
295
                /* sortbuf is now empty.  */
296
31.5M
                sortbuf_count = 0;
297
31.5M
              }
298
299
31.6M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
13.0M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
18.6M
            if (sortbuf_count == sortbuf_allocated)
305
842
              {
306
842
                struct ucs4_with_ccc *new_sortbuf;
307
308
842
                sortbuf_allocated = 2 * sortbuf_allocated;
309
842
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
842
                new_sortbuf =
312
842
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
842
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
842
                memcpy (new_sortbuf, sortbuf,
319
842
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
842
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
842
                sortbuf = new_sortbuf;
323
842
              }
324
18.6M
            sortbuf[sortbuf_count].code = uc;
325
18.6M
            sortbuf[sortbuf_count].ccc = ccc;
326
18.6M
            sortbuf_count++;
327
328
18.6M
            i++;
329
18.6M
          }
330
331
31.6M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
13.0M
          break;
334
335
18.6M
        s += count;
336
18.6M
      }
337
13.0M
  }
338
339
13.0M
  if (length == 0)
340
196k
    {
341
196k
      if (result == NULL)
342
196k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
196k
          result = (UNIT *) malloc (1);
345
196k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
196k
        }
351
196k
    }
352
12.8M
  else if (result != resultbuf && length < allocated)
353
12.8M
    {
354
      /* Shrink the allocated memory if possible.  */
355
12.8M
      UNIT *memory;
356
357
12.8M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
12.8M
      if (memory != NULL)
359
12.8M
        result = memory;
360
12.8M
    }
361
362
13.0M
  if (sortbuf_count > 0)
363
0
    abort ();
364
13.0M
  if (sortbuf != sortbuf_preallocated)
365
842
    free (sortbuf);
366
367
13.0M
  *lengthp = length;
368
13.0M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
13.0M
}