Coverage Report

Created: 2025-07-17 06:59

/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source (jump to first uncovered line)
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
17.5M
{
22
17.5M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
17.5M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
17.5M
  UNIT *result;
27
17.5M
  size_t length;
28
17.5M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
17.5M
  #define SORTBUF_PREALLOCATED 64
31
17.5M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
17.5M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
17.5M
  size_t sortbuf_allocated;
34
17.5M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
17.5M
  if (resultbuf == NULL)
38
17.5M
    {
39
17.5M
      result = NULL;
40
17.5M
      allocated = 0;
41
17.5M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
17.5M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
17.5M
  sortbuf = sortbuf_preallocated;
51
17.5M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
17.5M
  sortbuf_count = 0;
53
54
17.5M
  {
55
17.5M
    const UNIT *s_end = s + n;
56
57
17.5M
    for (;;)
58
52.5M
      {
59
52.5M
        int count;
60
52.5M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
52.5M
        int decomposed_count;
62
52.5M
        int i;
63
64
52.5M
        if (s < s_end)
65
35.0M
          {
66
            /* Fetch the next character.  */
67
35.0M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
35.0M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
35.0M
            {
77
35.0M
              int curr;
78
79
78.8M
              for (curr = 0; curr < decomposed_count; )
80
43.8M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
43.8M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
43.8M
                  int curr_decomposed_count;
85
86
43.8M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
43.8M
                  if (curr_decomposed_count >= 0)
88
3.12M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.12M
                      int shift = curr_decomposed_count - 1;
93
94
3.12M
                      if (shift < 0)
95
0
                        abort ();
96
3.12M
                      if (shift > 0)
97
2.53M
                        {
98
2.53M
                          int j;
99
100
2.53M
                          decomposed_count += shift;
101
2.53M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.56M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
28.0k
                            decomposed[j + shift] = decomposed[j];
105
2.53M
                        }
106
12.0M
                      for (; shift >= 0; shift--)
107
8.88M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.12M
                    }
109
40.7M
                  else
110
40.7M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
40.7M
                      curr++;
113
40.7M
                    }
114
43.8M
                }
115
35.0M
            }
116
35.0M
          }
117
17.5M
        else
118
17.5M
          {
119
17.5M
            count = 0;
120
17.5M
            decomposed_count = 0;
121
17.5M
          }
122
123
52.5M
        i = 0;
124
52.5M
        for (;;)
125
93.3M
          {
126
93.3M
            ucs4_t uc;
127
93.3M
            int ccc;
128
129
93.3M
            if (s < s_end)
130
75.7M
              {
131
                /* Fetch the next character from the decomposition.  */
132
75.7M
                if (i == decomposed_count)
133
35.0M
                  break;
134
40.7M
                uc = decomposed[i];
135
40.7M
                ccc = uc_combining_class (uc);
136
40.7M
              }
137
17.5M
            else
138
17.5M
              {
139
                /* End of string reached.  */
140
17.5M
                uc = 0;
141
17.5M
                ccc = 0;
142
17.5M
              }
143
144
58.2M
            if (ccc == 0)
145
55.5M
              {
146
55.5M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
55.5M
                if (sortbuf_count > 1)
151
1.40M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.40M
                                                           sortbuf + sortbuf_count);
153
154
55.5M
                if (composer != NULL)
155
55.5M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
55.5M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
37.9M
                      {
178
40.7M
                        for (j = 1; j < sortbuf_count; )
179
2.74M
                          {
180
2.74M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.43M
                              {
182
1.43M
                                ucs4_t combined =
183
1.43M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.43M
                                if (combined)
185
1.39M
                                  {
186
1.39M
                                    size_t k;
187
188
1.39M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.12M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.72M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.39M
                                    sortbuf_count--;
193
1.39M
                                    continue;
194
1.39M
                                  }
195
1.43M
                              }
196
1.35M
                            j++;
197
1.35M
                          }
198
37.9M
                        if (s < s_end && sortbuf_count == 1)
199
20.5M
                          {
200
20.5M
                            ucs4_t combined =
201
20.5M
                              composer (sortbuf[0].code, uc);
202
20.5M
                            if (combined)
203
17.2k
                              {
204
17.2k
                                uc = combined;
205
17.2k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
17.2k
                                sortbuf_count = 0;
210
17.2k
                              }
211
20.5M
                          }
212
37.9M
                      }
213
55.5M
                  }
214
215
94.8M
                for (j = 0; j < sortbuf_count; j++)
216
39.3M
                  {
217
39.3M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
39.3M
                    if (length < allocated)
221
21.9M
                      {
222
21.9M
                        int ret =
223
21.9M
                          U_UCTOMB (result + length, muc, allocated - length);
224
21.9M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
21.9M
                        if (ret >= 0)
230
21.9M
                          {
231
21.9M
                            length += ret;
232
21.9M
                            goto done_appending;
233
21.9M
                          }
234
21.9M
                      }
235
17.3M
                    {
236
17.3M
                      size_t old_allocated = allocated;
237
17.3M
                      size_t new_allocated = 2 * old_allocated;
238
17.3M
                      if (new_allocated < 64)
239
17.3M
                        new_allocated = 64;
240
17.3M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
17.3M
                      {
243
17.3M
                        UNIT *larger_result;
244
17.3M
                        if (result == NULL)
245
17.3M
                          {
246
17.3M
                            larger_result =
247
17.3M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
17.3M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
17.3M
                          }
254
18.8k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
18.8k
                        else
266
18.8k
                          {
267
18.8k
                            larger_result =
268
18.8k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
18.8k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
18.8k
                          }
275
17.3M
                        result = larger_result;
276
17.3M
                        allocated = new_allocated;
277
17.3M
                        {
278
17.3M
                          int ret =
279
17.3M
                            U_UCTOMB (result + length, muc, allocated - length);
280
17.3M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
17.3M
                          if (ret < 0)
286
0
                            abort ();
287
17.3M
                          length += ret;
288
17.3M
                          goto done_appending;
289
17.3M
                        }
290
17.3M
                      }
291
17.3M
                    }
292
39.3M
                   done_appending: ;
293
39.3M
                  }
294
295
                /* sortbuf is now empty.  */
296
55.5M
                sortbuf_count = 0;
297
55.5M
              }
298
299
58.2M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
17.5M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
40.7M
            if (sortbuf_count == sortbuf_allocated)
305
1.94k
              {
306
1.94k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.94k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.94k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.94k
                new_sortbuf =
312
1.94k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.94k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.94k
                memcpy (new_sortbuf, sortbuf,
319
1.94k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.94k
                if (sortbuf != sortbuf_preallocated)
321
639
                  free (sortbuf);
322
1.94k
                sortbuf = new_sortbuf;
323
1.94k
              }
324
40.7M
            sortbuf[sortbuf_count].code = uc;
325
40.7M
            sortbuf[sortbuf_count].ccc = ccc;
326
40.7M
            sortbuf_count++;
327
328
40.7M
            i++;
329
40.7M
          }
330
331
52.5M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
17.5M
          break;
334
335
35.0M
        s += count;
336
35.0M
      }
337
17.5M
  }
338
339
17.5M
  if (length == 0)
340
178k
    {
341
178k
      if (result == NULL)
342
178k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
178k
          result = (UNIT *) malloc (1);
345
178k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
178k
        }
351
178k
    }
352
17.3M
  else if (result != resultbuf && length < allocated)
353
17.3M
    {
354
      /* Shrink the allocated memory if possible.  */
355
17.3M
      UNIT *memory;
356
357
17.3M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
17.3M
      if (memory != NULL)
359
17.3M
        result = memory;
360
17.3M
    }
361
362
17.5M
  if (sortbuf_count > 0)
363
0
    abort ();
364
17.5M
  if (sortbuf != sortbuf_preallocated)
365
1.31k
    free (sortbuf);
366
367
17.5M
  *lengthp = length;
368
17.5M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
17.5M
}
u8_normalize
Line
Count
Source
21
5.61M
{
22
5.61M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
5.61M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
5.61M
  UNIT *result;
27
5.61M
  size_t length;
28
5.61M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
5.61M
  #define SORTBUF_PREALLOCATED 64
31
5.61M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
5.61M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
5.61M
  size_t sortbuf_allocated;
34
5.61M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
5.61M
  if (resultbuf == NULL)
38
5.61M
    {
39
5.61M
      result = NULL;
40
5.61M
      allocated = 0;
41
5.61M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
5.61M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
5.61M
  sortbuf = sortbuf_preallocated;
51
5.61M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
5.61M
  sortbuf_count = 0;
53
54
5.61M
  {
55
5.61M
    const UNIT *s_end = s + n;
56
57
5.61M
    for (;;)
58
23.4M
      {
59
23.4M
        int count;
60
23.4M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
23.4M
        int decomposed_count;
62
23.4M
        int i;
63
64
23.4M
        if (s < s_end)
65
17.8M
          {
66
            /* Fetch the next character.  */
67
17.8M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
17.8M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
17.8M
            {
77
17.8M
              int curr;
78
79
44.4M
              for (curr = 0; curr < decomposed_count; )
80
26.6M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
26.6M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
26.6M
                  int curr_decomposed_count;
85
86
26.6M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
26.6M
                  if (curr_decomposed_count >= 0)
88
3.09M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.09M
                      int shift = curr_decomposed_count - 1;
93
94
3.09M
                      if (shift < 0)
95
0
                        abort ();
96
3.09M
                      if (shift > 0)
97
2.50M
                        {
98
2.50M
                          int j;
99
100
2.50M
                          decomposed_count += shift;
101
2.50M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.52M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
17.1k
                            decomposed[j + shift] = decomposed[j];
105
2.50M
                        }
106
11.9M
                      for (; shift >= 0; shift--)
107
8.80M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.09M
                    }
109
23.5M
                  else
110
23.5M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
23.5M
                      curr++;
113
23.5M
                    }
114
26.6M
                }
115
17.8M
            }
116
17.8M
          }
117
5.61M
        else
118
5.61M
          {
119
5.61M
            count = 0;
120
5.61M
            decomposed_count = 0;
121
5.61M
          }
122
123
23.4M
        i = 0;
124
23.4M
        for (;;)
125
46.9M
          {
126
46.9M
            ucs4_t uc;
127
46.9M
            int ccc;
128
129
46.9M
            if (s < s_end)
130
41.3M
              {
131
                /* Fetch the next character from the decomposition.  */
132
41.3M
                if (i == decomposed_count)
133
17.8M
                  break;
134
23.5M
                uc = decomposed[i];
135
23.5M
                ccc = uc_combining_class (uc);
136
23.5M
              }
137
5.61M
            else
138
5.61M
              {
139
                /* End of string reached.  */
140
5.61M
                uc = 0;
141
5.61M
                ccc = 0;
142
5.61M
              }
143
144
29.1M
            if (ccc == 0)
145
26.4M
              {
146
26.4M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
26.4M
                if (sortbuf_count > 1)
151
1.37M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.37M
                                                           sortbuf + sortbuf_count);
153
154
26.4M
                if (composer != NULL)
155
26.4M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
26.4M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
20.8M
                      {
178
23.5M
                        for (j = 1; j < sortbuf_count; )
179
2.64M
                          {
180
2.64M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.39M
                              {
182
1.39M
                                ucs4_t combined =
183
1.39M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.39M
                                if (combined)
185
1.37M
                                  {
186
1.37M
                                    size_t k;
187
188
1.37M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.08M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.71M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.37M
                                    sortbuf_count--;
193
1.37M
                                    continue;
194
1.37M
                                  }
195
1.39M
                              }
196
1.27M
                            j++;
197
1.27M
                          }
198
20.8M
                        if (s < s_end && sortbuf_count == 1)
199
15.2M
                          {
200
15.2M
                            ucs4_t combined =
201
15.2M
                              composer (sortbuf[0].code, uc);
202
15.2M
                            if (combined)
203
7.57k
                              {
204
7.57k
                                uc = combined;
205
7.57k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
7.57k
                                sortbuf_count = 0;
210
7.57k
                              }
211
15.2M
                          }
212
20.8M
                      }
213
26.4M
                  }
214
215
48.6M
                for (j = 0; j < sortbuf_count; j++)
216
22.1M
                  {
217
22.1M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
22.1M
                    if (length < allocated)
221
16.5M
                      {
222
16.5M
                        int ret =
223
16.5M
                          U_UCTOMB (result + length, muc, allocated - length);
224
16.5M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
16.5M
                        if (ret >= 0)
230
16.5M
                          {
231
16.5M
                            length += ret;
232
16.5M
                            goto done_appending;
233
16.5M
                          }
234
16.5M
                      }
235
5.63M
                    {
236
5.63M
                      size_t old_allocated = allocated;
237
5.63M
                      size_t new_allocated = 2 * old_allocated;
238
5.63M
                      if (new_allocated < 64)
239
5.61M
                        new_allocated = 64;
240
5.63M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
5.63M
                      {
243
5.63M
                        UNIT *larger_result;
244
5.63M
                        if (result == NULL)
245
5.61M
                          {
246
5.61M
                            larger_result =
247
5.61M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
5.61M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
5.61M
                          }
254
15.5k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
15.5k
                        else
266
15.5k
                          {
267
15.5k
                            larger_result =
268
15.5k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
15.5k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
15.5k
                          }
275
5.63M
                        result = larger_result;
276
5.63M
                        allocated = new_allocated;
277
5.63M
                        {
278
5.63M
                          int ret =
279
5.63M
                            U_UCTOMB (result + length, muc, allocated - length);
280
5.63M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
5.63M
                          if (ret < 0)
286
0
                            abort ();
287
5.63M
                          length += ret;
288
5.63M
                          goto done_appending;
289
5.63M
                        }
290
5.63M
                      }
291
5.63M
                    }
292
22.1M
                   done_appending: ;
293
22.1M
                  }
294
295
                /* sortbuf is now empty.  */
296
26.4M
                sortbuf_count = 0;
297
26.4M
              }
298
299
29.1M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
5.61M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
23.5M
            if (sortbuf_count == sortbuf_allocated)
305
1.11k
              {
306
1.11k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.11k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.11k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.11k
                new_sortbuf =
312
1.11k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.11k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.11k
                memcpy (new_sortbuf, sortbuf,
319
1.11k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.11k
                if (sortbuf != sortbuf_preallocated)
321
639
                  free (sortbuf);
322
1.11k
                sortbuf = new_sortbuf;
323
1.11k
              }
324
23.5M
            sortbuf[sortbuf_count].code = uc;
325
23.5M
            sortbuf[sortbuf_count].ccc = ccc;
326
23.5M
            sortbuf_count++;
327
328
23.5M
            i++;
329
23.5M
          }
330
331
23.4M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
5.61M
          break;
334
335
17.8M
        s += count;
336
17.8M
      }
337
5.61M
  }
338
339
5.61M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
5.61M
  else if (result != resultbuf && length < allocated)
353
5.61M
    {
354
      /* Shrink the allocated memory if possible.  */
355
5.61M
      UNIT *memory;
356
357
5.61M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
5.61M
      if (memory != NULL)
359
5.61M
        result = memory;
360
5.61M
    }
361
362
5.61M
  if (sortbuf_count > 0)
363
0
    abort ();
364
5.61M
  if (sortbuf != sortbuf_preallocated)
365
475
    free (sortbuf);
366
367
5.61M
  *lengthp = length;
368
5.61M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
5.61M
}
u32_normalize
Line
Count
Source
21
11.9M
{
22
11.9M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
11.9M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
11.9M
  UNIT *result;
27
11.9M
  size_t length;
28
11.9M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
11.9M
  #define SORTBUF_PREALLOCATED 64
31
11.9M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
11.9M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
11.9M
  size_t sortbuf_allocated;
34
11.9M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
11.9M
  if (resultbuf == NULL)
38
11.9M
    {
39
11.9M
      result = NULL;
40
11.9M
      allocated = 0;
41
11.9M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
11.9M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
11.9M
  sortbuf = sortbuf_preallocated;
51
11.9M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
11.9M
  sortbuf_count = 0;
53
54
11.9M
  {
55
11.9M
    const UNIT *s_end = s + n;
56
57
11.9M
    for (;;)
58
29.1M
      {
59
29.1M
        int count;
60
29.1M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
29.1M
        int decomposed_count;
62
29.1M
        int i;
63
64
29.1M
        if (s < s_end)
65
17.1M
          {
66
            /* Fetch the next character.  */
67
17.1M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
17.1M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
17.1M
            {
77
17.1M
              int curr;
78
79
34.4M
              for (curr = 0; curr < decomposed_count; )
80
17.2M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
17.2M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
17.2M
                  int curr_decomposed_count;
85
86
17.2M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
17.2M
                  if (curr_decomposed_count >= 0)
88
35.5k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
35.5k
                      int shift = curr_decomposed_count - 1;
93
94
35.5k
                      if (shift < 0)
95
0
                        abort ();
96
35.5k
                      if (shift > 0)
97
35.2k
                        {
98
35.2k
                          int j;
99
100
35.2k
                          decomposed_count += shift;
101
35.2k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
46.1k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
10.8k
                            decomposed[j + shift] = decomposed[j];
105
35.2k
                        }
106
106k
                      for (; shift >= 0; shift--)
107
70.8k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
35.5k
                    }
109
17.2M
                  else
110
17.2M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
17.2M
                      curr++;
113
17.2M
                    }
114
17.2M
                }
115
17.1M
            }
116
17.1M
          }
117
11.9M
        else
118
11.9M
          {
119
11.9M
            count = 0;
120
11.9M
            decomposed_count = 0;
121
11.9M
          }
122
123
29.1M
        i = 0;
124
29.1M
        for (;;)
125
46.3M
          {
126
46.3M
            ucs4_t uc;
127
46.3M
            int ccc;
128
129
46.3M
            if (s < s_end)
130
34.4M
              {
131
                /* Fetch the next character from the decomposition.  */
132
34.4M
                if (i == decomposed_count)
133
17.1M
                  break;
134
17.2M
                uc = decomposed[i];
135
17.2M
                ccc = uc_combining_class (uc);
136
17.2M
              }
137
11.9M
            else
138
11.9M
              {
139
                /* End of string reached.  */
140
11.9M
                uc = 0;
141
11.9M
                ccc = 0;
142
11.9M
              }
143
144
29.1M
            if (ccc == 0)
145
29.0M
              {
146
29.0M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
29.0M
                if (sortbuf_count > 1)
151
28.3k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
28.3k
                                                           sortbuf + sortbuf_count);
153
154
29.0M
                if (composer != NULL)
155
29.0M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
29.0M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
17.1M
                      {
178
17.2M
                        for (j = 1; j < sortbuf_count; )
179
105k
                          {
180
105k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
40.8k
                              {
182
40.8k
                                ucs4_t combined =
183
40.8k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
40.8k
                                if (combined)
185
23.8k
                                  {
186
23.8k
                                    size_t k;
187
188
23.8k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
37.5k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
13.6k
                                      sortbuf[k - 1] = sortbuf[k];
192
23.8k
                                    sortbuf_count--;
193
23.8k
                                    continue;
194
23.8k
                                  }
195
40.8k
                              }
196
81.3k
                            j++;
197
81.3k
                          }
198
17.1M
                        if (s < s_end && sortbuf_count == 1)
199
5.35M
                          {
200
5.35M
                            ucs4_t combined =
201
5.35M
                              composer (sortbuf[0].code, uc);
202
5.35M
                            if (combined)
203
9.71k
                              {
204
9.71k
                                uc = combined;
205
9.71k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
9.71k
                                sortbuf_count = 0;
210
9.71k
                              }
211
5.35M
                          }
212
17.1M
                      }
213
29.0M
                  }
214
215
46.2M
                for (j = 0; j < sortbuf_count; j++)
216
17.1M
                  {
217
17.1M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
17.1M
                    if (length < allocated)
221
5.44M
                      {
222
5.44M
                        int ret =
223
5.44M
                          U_UCTOMB (result + length, muc, allocated - length);
224
5.44M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
5.44M
                        if (ret >= 0)
230
5.44M
                          {
231
5.44M
                            length += ret;
232
5.44M
                            goto done_appending;
233
5.44M
                          }
234
5.44M
                      }
235
11.7M
                    {
236
11.7M
                      size_t old_allocated = allocated;
237
11.7M
                      size_t new_allocated = 2 * old_allocated;
238
11.7M
                      if (new_allocated < 64)
239
11.7M
                        new_allocated = 64;
240
11.7M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
11.7M
                      {
243
11.7M
                        UNIT *larger_result;
244
11.7M
                        if (result == NULL)
245
11.7M
                          {
246
11.7M
                            larger_result =
247
11.7M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
11.7M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
11.7M
                          }
254
3.34k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
3.34k
                        else
266
3.34k
                          {
267
3.34k
                            larger_result =
268
3.34k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
3.34k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
3.34k
                          }
275
11.7M
                        result = larger_result;
276
11.7M
                        allocated = new_allocated;
277
11.7M
                        {
278
11.7M
                          int ret =
279
11.7M
                            U_UCTOMB (result + length, muc, allocated - length);
280
11.7M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
11.7M
                          if (ret < 0)
286
0
                            abort ();
287
11.7M
                          length += ret;
288
11.7M
                          goto done_appending;
289
11.7M
                        }
290
11.7M
                      }
291
11.7M
                    }
292
17.1M
                   done_appending: ;
293
17.1M
                  }
294
295
                /* sortbuf is now empty.  */
296
29.0M
                sortbuf_count = 0;
297
29.0M
              }
298
299
29.1M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
11.9M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
17.2M
            if (sortbuf_count == sortbuf_allocated)
305
835
              {
306
835
                struct ucs4_with_ccc *new_sortbuf;
307
308
835
                sortbuf_allocated = 2 * sortbuf_allocated;
309
835
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
835
                new_sortbuf =
312
835
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
835
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
835
                memcpy (new_sortbuf, sortbuf,
319
835
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
835
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
835
                sortbuf = new_sortbuf;
323
835
              }
324
17.2M
            sortbuf[sortbuf_count].code = uc;
325
17.2M
            sortbuf[sortbuf_count].ccc = ccc;
326
17.2M
            sortbuf_count++;
327
328
17.2M
            i++;
329
17.2M
          }
330
331
29.1M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
11.9M
          break;
334
335
17.1M
        s += count;
336
17.1M
      }
337
11.9M
  }
338
339
11.9M
  if (length == 0)
340
178k
    {
341
178k
      if (result == NULL)
342
178k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
178k
          result = (UNIT *) malloc (1);
345
178k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
178k
        }
351
178k
    }
352
11.7M
  else if (result != resultbuf && length < allocated)
353
11.7M
    {
354
      /* Shrink the allocated memory if possible.  */
355
11.7M
      UNIT *memory;
356
357
11.7M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
11.7M
      if (memory != NULL)
359
11.7M
        result = memory;
360
11.7M
    }
361
362
11.9M
  if (sortbuf_count > 0)
363
0
    abort ();
364
11.9M
  if (sortbuf != sortbuf_preallocated)
365
835
    free (sortbuf);
366
367
11.9M
  *lengthp = length;
368
11.9M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
11.9M
}