Coverage Report

Created: 2025-08-28 06:41

/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source (jump to first uncovered line)
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
18.6M
{
22
18.6M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
18.6M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
18.6M
  UNIT *result;
27
18.6M
  size_t length;
28
18.6M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
18.6M
  #define SORTBUF_PREALLOCATED 64
31
18.6M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
18.6M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
18.6M
  size_t sortbuf_allocated;
34
18.6M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
18.6M
  if (resultbuf == NULL)
38
18.6M
    {
39
18.6M
      result = NULL;
40
18.6M
      allocated = 0;
41
18.6M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
18.6M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
18.6M
  sortbuf = sortbuf_preallocated;
51
18.6M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
18.6M
  sortbuf_count = 0;
53
54
18.6M
  {
55
18.6M
    const UNIT *s_end = s + n;
56
57
18.6M
    for (;;)
58
57.8M
      {
59
57.8M
        int count;
60
57.8M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
57.8M
        int decomposed_count;
62
57.8M
        int i;
63
64
57.8M
        if (s < s_end)
65
39.2M
          {
66
            /* Fetch the next character.  */
67
39.2M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
39.2M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
39.2M
            {
77
39.2M
              int curr;
78
79
87.8M
              for (curr = 0; curr < decomposed_count; )
80
48.6M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
48.6M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
48.6M
                  int curr_decomposed_count;
85
86
48.6M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
48.6M
                  if (curr_decomposed_count >= 0)
88
3.18M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.18M
                      int shift = curr_decomposed_count - 1;
93
94
3.18M
                      if (shift < 0)
95
0
                        abort ();
96
3.18M
                      if (shift > 0)
97
2.51M
                        {
98
2.51M
                          int j;
99
100
2.51M
                          decomposed_count += shift;
101
2.51M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.55M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
31.5k
                            decomposed[j + shift] = decomposed[j];
105
2.51M
                        }
106
12.5M
                      for (; shift >= 0; shift--)
107
9.37M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.18M
                    }
109
45.4M
                  else
110
45.4M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
45.4M
                      curr++;
113
45.4M
                    }
114
48.6M
                }
115
39.2M
            }
116
39.2M
          }
117
18.6M
        else
118
18.6M
          {
119
18.6M
            count = 0;
120
18.6M
            decomposed_count = 0;
121
18.6M
          }
122
123
57.8M
        i = 0;
124
57.8M
        for (;;)
125
103M
          {
126
103M
            ucs4_t uc;
127
103M
            int ccc;
128
129
103M
            if (s < s_end)
130
84.6M
              {
131
                /* Fetch the next character from the decomposition.  */
132
84.6M
                if (i == decomposed_count)
133
39.2M
                  break;
134
45.4M
                uc = decomposed[i];
135
45.4M
                ccc = uc_combining_class (uc);
136
45.4M
              }
137
18.6M
            else
138
18.6M
              {
139
                /* End of string reached.  */
140
18.6M
                uc = 0;
141
18.6M
                ccc = 0;
142
18.6M
              }
143
144
64.0M
            if (ccc == 0)
145
61.3M
              {
146
61.3M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
61.3M
                if (sortbuf_count > 1)
151
1.37M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.37M
                                                           sortbuf + sortbuf_count);
153
154
61.3M
                if (composer != NULL)
155
61.3M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
61.3M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
42.7M
                      {
178
45.3M
                        for (j = 1; j < sortbuf_count; )
179
2.59M
                          {
180
2.59M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.41M
                              {
182
1.41M
                                ucs4_t combined =
183
1.41M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.41M
                                if (combined)
185
1.35M
                                  {
186
1.35M
                                    size_t k;
187
188
1.35M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
2.94M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.58M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.35M
                                    sortbuf_count--;
193
1.35M
                                    continue;
194
1.35M
                                  }
195
1.41M
                              }
196
1.23M
                            j++;
197
1.23M
                          }
198
42.7M
                        if (s < s_end && sortbuf_count == 1)
199
24.3M
                          {
200
24.3M
                            ucs4_t combined =
201
24.3M
                              composer (sortbuf[0].code, uc);
202
24.3M
                            if (combined)
203
22.0k
                              {
204
22.0k
                                uc = combined;
205
22.0k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
22.0k
                                sortbuf_count = 0;
210
22.0k
                              }
211
24.3M
                          }
212
42.7M
                      }
213
61.3M
                  }
214
215
105M
                for (j = 0; j < sortbuf_count; j++)
216
44.0M
                  {
217
44.0M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
44.0M
                    if (length < allocated)
221
25.6M
                      {
222
25.6M
                        int ret =
223
25.6M
                          U_UCTOMB (result + length, muc, allocated - length);
224
25.6M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
25.6M
                        if (ret >= 0)
230
25.5M
                          {
231
25.5M
                            length += ret;
232
25.5M
                            goto done_appending;
233
25.5M
                          }
234
25.6M
                      }
235
18.4M
                    {
236
18.4M
                      size_t old_allocated = allocated;
237
18.4M
                      size_t new_allocated = 2 * old_allocated;
238
18.4M
                      if (new_allocated < 64)
239
18.4M
                        new_allocated = 64;
240
18.4M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
18.4M
                      {
243
18.4M
                        UNIT *larger_result;
244
18.4M
                        if (result == NULL)
245
18.4M
                          {
246
18.4M
                            larger_result =
247
18.4M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
18.4M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
18.4M
                          }
254
48.9k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
48.9k
                        else
266
48.9k
                          {
267
48.9k
                            larger_result =
268
48.9k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
48.9k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
48.9k
                          }
275
18.4M
                        result = larger_result;
276
18.4M
                        allocated = new_allocated;
277
18.4M
                        {
278
18.4M
                          int ret =
279
18.4M
                            U_UCTOMB (result + length, muc, allocated - length);
280
18.4M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
18.4M
                          if (ret < 0)
286
0
                            abort ();
287
18.4M
                          length += ret;
288
18.4M
                          goto done_appending;
289
18.4M
                        }
290
18.4M
                      }
291
18.4M
                    }
292
44.0M
                   done_appending: ;
293
44.0M
                  }
294
295
                /* sortbuf is now empty.  */
296
61.3M
                sortbuf_count = 0;
297
61.3M
              }
298
299
64.0M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
18.6M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
45.4M
            if (sortbuf_count == sortbuf_allocated)
305
1.31k
              {
306
1.31k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.31k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.31k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.31k
                new_sortbuf =
312
1.31k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.31k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.31k
                memcpy (new_sortbuf, sortbuf,
319
1.31k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.31k
                if (sortbuf != sortbuf_preallocated)
321
600
                  free (sortbuf);
322
1.31k
                sortbuf = new_sortbuf;
323
1.31k
              }
324
45.4M
            sortbuf[sortbuf_count].code = uc;
325
45.4M
            sortbuf[sortbuf_count].ccc = ccc;
326
45.4M
            sortbuf_count++;
327
328
45.4M
            i++;
329
45.4M
          }
330
331
57.8M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
18.6M
          break;
334
335
39.2M
        s += count;
336
39.2M
      }
337
18.6M
  }
338
339
18.6M
  if (length == 0)
340
196k
    {
341
196k
      if (result == NULL)
342
196k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
196k
          result = (UNIT *) malloc (1);
345
196k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
196k
        }
351
196k
    }
352
18.4M
  else if (result != resultbuf && length < allocated)
353
18.4M
    {
354
      /* Shrink the allocated memory if possible.  */
355
18.4M
      UNIT *memory;
356
357
18.4M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
18.4M
      if (memory != NULL)
359
18.4M
        result = memory;
360
18.4M
    }
361
362
18.6M
  if (sortbuf_count > 0)
363
0
    abort ();
364
18.6M
  if (sortbuf != sortbuf_preallocated)
365
719
    free (sortbuf);
366
367
18.6M
  *lengthp = length;
368
18.6M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
18.6M
}
u8_normalize
Line
Count
Source
21
5.83M
{
22
5.83M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
5.83M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
5.83M
  UNIT *result;
27
5.83M
  size_t length;
28
5.83M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
5.83M
  #define SORTBUF_PREALLOCATED 64
31
5.83M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
5.83M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
5.83M
  size_t sortbuf_allocated;
34
5.83M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
5.83M
  if (resultbuf == NULL)
38
5.83M
    {
39
5.83M
      result = NULL;
40
5.83M
      allocated = 0;
41
5.83M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
5.83M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
5.83M
  sortbuf = sortbuf_preallocated;
51
5.83M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
5.83M
  sortbuf_count = 0;
53
54
5.83M
  {
55
5.83M
    const UNIT *s_end = s + n;
56
57
5.83M
    for (;;)
58
26.1M
      {
59
26.1M
        int count;
60
26.1M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
26.1M
        int decomposed_count;
62
26.1M
        int i;
63
64
26.1M
        if (s < s_end)
65
20.2M
          {
66
            /* Fetch the next character.  */
67
20.2M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
20.2M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
20.2M
            {
77
20.2M
              int curr;
78
79
49.7M
              for (curr = 0; curr < decomposed_count; )
80
29.5M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
29.5M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
29.5M
                  int curr_decomposed_count;
85
86
29.5M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
29.5M
                  if (curr_decomposed_count >= 0)
88
3.13M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.13M
                      int shift = curr_decomposed_count - 1;
93
94
3.13M
                      if (shift < 0)
95
0
                        abort ();
96
3.13M
                      if (shift > 0)
97
2.46M
                        {
98
2.46M
                          int j;
99
100
2.46M
                          decomposed_count += shift;
101
2.46M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.48M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
21.3k
                            decomposed[j + shift] = decomposed[j];
105
2.46M
                        }
106
12.4M
                      for (; shift >= 0; shift--)
107
9.26M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.13M
                    }
109
26.3M
                  else
110
26.3M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
26.3M
                      curr++;
113
26.3M
                    }
114
29.5M
                }
115
20.2M
            }
116
20.2M
          }
117
5.83M
        else
118
5.83M
          {
119
5.83M
            count = 0;
120
5.83M
            decomposed_count = 0;
121
5.83M
          }
122
123
26.1M
        i = 0;
124
26.1M
        for (;;)
125
52.4M
          {
126
52.4M
            ucs4_t uc;
127
52.4M
            int ccc;
128
129
52.4M
            if (s < s_end)
130
46.6M
              {
131
                /* Fetch the next character from the decomposition.  */
132
46.6M
                if (i == decomposed_count)
133
20.2M
                  break;
134
26.3M
                uc = decomposed[i];
135
26.3M
                ccc = uc_combining_class (uc);
136
26.3M
              }
137
5.83M
            else
138
5.83M
              {
139
                /* End of string reached.  */
140
5.83M
                uc = 0;
141
5.83M
                ccc = 0;
142
5.83M
              }
143
144
32.2M
            if (ccc == 0)
145
29.7M
              {
146
29.7M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
29.7M
                if (sortbuf_count > 1)
151
1.33M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.33M
                                                           sortbuf + sortbuf_count);
153
154
29.7M
                if (composer != NULL)
155
29.7M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
29.7M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
23.8M
                      {
178
26.3M
                        for (j = 1; j < sortbuf_count; )
179
2.50M
                          {
180
2.50M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.35M
                              {
182
1.35M
                                ucs4_t combined =
183
1.35M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.35M
                                if (combined)
185
1.31M
                                  {
186
1.31M
                                    size_t k;
187
188
1.31M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
2.89M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.57M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.31M
                                    sortbuf_count--;
193
1.31M
                                    continue;
194
1.31M
                                  }
195
1.35M
                              }
196
1.18M
                            j++;
197
1.18M
                          }
198
23.8M
                        if (s < s_end && sortbuf_count == 1)
199
18.0M
                          {
200
18.0M
                            ucs4_t combined =
201
18.0M
                              composer (sortbuf[0].code, uc);
202
18.0M
                            if (combined)
203
11.4k
                              {
204
11.4k
                                uc = combined;
205
11.4k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
11.4k
                                sortbuf_count = 0;
210
11.4k
                              }
211
18.0M
                          }
212
23.8M
                      }
213
29.7M
                  }
214
215
54.7M
                for (j = 0; j < sortbuf_count; j++)
216
25.0M
                  {
217
25.0M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
25.0M
                    if (length < allocated)
221
19.2M
                      {
222
19.2M
                        int ret =
223
19.2M
                          U_UCTOMB (result + length, muc, allocated - length);
224
19.2M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
19.2M
                        if (ret >= 0)
230
19.1M
                          {
231
19.1M
                            length += ret;
232
19.1M
                            goto done_appending;
233
19.1M
                          }
234
19.2M
                      }
235
5.88M
                    {
236
5.88M
                      size_t old_allocated = allocated;
237
5.88M
                      size_t new_allocated = 2 * old_allocated;
238
5.88M
                      if (new_allocated < 64)
239
5.83M
                        new_allocated = 64;
240
5.88M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
5.88M
                      {
243
5.88M
                        UNIT *larger_result;
244
5.88M
                        if (result == NULL)
245
5.83M
                          {
246
5.83M
                            larger_result =
247
5.83M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
5.83M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
5.83M
                          }
254
45.7k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
45.7k
                        else
266
45.7k
                          {
267
45.7k
                            larger_result =
268
45.7k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
45.7k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
45.7k
                          }
275
5.88M
                        result = larger_result;
276
5.88M
                        allocated = new_allocated;
277
5.88M
                        {
278
5.88M
                          int ret =
279
5.88M
                            U_UCTOMB (result + length, muc, allocated - length);
280
5.88M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
5.88M
                          if (ret < 0)
286
0
                            abort ();
287
5.88M
                          length += ret;
288
5.88M
                          goto done_appending;
289
5.88M
                        }
290
5.88M
                      }
291
5.88M
                    }
292
25.0M
                   done_appending: ;
293
25.0M
                  }
294
295
                /* sortbuf is now empty.  */
296
29.7M
                sortbuf_count = 0;
297
29.7M
              }
298
299
32.2M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
5.83M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
26.3M
            if (sortbuf_count == sortbuf_allocated)
305
919
              {
306
919
                struct ucs4_with_ccc *new_sortbuf;
307
308
919
                sortbuf_allocated = 2 * sortbuf_allocated;
309
919
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
919
                new_sortbuf =
312
919
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
919
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
919
                memcpy (new_sortbuf, sortbuf,
319
919
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
919
                if (sortbuf != sortbuf_preallocated)
321
600
                  free (sortbuf);
322
919
                sortbuf = new_sortbuf;
323
919
              }
324
26.3M
            sortbuf[sortbuf_count].code = uc;
325
26.3M
            sortbuf[sortbuf_count].ccc = ccc;
326
26.3M
            sortbuf_count++;
327
328
26.3M
            i++;
329
26.3M
          }
330
331
26.1M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
5.83M
          break;
334
335
20.2M
        s += count;
336
20.2M
      }
337
5.83M
  }
338
339
5.83M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
5.83M
  else if (result != resultbuf && length < allocated)
353
5.83M
    {
354
      /* Shrink the allocated memory if possible.  */
355
5.83M
      UNIT *memory;
356
357
5.83M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
5.83M
      if (memory != NULL)
359
5.83M
        result = memory;
360
5.83M
    }
361
362
5.83M
  if (sortbuf_count > 0)
363
0
    abort ();
364
5.83M
  if (sortbuf != sortbuf_preallocated)
365
319
    free (sortbuf);
366
367
5.83M
  *lengthp = length;
368
5.83M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
5.83M
}
u32_normalize
Line
Count
Source
21
12.7M
{
22
12.7M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
12.7M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
12.7M
  UNIT *result;
27
12.7M
  size_t length;
28
12.7M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
12.7M
  #define SORTBUF_PREALLOCATED 64
31
12.7M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
12.7M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
12.7M
  size_t sortbuf_allocated;
34
12.7M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
12.7M
  if (resultbuf == NULL)
38
12.7M
    {
39
12.7M
      result = NULL;
40
12.7M
      allocated = 0;
41
12.7M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
12.7M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
12.7M
  sortbuf = sortbuf_preallocated;
51
12.7M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
12.7M
  sortbuf_count = 0;
53
54
12.7M
  {
55
12.7M
    const UNIT *s_end = s + n;
56
57
12.7M
    for (;;)
58
31.7M
      {
59
31.7M
        int count;
60
31.7M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
31.7M
        int decomposed_count;
62
31.7M
        int i;
63
64
31.7M
        if (s < s_end)
65
18.9M
          {
66
            /* Fetch the next character.  */
67
18.9M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
18.9M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
18.9M
            {
77
18.9M
              int curr;
78
79
38.0M
              for (curr = 0; curr < decomposed_count; )
80
19.0M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
19.0M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
19.0M
                  int curr_decomposed_count;
85
86
19.0M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
19.0M
                  if (curr_decomposed_count >= 0)
88
51.9k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
51.9k
                      int shift = curr_decomposed_count - 1;
93
94
51.9k
                      if (shift < 0)
95
0
                        abort ();
96
51.9k
                      if (shift > 0)
97
51.3k
                        {
98
51.3k
                          int j;
99
100
51.3k
                          decomposed_count += shift;
101
51.3k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
61.5k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
10.1k
                            decomposed[j + shift] = decomposed[j];
105
51.3k
                        }
106
155k
                      for (; shift >= 0; shift--)
107
103k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
51.9k
                    }
109
19.0M
                  else
110
19.0M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
19.0M
                      curr++;
113
19.0M
                    }
114
19.0M
                }
115
18.9M
            }
116
18.9M
          }
117
12.7M
        else
118
12.7M
          {
119
12.7M
            count = 0;
120
12.7M
            decomposed_count = 0;
121
12.7M
          }
122
123
31.7M
        i = 0;
124
31.7M
        for (;;)
125
50.7M
          {
126
50.7M
            ucs4_t uc;
127
50.7M
            int ccc;
128
129
50.7M
            if (s < s_end)
130
37.9M
              {
131
                /* Fetch the next character from the decomposition.  */
132
37.9M
                if (i == decomposed_count)
133
18.9M
                  break;
134
19.0M
                uc = decomposed[i];
135
19.0M
                ccc = uc_combining_class (uc);
136
19.0M
              }
137
12.7M
            else
138
12.7M
              {
139
                /* End of string reached.  */
140
12.7M
                uc = 0;
141
12.7M
                ccc = 0;
142
12.7M
              }
143
144
31.7M
            if (ccc == 0)
145
31.6M
              {
146
31.6M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
31.6M
                if (sortbuf_count > 1)
151
44.9k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
44.9k
                                                           sortbuf + sortbuf_count);
153
154
31.6M
                if (composer != NULL)
155
31.6M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
31.6M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
18.8M
                      {
178
18.9M
                        for (j = 1; j < sortbuf_count; )
179
92.6k
                          {
180
92.6k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
56.0k
                              {
182
56.0k
                                ucs4_t combined =
183
56.0k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
56.0k
                                if (combined)
185
38.4k
                                  {
186
38.4k
                                    size_t k;
187
188
38.4k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
50.7k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
12.2k
                                      sortbuf[k - 1] = sortbuf[k];
192
38.4k
                                    sortbuf_count--;
193
38.4k
                                    continue;
194
38.4k
                                  }
195
56.0k
                              }
196
54.1k
                            j++;
197
54.1k
                          }
198
18.8M
                        if (s < s_end && sortbuf_count == 1)
199
6.32M
                          {
200
6.32M
                            ucs4_t combined =
201
6.32M
                              composer (sortbuf[0].code, uc);
202
6.32M
                            if (combined)
203
10.5k
                              {
204
10.5k
                                uc = combined;
205
10.5k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
10.5k
                                sortbuf_count = 0;
210
10.5k
                              }
211
6.32M
                          }
212
18.8M
                      }
213
31.6M
                  }
214
215
50.6M
                for (j = 0; j < sortbuf_count; j++)
216
18.9M
                  {
217
18.9M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
18.9M
                    if (length < allocated)
221
6.40M
                      {
222
6.40M
                        int ret =
223
6.40M
                          U_UCTOMB (result + length, muc, allocated - length);
224
6.40M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
6.40M
                        if (ret >= 0)
230
6.40M
                          {
231
6.40M
                            length += ret;
232
6.40M
                            goto done_appending;
233
6.40M
                          }
234
6.40M
                      }
235
12.5M
                    {
236
12.5M
                      size_t old_allocated = allocated;
237
12.5M
                      size_t new_allocated = 2 * old_allocated;
238
12.5M
                      if (new_allocated < 64)
239
12.5M
                        new_allocated = 64;
240
12.5M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
12.5M
                      {
243
12.5M
                        UNIT *larger_result;
244
12.5M
                        if (result == NULL)
245
12.5M
                          {
246
12.5M
                            larger_result =
247
12.5M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
12.5M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
12.5M
                          }
254
3.18k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
3.18k
                        else
266
3.18k
                          {
267
3.18k
                            larger_result =
268
3.18k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
3.18k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
3.18k
                          }
275
12.5M
                        result = larger_result;
276
12.5M
                        allocated = new_allocated;
277
12.5M
                        {
278
12.5M
                          int ret =
279
12.5M
                            U_UCTOMB (result + length, muc, allocated - length);
280
12.5M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
12.5M
                          if (ret < 0)
286
0
                            abort ();
287
12.5M
                          length += ret;
288
12.5M
                          goto done_appending;
289
12.5M
                        }
290
12.5M
                      }
291
12.5M
                    }
292
18.9M
                   done_appending: ;
293
18.9M
                  }
294
295
                /* sortbuf is now empty.  */
296
31.6M
                sortbuf_count = 0;
297
31.6M
              }
298
299
31.7M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
12.7M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
19.0M
            if (sortbuf_count == sortbuf_allocated)
305
400
              {
306
400
                struct ucs4_with_ccc *new_sortbuf;
307
308
400
                sortbuf_allocated = 2 * sortbuf_allocated;
309
400
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
400
                new_sortbuf =
312
400
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
400
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
400
                memcpy (new_sortbuf, sortbuf,
319
400
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
400
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
400
                sortbuf = new_sortbuf;
323
400
              }
324
19.0M
            sortbuf[sortbuf_count].code = uc;
325
19.0M
            sortbuf[sortbuf_count].ccc = ccc;
326
19.0M
            sortbuf_count++;
327
328
19.0M
            i++;
329
19.0M
          }
330
331
31.7M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
12.7M
          break;
334
335
18.9M
        s += count;
336
18.9M
      }
337
12.7M
  }
338
339
12.7M
  if (length == 0)
340
196k
    {
341
196k
      if (result == NULL)
342
196k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
196k
          result = (UNIT *) malloc (1);
345
196k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
196k
        }
351
196k
    }
352
12.5M
  else if (result != resultbuf && length < allocated)
353
12.5M
    {
354
      /* Shrink the allocated memory if possible.  */
355
12.5M
      UNIT *memory;
356
357
12.5M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
12.5M
      if (memory != NULL)
359
12.5M
        result = memory;
360
12.5M
    }
361
362
12.7M
  if (sortbuf_count > 0)
363
0
    abort ();
364
12.7M
  if (sortbuf != sortbuf_preallocated)
365
400
    free (sortbuf);
366
367
12.7M
  *lengthp = length;
368
12.7M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
12.7M
}