Coverage Report

Created: 2026-02-20 06:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
9.45M
{
22
9.45M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
9.45M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
9.45M
  UNIT *result;
27
9.45M
  size_t allocated;
28
9.45M
  if (resultbuf == NULL)
29
9.45M
    {
30
9.45M
      result = NULL;
31
9.45M
      allocated = 0;
32
9.45M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
9.45M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
9.45M
  #define SORTBUF_PREALLOCATED 64
42
9.45M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
9.45M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
9.45M
    sortbuf_preallocated;
45
9.45M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
9.45M
  size_t sortbuf_count = 0;
47
48
9.45M
  {
49
9.45M
    const UNIT *s_end = s + n;
50
51
9.45M
    for (;;)
52
32.5M
      {
53
32.5M
        int count;
54
32.5M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
32.5M
        int decomposed_count;
56
57
32.5M
        if (s < s_end)
58
23.1M
          {
59
            /* Fetch the next character.  */
60
23.1M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
23.1M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
55.7M
            for (int curr = 0; curr < decomposed_count; )
70
32.6M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
32.6M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
32.6M
                int curr_decomposed_count;
75
76
32.6M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
32.6M
                if (curr_decomposed_count >= 0)
78
3.00M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
3.00M
                    int shift = curr_decomposed_count - 1;
83
84
3.00M
                    if (shift < 0)
85
0
                      abort ();
86
3.00M
                    if (shift > 0)
87
2.60M
                      {
88
2.60M
                        decomposed_count += shift;
89
2.60M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.62M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
27.7k
                          decomposed[j + shift] = decomposed[j];
93
2.60M
                      }
94
12.5M
                    for (; shift >= 0; shift--)
95
9.50M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
3.00M
                  }
97
29.6M
                else
98
29.6M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
29.6M
                    curr++;
101
29.6M
                  }
102
32.6M
              }
103
23.1M
          }
104
9.45M
        else
105
9.45M
          {
106
9.45M
            count = 0;
107
9.45M
            decomposed_count = 0;
108
9.45M
          }
109
110
32.5M
        int i = 0;
111
32.5M
        for (;;)
112
62.2M
          {
113
62.2M
            ucs4_t uc;
114
62.2M
            int ccc;
115
116
62.2M
            if (s < s_end)
117
52.7M
              {
118
                /* Fetch the next character from the decomposition.  */
119
52.7M
                if (i == decomposed_count)
120
23.1M
                  break;
121
29.6M
                uc = decomposed[i];
122
29.6M
                ccc = uc_combining_class (uc);
123
29.6M
              }
124
9.45M
            else
125
9.45M
              {
126
                /* End of string reached.  */
127
9.45M
                uc = 0;
128
9.45M
                ccc = 0;
129
9.45M
              }
130
131
39.0M
            if (ccc == 0)
132
36.1M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
36.1M
                if (sortbuf_count > 1)
136
1.44M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.44M
                                                           sortbuf + sortbuf_count);
138
139
36.1M
                if (composer != NULL)
140
36.1M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
36.1M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
26.6M
                      {
163
29.5M
                        for (size_t j = 1; j < sortbuf_count; )
164
2.90M
                          {
165
2.90M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.47M
                              {
167
1.47M
                                ucs4_t combined =
168
1.47M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.47M
                                if (combined)
170
1.42M
                                  {
171
1.42M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.33M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
1.90M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.42M
                                    sortbuf_count--;
176
1.42M
                                    continue;
177
1.42M
                                  }
178
1.47M
                              }
179
1.47M
                            j++;
180
1.47M
                          }
181
26.6M
                        if (s < s_end && sortbuf_count == 1)
182
17.3M
                          {
183
17.3M
                            ucs4_t combined =
184
17.3M
                              composer (sortbuf[0].code, uc);
185
17.3M
                            if (combined)
186
21.0k
                              {
187
21.0k
                                uc = combined;
188
21.0k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
21.0k
                                sortbuf_count = 0;
193
21.0k
                              }
194
17.3M
                          }
195
26.6M
                      }
196
36.1M
                  }
197
198
64.3M
                for (size_t j = 0; j < sortbuf_count; j++)
199
28.1M
                  {
200
28.1M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
28.1M
                    if (length < allocated)
204
18.8M
                      {
205
18.8M
                        int ret =
206
18.8M
                          U_UCTOMB (result + length, muc, allocated - length);
207
18.8M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
18.8M
                        if (ret >= 0)
213
18.8M
                          {
214
18.8M
                            length += ret;
215
18.8M
                            goto done_appending;
216
18.8M
                          }
217
18.8M
                      }
218
9.32M
                    {
219
9.32M
                      size_t old_allocated = allocated;
220
9.32M
                      size_t new_allocated = 2 * old_allocated;
221
9.32M
                      if (new_allocated < 64)
222
9.28M
                        new_allocated = 64;
223
9.32M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
9.32M
                      {
226
9.32M
                        UNIT *larger_result;
227
9.32M
                        if (result == NULL)
228
9.28M
                          {
229
9.28M
                            larger_result =
230
9.28M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
9.28M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
9.28M
                          }
237
36.1k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
36.1k
                        else
249
36.1k
                          {
250
36.1k
                            larger_result =
251
36.1k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
36.1k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
36.1k
                          }
258
9.32M
                        result = larger_result;
259
9.32M
                        allocated = new_allocated;
260
9.32M
                        {
261
9.32M
                          int ret =
262
9.32M
                            U_UCTOMB (result + length, muc, allocated - length);
263
9.32M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
9.32M
                          if (ret < 0)
269
0
                            abort ();
270
9.32M
                          length += ret;
271
9.32M
                          goto done_appending;
272
9.32M
                        }
273
9.32M
                      }
274
9.32M
                    }
275
28.1M
                   done_appending: ;
276
28.1M
                  }
277
278
                /* sortbuf is now empty.  */
279
36.1M
                sortbuf_count = 0;
280
36.1M
              }
281
282
39.0M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
9.45M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
29.6M
            if (sortbuf_count == sortbuf_allocated)
288
2.41k
              {
289
2.41k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
2.41k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
2.41k
                struct ucs4_with_ccc *new_sortbuf =
293
2.41k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
2.41k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
2.41k
                memcpy (new_sortbuf, sortbuf,
300
2.41k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
2.41k
                if (sortbuf != sortbuf_preallocated)
302
2.41k
                  free (sortbuf);
303
2.41k
                sortbuf = new_sortbuf;
304
2.41k
              }
305
29.6M
            sortbuf[sortbuf_count].code = uc;
306
29.6M
            sortbuf[sortbuf_count].ccc = ccc;
307
29.6M
            sortbuf_count++;
308
309
29.6M
            i++;
310
29.6M
          }
311
312
32.5M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
9.45M
          break;
315
316
23.1M
        s += count;
317
23.1M
      }
318
9.45M
  }
319
320
9.45M
  if (length == 0)
321
168k
    {
322
168k
      if (result == NULL)
323
168k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
168k
          result = (UNIT *) malloc (1);
326
168k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
168k
        }
332
168k
    }
333
9.28M
  else if (result != resultbuf && length < allocated)
334
9.28M
    {
335
      /* Shrink the allocated memory if possible.  */
336
9.28M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
9.28M
      if (memory != NULL)
338
9.28M
        result = memory;
339
9.28M
    }
340
341
9.45M
  if (sortbuf_count > 0)
342
0
    abort ();
343
9.45M
  if (sortbuf != sortbuf_preallocated)
344
9.45M
    free (sortbuf);
345
346
9.45M
  *lengthp = length;
347
9.45M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
9.45M
}
u8_normalize
Line
Count
Source
21
3.08M
{
22
3.08M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
3.08M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
3.08M
  UNIT *result;
27
3.08M
  size_t allocated;
28
3.08M
  if (resultbuf == NULL)
29
3.08M
    {
30
3.08M
      result = NULL;
31
3.08M
      allocated = 0;
32
3.08M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
3.08M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
3.08M
  #define SORTBUF_PREALLOCATED 64
42
3.08M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
3.08M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
3.08M
    sortbuf_preallocated;
45
3.08M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
3.08M
  size_t sortbuf_count = 0;
47
48
3.08M
  {
49
3.08M
    const UNIT *s_end = s + n;
50
51
3.08M
    for (;;)
52
16.7M
      {
53
16.7M
        int count;
54
16.7M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
16.7M
        int decomposed_count;
56
57
16.7M
        if (s < s_end)
58
13.6M
          {
59
            /* Fetch the next character.  */
60
13.6M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
13.6M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
36.8M
            for (int curr = 0; curr < decomposed_count; )
70
23.1M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
23.1M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
23.1M
                int curr_decomposed_count;
75
76
23.1M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
23.1M
                if (curr_decomposed_count >= 0)
78
2.96M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
2.96M
                    int shift = curr_decomposed_count - 1;
83
84
2.96M
                    if (shift < 0)
85
0
                      abort ();
86
2.96M
                    if (shift > 0)
87
2.56M
                      {
88
2.56M
                        decomposed_count += shift;
89
2.56M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.58M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
17.4k
                          decomposed[j + shift] = decomposed[j];
93
2.56M
                      }
94
12.4M
                    for (; shift >= 0; shift--)
95
9.43M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
2.96M
                  }
97
20.1M
                else
98
20.1M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
20.1M
                    curr++;
101
20.1M
                  }
102
23.1M
              }
103
13.6M
          }
104
3.08M
        else
105
3.08M
          {
106
3.08M
            count = 0;
107
3.08M
            decomposed_count = 0;
108
3.08M
          }
109
110
16.7M
        int i = 0;
111
16.7M
        for (;;)
112
36.9M
          {
113
36.9M
            ucs4_t uc;
114
36.9M
            int ccc;
115
116
36.9M
            if (s < s_end)
117
33.8M
              {
118
                /* Fetch the next character from the decomposition.  */
119
33.8M
                if (i == decomposed_count)
120
13.6M
                  break;
121
20.1M
                uc = decomposed[i];
122
20.1M
                ccc = uc_combining_class (uc);
123
20.1M
              }
124
3.08M
            else
125
3.08M
              {
126
                /* End of string reached.  */
127
3.08M
                uc = 0;
128
3.08M
                ccc = 0;
129
3.08M
              }
130
131
23.2M
            if (ccc == 0)
132
20.4M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
20.4M
                if (sortbuf_count > 1)
136
1.41M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.41M
                                                           sortbuf + sortbuf_count);
138
139
20.4M
                if (composer != NULL)
140
20.4M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
20.4M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
17.3M
                      {
163
20.1M
                        for (size_t j = 1; j < sortbuf_count; )
164
2.77M
                          {
165
2.77M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.43M
                              {
167
1.43M
                                ucs4_t combined =
168
1.43M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.43M
                                if (combined)
170
1.40M
                                  {
171
1.40M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.29M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
1.89M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.40M
                                    sortbuf_count--;
176
1.40M
                                    continue;
177
1.40M
                                  }
178
1.43M
                              }
179
1.36M
                            j++;
180
1.36M
                          }
181
17.3M
                        if (s < s_end && sortbuf_count == 1)
182
14.2M
                          {
183
14.2M
                            ucs4_t combined =
184
14.2M
                              composer (sortbuf[0].code, uc);
185
14.2M
                            if (combined)
186
10.1k
                              {
187
10.1k
                                uc = combined;
188
10.1k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
10.1k
                                sortbuf_count = 0;
193
10.1k
                              }
194
14.2M
                          }
195
17.3M
                      }
196
20.4M
                  }
197
198
39.1M
                for (size_t j = 0; j < sortbuf_count; j++)
199
18.7M
                  {
200
18.7M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
18.7M
                    if (length < allocated)
204
15.6M
                      {
205
15.6M
                        int ret =
206
15.6M
                          U_UCTOMB (result + length, muc, allocated - length);
207
15.6M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
15.6M
                        if (ret >= 0)
213
15.6M
                          {
214
15.6M
                            length += ret;
215
15.6M
                            goto done_appending;
216
15.6M
                          }
217
15.6M
                      }
218
3.11M
                    {
219
3.11M
                      size_t old_allocated = allocated;
220
3.11M
                      size_t new_allocated = 2 * old_allocated;
221
3.11M
                      if (new_allocated < 64)
222
3.08M
                        new_allocated = 64;
223
3.11M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
3.11M
                      {
226
3.11M
                        UNIT *larger_result;
227
3.11M
                        if (result == NULL)
228
3.08M
                          {
229
3.08M
                            larger_result =
230
3.08M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
3.08M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
3.08M
                          }
237
32.1k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
32.1k
                        else
249
32.1k
                          {
250
32.1k
                            larger_result =
251
32.1k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
32.1k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
32.1k
                          }
258
3.11M
                        result = larger_result;
259
3.11M
                        allocated = new_allocated;
260
3.11M
                        {
261
3.11M
                          int ret =
262
3.11M
                            U_UCTOMB (result + length, muc, allocated - length);
263
3.11M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
3.11M
                          if (ret < 0)
269
0
                            abort ();
270
3.11M
                          length += ret;
271
3.11M
                          goto done_appending;
272
3.11M
                        }
273
3.11M
                      }
274
3.11M
                    }
275
18.7M
                   done_appending: ;
276
18.7M
                  }
277
278
                /* sortbuf is now empty.  */
279
20.4M
                sortbuf_count = 0;
280
20.4M
              }
281
282
23.2M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
3.08M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
20.1M
            if (sortbuf_count == sortbuf_allocated)
288
1.25k
              {
289
1.25k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.25k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.25k
                struct ucs4_with_ccc *new_sortbuf =
293
1.25k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.25k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.25k
                memcpy (new_sortbuf, sortbuf,
300
1.25k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.25k
                if (sortbuf != sortbuf_preallocated)
302
1.25k
                  free (sortbuf);
303
1.25k
                sortbuf = new_sortbuf;
304
1.25k
              }
305
20.1M
            sortbuf[sortbuf_count].code = uc;
306
20.1M
            sortbuf[sortbuf_count].ccc = ccc;
307
20.1M
            sortbuf_count++;
308
309
20.1M
            i++;
310
20.1M
          }
311
312
16.7M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
3.08M
          break;
315
316
13.6M
        s += count;
317
13.6M
      }
318
3.08M
  }
319
320
3.08M
  if (length == 0)
321
0
    {
322
0
      if (result == NULL)
323
0
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
0
          result = (UNIT *) malloc (1);
326
0
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
0
        }
332
0
    }
333
3.08M
  else if (result != resultbuf && length < allocated)
334
3.08M
    {
335
      /* Shrink the allocated memory if possible.  */
336
3.08M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
3.08M
      if (memory != NULL)
338
3.08M
        result = memory;
339
3.08M
    }
340
341
3.08M
  if (sortbuf_count > 0)
342
0
    abort ();
343
3.08M
  if (sortbuf != sortbuf_preallocated)
344
3.08M
    free (sortbuf);
345
346
3.08M
  *lengthp = length;
347
3.08M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
3.08M
}
u32_normalize
Line
Count
Source
21
6.37M
{
22
6.37M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
6.37M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
6.37M
  UNIT *result;
27
6.37M
  size_t allocated;
28
6.37M
  if (resultbuf == NULL)
29
6.37M
    {
30
6.37M
      result = NULL;
31
6.37M
      allocated = 0;
32
6.37M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
6.37M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
6.37M
  #define SORTBUF_PREALLOCATED 64
42
6.37M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
6.37M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
6.37M
    sortbuf_preallocated;
45
6.37M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
6.37M
  size_t sortbuf_count = 0;
47
48
6.37M
  {
49
6.37M
    const UNIT *s_end = s + n;
50
51
6.37M
    for (;;)
52
15.8M
      {
53
15.8M
        int count;
54
15.8M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
15.8M
        int decomposed_count;
56
57
15.8M
        if (s < s_end)
58
9.43M
          {
59
            /* Fetch the next character.  */
60
9.43M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
9.43M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
18.9M
            for (int curr = 0; curr < decomposed_count; )
70
9.50M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
9.50M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
9.50M
                int curr_decomposed_count;
75
76
9.50M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
9.50M
                if (curr_decomposed_count >= 0)
78
35.1k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
35.1k
                    int shift = curr_decomposed_count - 1;
83
84
35.1k
                    if (shift < 0)
85
0
                      abort ();
86
35.1k
                    if (shift > 0)
87
34.8k
                      {
88
34.8k
                        decomposed_count += shift;
89
34.8k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
45.1k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
10.3k
                          decomposed[j + shift] = decomposed[j];
93
34.8k
                      }
94
105k
                    for (; shift >= 0; shift--)
95
70.0k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
35.1k
                  }
97
9.46M
                else
98
9.46M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
9.46M
                    curr++;
101
9.46M
                  }
102
9.50M
              }
103
9.43M
          }
104
6.37M
        else
105
6.37M
          {
106
6.37M
            count = 0;
107
6.37M
            decomposed_count = 0;
108
6.37M
          }
109
110
15.8M
        int i = 0;
111
15.8M
        for (;;)
112
25.2M
          {
113
25.2M
            ucs4_t uc;
114
25.2M
            int ccc;
115
116
25.2M
            if (s < s_end)
117
18.8M
              {
118
                /* Fetch the next character from the decomposition.  */
119
18.8M
                if (i == decomposed_count)
120
9.43M
                  break;
121
9.46M
                uc = decomposed[i];
122
9.46M
                ccc = uc_combining_class (uc);
123
9.46M
              }
124
6.37M
            else
125
6.37M
              {
126
                /* End of string reached.  */
127
6.37M
                uc = 0;
128
6.37M
                ccc = 0;
129
6.37M
              }
130
131
15.8M
            if (ccc == 0)
132
15.6M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
15.6M
                if (sortbuf_count > 1)
136
26.0k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
26.0k
                                                           sortbuf + sortbuf_count);
138
139
15.6M
                if (composer != NULL)
140
15.6M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
15.6M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
9.31M
                      {
163
9.44M
                        for (size_t j = 1; j < sortbuf_count; )
164
128k
                          {
165
128k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
35.6k
                              {
167
35.6k
                                ucs4_t combined =
168
35.6k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
35.6k
                                if (combined)
170
21.8k
                                  {
171
21.8k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
34.8k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
13.0k
                                      sortbuf[k - 1] = sortbuf[k];
175
21.8k
                                    sortbuf_count--;
176
21.8k
                                    continue;
177
21.8k
                                  }
178
35.6k
                              }
179
106k
                            j++;
180
106k
                          }
181
9.31M
                        if (s < s_end && sortbuf_count == 1)
182
3.10M
                          {
183
3.10M
                            ucs4_t combined =
184
3.10M
                              composer (sortbuf[0].code, uc);
185
3.10M
                            if (combined)
186
10.8k
                              {
187
10.8k
                                uc = combined;
188
10.8k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
10.8k
                                sortbuf_count = 0;
193
10.8k
                              }
194
3.10M
                          }
195
9.31M
                      }
196
15.6M
                  }
197
198
25.1M
                for (size_t j = 0; j < sortbuf_count; j++)
199
9.43M
                  {
200
9.43M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
9.43M
                    if (length < allocated)
204
3.22M
                      {
205
3.22M
                        int ret =
206
3.22M
                          U_UCTOMB (result + length, muc, allocated - length);
207
3.22M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
3.22M
                        if (ret >= 0)
213
3.22M
                          {
214
3.22M
                            length += ret;
215
3.22M
                            goto done_appending;
216
3.22M
                          }
217
3.22M
                      }
218
6.21M
                    {
219
6.21M
                      size_t old_allocated = allocated;
220
6.21M
                      size_t new_allocated = 2 * old_allocated;
221
6.21M
                      if (new_allocated < 64)
222
6.20M
                        new_allocated = 64;
223
6.21M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
6.21M
                      {
226
6.21M
                        UNIT *larger_result;
227
6.21M
                        if (result == NULL)
228
6.20M
                          {
229
6.20M
                            larger_result =
230
6.20M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
6.20M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
6.20M
                          }
237
4.03k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
4.03k
                        else
249
4.03k
                          {
250
4.03k
                            larger_result =
251
4.03k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
4.03k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
4.03k
                          }
258
6.21M
                        result = larger_result;
259
6.21M
                        allocated = new_allocated;
260
6.21M
                        {
261
6.21M
                          int ret =
262
6.21M
                            U_UCTOMB (result + length, muc, allocated - length);
263
6.21M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
6.21M
                          if (ret < 0)
269
0
                            abort ();
270
6.21M
                          length += ret;
271
6.21M
                          goto done_appending;
272
6.21M
                        }
273
6.21M
                      }
274
6.21M
                    }
275
9.43M
                   done_appending: ;
276
9.43M
                  }
277
278
                /* sortbuf is now empty.  */
279
15.6M
                sortbuf_count = 0;
280
15.6M
              }
281
282
15.8M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
6.37M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
9.46M
            if (sortbuf_count == sortbuf_allocated)
288
1.16k
              {
289
1.16k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.16k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.16k
                struct ucs4_with_ccc *new_sortbuf =
293
1.16k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.16k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.16k
                memcpy (new_sortbuf, sortbuf,
300
1.16k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.16k
                if (sortbuf != sortbuf_preallocated)
302
1.16k
                  free (sortbuf);
303
1.16k
                sortbuf = new_sortbuf;
304
1.16k
              }
305
9.46M
            sortbuf[sortbuf_count].code = uc;
306
9.46M
            sortbuf[sortbuf_count].ccc = ccc;
307
9.46M
            sortbuf_count++;
308
309
9.46M
            i++;
310
9.46M
          }
311
312
15.8M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
6.37M
          break;
315
316
9.43M
        s += count;
317
9.43M
      }
318
6.37M
  }
319
320
6.37M
  if (length == 0)
321
168k
    {
322
168k
      if (result == NULL)
323
168k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
168k
          result = (UNIT *) malloc (1);
326
168k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
168k
        }
332
168k
    }
333
6.20M
  else if (result != resultbuf && length < allocated)
334
6.20M
    {
335
      /* Shrink the allocated memory if possible.  */
336
6.20M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
6.20M
      if (memory != NULL)
338
6.20M
        result = memory;
339
6.20M
    }
340
341
6.37M
  if (sortbuf_count > 0)
342
0
    abort ();
343
6.37M
  if (sortbuf != sortbuf_preallocated)
344
6.37M
    free (sortbuf);
345
346
6.37M
  *lengthp = length;
347
6.37M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
6.37M
}