Coverage Report

Created: 2026-01-06 07:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
18.4M
{
22
18.4M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
18.4M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
18.4M
  UNIT *result;
27
18.4M
  size_t allocated;
28
18.4M
  if (resultbuf == NULL)
29
18.4M
    {
30
18.4M
      result = NULL;
31
18.4M
      allocated = 0;
32
18.4M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
18.4M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
18.4M
  #define SORTBUF_PREALLOCATED 64
42
18.4M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
18.4M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
18.4M
    sortbuf_preallocated;
45
18.4M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
18.4M
  size_t sortbuf_count = 0;
47
48
18.4M
  {
49
18.4M
    const UNIT *s_end = s + n;
50
51
18.4M
    for (;;)
52
54.6M
      {
53
54.6M
        int count;
54
54.6M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
54.6M
        int decomposed_count;
56
57
54.6M
        if (s < s_end)
58
36.2M
          {
59
            /* Fetch the next character.  */
60
36.2M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
36.2M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
82.1M
            for (int curr = 0; curr < decomposed_count; )
70
45.8M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
45.8M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
45.8M
                int curr_decomposed_count;
75
76
45.8M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
45.8M
                if (curr_decomposed_count >= 0)
78
3.34M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
3.34M
                    int shift = curr_decomposed_count - 1;
83
84
3.34M
                    if (shift < 0)
85
0
                      abort ();
86
3.34M
                    if (shift > 0)
87
2.87M
                      {
88
2.87M
                        decomposed_count += shift;
89
2.87M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.90M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
26.5k
                          decomposed[j + shift] = decomposed[j];
93
2.87M
                      }
94
13.0M
                    for (; shift >= 0; shift--)
95
9.66M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
3.34M
                  }
97
42.5M
                else
98
42.5M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
42.5M
                    curr++;
101
42.5M
                  }
102
45.8M
              }
103
36.2M
          }
104
18.4M
        else
105
18.4M
          {
106
18.4M
            count = 0;
107
18.4M
            decomposed_count = 0;
108
18.4M
          }
109
110
54.6M
        int i = 0;
111
54.6M
        for (;;)
112
97.2M
          {
113
97.2M
            ucs4_t uc;
114
97.2M
            int ccc;
115
116
97.2M
            if (s < s_end)
117
78.7M
              {
118
                /* Fetch the next character from the decomposition.  */
119
78.7M
                if (i == decomposed_count)
120
36.2M
                  break;
121
42.5M
                uc = decomposed[i];
122
42.5M
                ccc = uc_combining_class (uc);
123
42.5M
              }
124
18.4M
            else
125
18.4M
              {
126
                /* End of string reached.  */
127
18.4M
                uc = 0;
128
18.4M
                ccc = 0;
129
18.4M
              }
130
131
61.0M
            if (ccc == 0)
132
57.8M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
57.8M
                if (sortbuf_count > 1)
136
1.66M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.66M
                                                           sortbuf + sortbuf_count);
138
139
57.8M
                if (composer != NULL)
140
57.8M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
57.8M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
39.4M
                      {
163
42.5M
                        for (size_t j = 1; j < sortbuf_count; )
164
3.11M
                          {
165
3.11M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.69M
                              {
167
1.69M
                                ucs4_t combined =
168
1.69M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.69M
                                if (combined)
170
1.64M
                                  {
171
1.64M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.50M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
1.86M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.64M
                                    sortbuf_count--;
176
1.64M
                                    continue;
177
1.64M
                                  }
178
1.69M
                              }
179
1.47M
                            j++;
180
1.47M
                          }
181
39.4M
                        if (s < s_end && sortbuf_count == 1)
182
21.0M
                          {
183
21.0M
                            ucs4_t combined =
184
21.0M
                              composer (sortbuf[0].code, uc);
185
21.0M
                            if (combined)
186
18.4k
                              {
187
18.4k
                                uc = combined;
188
18.4k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
18.4k
                                sortbuf_count = 0;
193
18.4k
                              }
194
21.0M
                          }
195
39.4M
                      }
196
57.8M
                  }
197
198
98.7M
                for (size_t j = 0; j < sortbuf_count; j++)
199
40.8M
                  {
200
40.8M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
40.8M
                    if (length < allocated)
204
22.5M
                      {
205
22.5M
                        int ret =
206
22.5M
                          U_UCTOMB (result + length, muc, allocated - length);
207
22.5M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
22.5M
                        if (ret >= 0)
213
22.5M
                          {
214
22.5M
                            length += ret;
215
22.5M
                            goto done_appending;
216
22.5M
                          }
217
22.5M
                      }
218
18.3M
                    {
219
18.3M
                      size_t old_allocated = allocated;
220
18.3M
                      size_t new_allocated = 2 * old_allocated;
221
18.3M
                      if (new_allocated < 64)
222
18.3M
                        new_allocated = 64;
223
18.3M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
18.3M
                      {
226
18.3M
                        UNIT *larger_result;
227
18.3M
                        if (result == NULL)
228
18.3M
                          {
229
18.3M
                            larger_result =
230
18.3M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
18.3M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
18.3M
                          }
237
15.8k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
15.8k
                        else
249
15.8k
                          {
250
15.8k
                            larger_result =
251
15.8k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
15.8k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
15.8k
                          }
258
18.3M
                        result = larger_result;
259
18.3M
                        allocated = new_allocated;
260
18.3M
                        {
261
18.3M
                          int ret =
262
18.3M
                            U_UCTOMB (result + length, muc, allocated - length);
263
18.3M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
18.3M
                          if (ret < 0)
269
0
                            abort ();
270
18.3M
                          length += ret;
271
18.3M
                          goto done_appending;
272
18.3M
                        }
273
18.3M
                      }
274
18.3M
                    }
275
40.8M
                   done_appending: ;
276
40.8M
                  }
277
278
                /* sortbuf is now empty.  */
279
57.8M
                sortbuf_count = 0;
280
57.8M
              }
281
282
61.0M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
18.4M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
42.5M
            if (sortbuf_count == sortbuf_allocated)
288
1.94k
              {
289
1.94k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.94k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.94k
                struct ucs4_with_ccc *new_sortbuf =
293
1.94k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.94k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.94k
                memcpy (new_sortbuf, sortbuf,
300
1.94k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.94k
                if (sortbuf != sortbuf_preallocated)
302
1.94k
                  free (sortbuf);
303
1.94k
                sortbuf = new_sortbuf;
304
1.94k
              }
305
42.5M
            sortbuf[sortbuf_count].code = uc;
306
42.5M
            sortbuf[sortbuf_count].ccc = ccc;
307
42.5M
            sortbuf_count++;
308
309
42.5M
            i++;
310
42.5M
          }
311
312
54.6M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
18.4M
          break;
315
316
36.2M
        s += count;
317
36.2M
      }
318
18.4M
  }
319
320
18.4M
  if (length == 0)
321
101k
    {
322
101k
      if (result == NULL)
323
101k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
101k
          result = (UNIT *) malloc (1);
326
101k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
101k
        }
332
101k
    }
333
18.3M
  else if (result != resultbuf && length < allocated)
334
18.3M
    {
335
      /* Shrink the allocated memory if possible.  */
336
18.3M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
18.3M
      if (memory != NULL)
338
18.3M
        result = memory;
339
18.3M
    }
340
341
18.4M
  if (sortbuf_count > 0)
342
0
    abort ();
343
18.4M
  if (sortbuf != sortbuf_preallocated)
344
18.4M
    free (sortbuf);
345
346
18.4M
  *lengthp = length;
347
18.4M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
18.4M
}
u8_normalize
Line
Count
Source
21
6.13M
{
22
6.13M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
6.13M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
6.13M
  UNIT *result;
27
6.13M
  size_t allocated;
28
6.13M
  if (resultbuf == NULL)
29
6.13M
    {
30
6.13M
      result = NULL;
31
6.13M
      allocated = 0;
32
6.13M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
6.13M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
6.13M
  #define SORTBUF_PREALLOCATED 64
42
6.13M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
6.13M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
6.13M
    sortbuf_preallocated;
45
6.13M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
6.13M
  size_t sortbuf_count = 0;
47
48
6.13M
  {
49
6.13M
    const UNIT *s_end = s + n;
50
51
6.13M
    for (;;)
52
25.1M
      {
53
25.1M
        int count;
54
25.1M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
25.1M
        int decomposed_count;
56
57
25.1M
        if (s < s_end)
58
19.0M
          {
59
            /* Fetch the next character.  */
60
19.0M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
19.0M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
47.6M
            for (int curr = 0; curr < decomposed_count; )
70
28.6M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
28.6M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
28.6M
                int curr_decomposed_count;
75
76
28.6M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
28.6M
                if (curr_decomposed_count >= 0)
78
3.30M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
3.30M
                    int shift = curr_decomposed_count - 1;
83
84
3.30M
                    if (shift < 0)
85
0
                      abort ();
86
3.30M
                    if (shift > 0)
87
2.83M
                      {
88
2.83M
                        decomposed_count += shift;
89
2.83M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.85M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
16.4k
                          decomposed[j + shift] = decomposed[j];
93
2.83M
                      }
94
12.9M
                    for (; shift >= 0; shift--)
95
9.59M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
3.30M
                  }
97
25.3M
                else
98
25.3M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
25.3M
                    curr++;
101
25.3M
                  }
102
28.6M
              }
103
19.0M
          }
104
6.13M
        else
105
6.13M
          {
106
6.13M
            count = 0;
107
6.13M
            decomposed_count = 0;
108
6.13M
          }
109
110
25.1M
        int i = 0;
111
25.1M
        for (;;)
112
50.4M
          {
113
50.4M
            ucs4_t uc;
114
50.4M
            int ccc;
115
116
50.4M
            if (s < s_end)
117
44.3M
              {
118
                /* Fetch the next character from the decomposition.  */
119
44.3M
                if (i == decomposed_count)
120
19.0M
                  break;
121
25.3M
                uc = decomposed[i];
122
25.3M
                ccc = uc_combining_class (uc);
123
25.3M
              }
124
6.13M
            else
125
6.13M
              {
126
                /* End of string reached.  */
127
6.13M
                uc = 0;
128
6.13M
                ccc = 0;
129
6.13M
              }
130
131
31.4M
            if (ccc == 0)
132
28.4M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
28.4M
                if (sortbuf_count > 1)
136
1.63M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.63M
                                                           sortbuf + sortbuf_count);
138
139
28.4M
                if (composer != NULL)
140
28.4M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
28.4M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
22.2M
                      {
163
25.3M
                        for (size_t j = 1; j < sortbuf_count; )
164
3.01M
                          {
165
3.01M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.65M
                              {
167
1.65M
                                ucs4_t combined =
168
1.65M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.65M
                                if (combined)
170
1.61M
                                  {
171
1.61M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.47M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
1.85M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.61M
                                    sortbuf_count--;
176
1.61M
                                    continue;
177
1.61M
                                  }
178
1.65M
                              }
179
1.39M
                            j++;
180
1.39M
                          }
181
22.2M
                        if (s < s_end && sortbuf_count == 1)
182
16.1M
                          {
183
16.1M
                            ucs4_t combined =
184
16.1M
                              composer (sortbuf[0].code, uc);
185
16.1M
                            if (combined)
186
8.51k
                              {
187
8.51k
                                uc = combined;
188
8.51k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
8.51k
                                sortbuf_count = 0;
193
8.51k
                              }
194
16.1M
                          }
195
22.2M
                      }
196
28.4M
                  }
197
198
52.1M
                for (size_t j = 0; j < sortbuf_count; j++)
199
23.6M
                  {
200
23.6M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
23.6M
                    if (length < allocated)
204
17.5M
                      {
205
17.5M
                        int ret =
206
17.5M
                          U_UCTOMB (result + length, muc, allocated - length);
207
17.5M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
17.5M
                        if (ret >= 0)
213
17.5M
                          {
214
17.5M
                            length += ret;
215
17.5M
                            goto done_appending;
216
17.5M
                          }
217
17.5M
                      }
218
6.14M
                    {
219
6.14M
                      size_t old_allocated = allocated;
220
6.14M
                      size_t new_allocated = 2 * old_allocated;
221
6.14M
                      if (new_allocated < 64)
222
6.13M
                        new_allocated = 64;
223
6.14M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
6.14M
                      {
226
6.14M
                        UNIT *larger_result;
227
6.14M
                        if (result == NULL)
228
6.13M
                          {
229
6.13M
                            larger_result =
230
6.13M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
6.13M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
6.13M
                          }
237
12.8k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
12.8k
                        else
249
12.8k
                          {
250
12.8k
                            larger_result =
251
12.8k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
12.8k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
12.8k
                          }
258
6.14M
                        result = larger_result;
259
6.14M
                        allocated = new_allocated;
260
6.14M
                        {
261
6.14M
                          int ret =
262
6.14M
                            U_UCTOMB (result + length, muc, allocated - length);
263
6.14M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
6.14M
                          if (ret < 0)
269
0
                            abort ();
270
6.14M
                          length += ret;
271
6.14M
                          goto done_appending;
272
6.14M
                        }
273
6.14M
                      }
274
6.14M
                    }
275
23.6M
                   done_appending: ;
276
23.6M
                  }
277
278
                /* sortbuf is now empty.  */
279
28.4M
                sortbuf_count = 0;
280
28.4M
              }
281
282
31.4M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
6.13M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
25.3M
            if (sortbuf_count == sortbuf_allocated)
288
1.12k
              {
289
1.12k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.12k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.12k
                struct ucs4_with_ccc *new_sortbuf =
293
1.12k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.12k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.12k
                memcpy (new_sortbuf, sortbuf,
300
1.12k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.12k
                if (sortbuf != sortbuf_preallocated)
302
1.12k
                  free (sortbuf);
303
1.12k
                sortbuf = new_sortbuf;
304
1.12k
              }
305
25.3M
            sortbuf[sortbuf_count].code = uc;
306
25.3M
            sortbuf[sortbuf_count].ccc = ccc;
307
25.3M
            sortbuf_count++;
308
309
25.3M
            i++;
310
25.3M
          }
311
312
25.1M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
6.13M
          break;
315
316
19.0M
        s += count;
317
19.0M
      }
318
6.13M
  }
319
320
6.13M
  if (length == 0)
321
0
    {
322
0
      if (result == NULL)
323
0
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
0
          result = (UNIT *) malloc (1);
326
0
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
0
        }
332
0
    }
333
6.13M
  else if (result != resultbuf && length < allocated)
334
6.13M
    {
335
      /* Shrink the allocated memory if possible.  */
336
6.13M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
6.13M
      if (memory != NULL)
338
6.13M
        result = memory;
339
6.13M
    }
340
341
6.13M
  if (sortbuf_count > 0)
342
0
    abort ();
343
6.13M
  if (sortbuf != sortbuf_preallocated)
344
6.13M
    free (sortbuf);
345
346
6.13M
  *lengthp = length;
347
6.13M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
6.13M
}
u32_normalize
Line
Count
Source
21
12.3M
{
22
12.3M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
12.3M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
12.3M
  UNIT *result;
27
12.3M
  size_t allocated;
28
12.3M
  if (resultbuf == NULL)
29
12.3M
    {
30
12.3M
      result = NULL;
31
12.3M
      allocated = 0;
32
12.3M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
12.3M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
12.3M
  #define SORTBUF_PREALLOCATED 64
42
12.3M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
12.3M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
12.3M
    sortbuf_preallocated;
45
12.3M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
12.3M
  size_t sortbuf_count = 0;
47
48
12.3M
  {
49
12.3M
    const UNIT *s_end = s + n;
50
51
12.3M
    for (;;)
52
29.5M
      {
53
29.5M
        int count;
54
29.5M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
29.5M
        int decomposed_count;
56
57
29.5M
        if (s < s_end)
58
17.1M
          {
59
            /* Fetch the next character.  */
60
17.1M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
17.1M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
34.4M
            for (int curr = 0; curr < decomposed_count; )
70
17.2M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
17.2M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
17.2M
                int curr_decomposed_count;
75
76
17.2M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
17.2M
                if (curr_decomposed_count >= 0)
78
35.8k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
35.8k
                    int shift = curr_decomposed_count - 1;
83
84
35.8k
                    if (shift < 0)
85
0
                      abort ();
86
35.8k
                    if (shift > 0)
87
35.4k
                      {
88
35.4k
                        decomposed_count += shift;
89
35.4k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
45.5k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
10.1k
                          decomposed[j + shift] = decomposed[j];
93
35.4k
                      }
94
107k
                    for (; shift >= 0; shift--)
95
71.3k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
35.8k
                  }
97
17.2M
                else
98
17.2M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
17.2M
                    curr++;
101
17.2M
                  }
102
17.2M
              }
103
17.1M
          }
104
12.3M
        else
105
12.3M
          {
106
12.3M
            count = 0;
107
12.3M
            decomposed_count = 0;
108
12.3M
          }
109
110
29.5M
        int i = 0;
111
29.5M
        for (;;)
112
46.7M
          {
113
46.7M
            ucs4_t uc;
114
46.7M
            int ccc;
115
116
46.7M
            if (s < s_end)
117
34.4M
              {
118
                /* Fetch the next character from the decomposition.  */
119
34.4M
                if (i == decomposed_count)
120
17.1M
                  break;
121
17.2M
                uc = decomposed[i];
122
17.2M
                ccc = uc_combining_class (uc);
123
17.2M
              }
124
12.3M
            else
125
12.3M
              {
126
                /* End of string reached.  */
127
12.3M
                uc = 0;
128
12.3M
                ccc = 0;
129
12.3M
              }
130
131
29.5M
            if (ccc == 0)
132
29.4M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
29.4M
                if (sortbuf_count > 1)
136
27.6k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
27.6k
                                                           sortbuf + sortbuf_count);
138
139
29.4M
                if (composer != NULL)
140
29.4M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
29.4M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
17.1M
                      {
163
17.2M
                        for (size_t j = 1; j < sortbuf_count; )
164
101k
                          {
165
101k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
39.0k
                              {
167
39.0k
                                ucs4_t combined =
168
39.0k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
39.0k
                                if (combined)
170
23.7k
                                  {
171
23.7k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
36.2k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
12.5k
                                      sortbuf[k - 1] = sortbuf[k];
175
23.7k
                                    sortbuf_count--;
176
23.7k
                                    continue;
177
23.7k
                                  }
178
39.0k
                              }
179
77.7k
                            j++;
180
77.7k
                          }
181
17.1M
                        if (s < s_end && sortbuf_count == 1)
182
4.88M
                          {
183
4.88M
                            ucs4_t combined =
184
4.88M
                              composer (sortbuf[0].code, uc);
185
4.88M
                            if (combined)
186
9.95k
                              {
187
9.95k
                                uc = combined;
188
9.95k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
9.95k
                                sortbuf_count = 0;
193
9.95k
                              }
194
4.88M
                          }
195
17.1M
                      }
196
29.4M
                  }
197
198
46.6M
                for (size_t j = 0; j < sortbuf_count; j++)
199
17.1M
                  {
200
17.1M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
17.1M
                    if (length < allocated)
204
4.97M
                      {
205
4.97M
                        int ret =
206
4.97M
                          U_UCTOMB (result + length, muc, allocated - length);
207
4.97M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
4.97M
                        if (ret >= 0)
213
4.97M
                          {
214
4.97M
                            length += ret;
215
4.97M
                            goto done_appending;
216
4.97M
                          }
217
4.97M
                      }
218
12.2M
                    {
219
12.2M
                      size_t old_allocated = allocated;
220
12.2M
                      size_t new_allocated = 2 * old_allocated;
221
12.2M
                      if (new_allocated < 64)
222
12.2M
                        new_allocated = 64;
223
12.2M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
12.2M
                      {
226
12.2M
                        UNIT *larger_result;
227
12.2M
                        if (result == NULL)
228
12.2M
                          {
229
12.2M
                            larger_result =
230
12.2M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
12.2M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
12.2M
                          }
237
3.04k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
3.04k
                        else
249
3.04k
                          {
250
3.04k
                            larger_result =
251
3.04k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
3.04k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
3.04k
                          }
258
12.2M
                        result = larger_result;
259
12.2M
                        allocated = new_allocated;
260
12.2M
                        {
261
12.2M
                          int ret =
262
12.2M
                            U_UCTOMB (result + length, muc, allocated - length);
263
12.2M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
12.2M
                          if (ret < 0)
269
0
                            abort ();
270
12.2M
                          length += ret;
271
12.2M
                          goto done_appending;
272
12.2M
                        }
273
12.2M
                      }
274
12.2M
                    }
275
17.1M
                   done_appending: ;
276
17.1M
                  }
277
278
                /* sortbuf is now empty.  */
279
29.4M
                sortbuf_count = 0;
280
29.4M
              }
281
282
29.5M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
12.3M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
17.2M
            if (sortbuf_count == sortbuf_allocated)
288
817
              {
289
817
                sortbuf_allocated = 2 * sortbuf_allocated;
290
817
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
817
                struct ucs4_with_ccc *new_sortbuf =
293
817
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
817
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
817
                memcpy (new_sortbuf, sortbuf,
300
817
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
817
                if (sortbuf != sortbuf_preallocated)
302
817
                  free (sortbuf);
303
817
                sortbuf = new_sortbuf;
304
817
              }
305
17.2M
            sortbuf[sortbuf_count].code = uc;
306
17.2M
            sortbuf[sortbuf_count].ccc = ccc;
307
17.2M
            sortbuf_count++;
308
309
17.2M
            i++;
310
17.2M
          }
311
312
29.5M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
12.3M
          break;
315
316
17.1M
        s += count;
317
17.1M
      }
318
12.3M
  }
319
320
12.3M
  if (length == 0)
321
101k
    {
322
101k
      if (result == NULL)
323
101k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
101k
          result = (UNIT *) malloc (1);
326
101k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
101k
        }
332
101k
    }
333
12.2M
  else if (result != resultbuf && length < allocated)
334
12.2M
    {
335
      /* Shrink the allocated memory if possible.  */
336
12.2M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
12.2M
      if (memory != NULL)
338
12.2M
        result = memory;
339
12.2M
    }
340
341
12.3M
  if (sortbuf_count > 0)
342
0
    abort ();
343
12.3M
  if (sortbuf != sortbuf_preallocated)
344
12.3M
    free (sortbuf);
345
346
12.3M
  *lengthp = length;
347
12.3M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
12.3M
}