Coverage Report

Created: 2025-12-12 07:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
13.0M
{
22
13.0M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
13.0M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
13.0M
  UNIT *result;
27
13.0M
  size_t allocated;
28
13.0M
  if (resultbuf == NULL)
29
13.0M
    {
30
13.0M
      result = NULL;
31
13.0M
      allocated = 0;
32
13.0M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
13.0M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
13.0M
  #define SORTBUF_PREALLOCATED 64
42
13.0M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
13.0M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
13.0M
    sortbuf_preallocated;
45
13.0M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
13.0M
  size_t sortbuf_count = 0;
47
48
13.0M
  {
49
13.0M
    const UNIT *s_end = s + n;
50
51
13.0M
    for (;;)
52
40.6M
      {
53
40.6M
        int count;
54
40.6M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
40.6M
        int decomposed_count;
56
57
40.6M
        if (s < s_end)
58
27.6M
          {
59
            /* Fetch the next character.  */
60
27.6M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
27.6M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
64.9M
            for (int curr = 0; curr < decomposed_count; )
70
37.2M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
37.2M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
37.2M
                int curr_decomposed_count;
75
76
37.2M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
37.2M
                if (curr_decomposed_count >= 0)
78
3.22M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
3.22M
                    int shift = curr_decomposed_count - 1;
83
84
3.22M
                    if (shift < 0)
85
0
                      abort ();
86
3.22M
                    if (shift > 0)
87
2.81M
                      {
88
2.81M
                        decomposed_count += shift;
89
2.81M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.84M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
25.7k
                          decomposed[j + shift] = decomposed[j];
93
2.81M
                      }
94
12.8M
                    for (; shift >= 0; shift--)
95
9.65M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
3.22M
                  }
97
34.0M
                else
98
34.0M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
34.0M
                    curr++;
101
34.0M
                  }
102
37.2M
              }
103
27.6M
          }
104
13.0M
        else
105
13.0M
          {
106
13.0M
            count = 0;
107
13.0M
            decomposed_count = 0;
108
13.0M
          }
109
110
40.6M
        int i = 0;
111
40.6M
        for (;;)
112
74.7M
          {
113
74.7M
            ucs4_t uc;
114
74.7M
            int ccc;
115
116
74.7M
            if (s < s_end)
117
61.6M
              {
118
                /* Fetch the next character from the decomposition.  */
119
61.6M
                if (i == decomposed_count)
120
27.6M
                  break;
121
34.0M
                uc = decomposed[i];
122
34.0M
                ccc = uc_combining_class (uc);
123
34.0M
              }
124
13.0M
            else
125
13.0M
              {
126
                /* End of string reached.  */
127
13.0M
                uc = 0;
128
13.0M
                ccc = 0;
129
13.0M
              }
130
131
47.0M
            if (ccc == 0)
132
43.9M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
43.9M
                if (sortbuf_count > 1)
136
1.64M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.64M
                                                           sortbuf + sortbuf_count);
138
139
43.9M
                if (composer != NULL)
140
43.9M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
43.9M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
30.9M
                      {
163
34.0M
                        for (size_t j = 1; j < sortbuf_count; )
164
3.10M
                          {
165
3.10M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.67M
                              {
167
1.67M
                                ucs4_t combined =
168
1.67M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.67M
                                if (combined)
170
1.62M
                                  {
171
1.62M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.54M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
1.92M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.62M
                                    sortbuf_count--;
176
1.62M
                                    continue;
177
1.62M
                                  }
178
1.67M
                              }
179
1.48M
                            j++;
180
1.48M
                          }
181
30.9M
                        if (s < s_end && sortbuf_count == 1)
182
17.9M
                          {
183
17.9M
                            ucs4_t combined =
184
17.9M
                              composer (sortbuf[0].code, uc);
185
17.9M
                            if (combined)
186
16.8k
                              {
187
16.8k
                                uc = combined;
188
16.8k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
16.8k
                                sortbuf_count = 0;
193
16.8k
                              }
194
17.9M
                          }
195
30.9M
                      }
196
43.9M
                  }
197
198
76.3M
                for (size_t j = 0; j < sortbuf_count; j++)
199
32.4M
                  {
200
32.4M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
32.4M
                    if (length < allocated)
204
19.4M
                      {
205
19.4M
                        int ret =
206
19.4M
                          U_UCTOMB (result + length, muc, allocated - length);
207
19.4M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
19.4M
                        if (ret >= 0)
213
19.4M
                          {
214
19.4M
                            length += ret;
215
19.4M
                            goto done_appending;
216
19.4M
                          }
217
19.4M
                      }
218
12.9M
                    {
219
12.9M
                      size_t old_allocated = allocated;
220
12.9M
                      size_t new_allocated = 2 * old_allocated;
221
12.9M
                      if (new_allocated < 64)
222
12.9M
                        new_allocated = 64;
223
12.9M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
12.9M
                      {
226
12.9M
                        UNIT *larger_result;
227
12.9M
                        if (result == NULL)
228
12.9M
                          {
229
12.9M
                            larger_result =
230
12.9M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
12.9M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
12.9M
                          }
237
15.5k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
15.5k
                        else
249
15.5k
                          {
250
15.5k
                            larger_result =
251
15.5k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
15.5k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
15.5k
                          }
258
12.9M
                        result = larger_result;
259
12.9M
                        allocated = new_allocated;
260
12.9M
                        {
261
12.9M
                          int ret =
262
12.9M
                            U_UCTOMB (result + length, muc, allocated - length);
263
12.9M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
12.9M
                          if (ret < 0)
269
0
                            abort ();
270
12.9M
                          length += ret;
271
12.9M
                          goto done_appending;
272
12.9M
                        }
273
12.9M
                      }
274
12.9M
                    }
275
32.4M
                   done_appending: ;
276
32.4M
                  }
277
278
                /* sortbuf is now empty.  */
279
43.9M
                sortbuf_count = 0;
280
43.9M
              }
281
282
47.0M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
13.0M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
34.0M
            if (sortbuf_count == sortbuf_allocated)
288
1.90k
              {
289
1.90k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.90k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.90k
                struct ucs4_with_ccc *new_sortbuf =
293
1.90k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.90k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.90k
                memcpy (new_sortbuf, sortbuf,
300
1.90k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.90k
                if (sortbuf != sortbuf_preallocated)
302
1.90k
                  free (sortbuf);
303
1.90k
                sortbuf = new_sortbuf;
304
1.90k
              }
305
34.0M
            sortbuf[sortbuf_count].code = uc;
306
34.0M
            sortbuf[sortbuf_count].ccc = ccc;
307
34.0M
            sortbuf_count++;
308
309
34.0M
            i++;
310
34.0M
          }
311
312
40.6M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
13.0M
          break;
315
316
27.6M
        s += count;
317
27.6M
      }
318
13.0M
  }
319
320
13.0M
  if (length == 0)
321
89.5k
    {
322
89.5k
      if (result == NULL)
323
89.5k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
89.5k
          result = (UNIT *) malloc (1);
326
89.5k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
89.5k
        }
332
89.5k
    }
333
12.9M
  else if (result != resultbuf && length < allocated)
334
12.9M
    {
335
      /* Shrink the allocated memory if possible.  */
336
12.9M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
12.9M
      if (memory != NULL)
338
12.9M
        result = memory;
339
12.9M
    }
340
341
13.0M
  if (sortbuf_count > 0)
342
0
    abort ();
343
13.0M
  if (sortbuf != sortbuf_preallocated)
344
13.0M
    free (sortbuf);
345
346
13.0M
  *lengthp = length;
347
13.0M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
13.0M
}
u8_normalize
Line
Count
Source
21
4.27M
{
22
4.27M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
4.27M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
4.27M
  UNIT *result;
27
4.27M
  size_t allocated;
28
4.27M
  if (resultbuf == NULL)
29
4.27M
    {
30
4.27M
      result = NULL;
31
4.27M
      allocated = 0;
32
4.27M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
4.27M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
4.27M
  #define SORTBUF_PREALLOCATED 64
42
4.27M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
4.27M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
4.27M
    sortbuf_preallocated;
45
4.27M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
4.27M
  size_t sortbuf_count = 0;
47
48
4.27M
  {
49
4.27M
    const UNIT *s_end = s + n;
50
51
4.27M
    for (;;)
52
19.4M
      {
53
19.4M
        int count;
54
19.4M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
19.4M
        int decomposed_count;
56
57
19.4M
        if (s < s_end)
58
15.1M
          {
59
            /* Fetch the next character.  */
60
15.1M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
15.1M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
39.8M
            for (int curr = 0; curr < decomposed_count; )
70
24.7M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
24.7M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
24.7M
                int curr_decomposed_count;
75
76
24.7M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
24.7M
                if (curr_decomposed_count >= 0)
78
3.19M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
3.19M
                    int shift = curr_decomposed_count - 1;
83
84
3.19M
                    if (shift < 0)
85
0
                      abort ();
86
3.19M
                    if (shift > 0)
87
2.78M
                      {
88
2.78M
                        decomposed_count += shift;
89
2.78M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.79M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
15.9k
                          decomposed[j + shift] = decomposed[j];
93
2.78M
                      }
94
12.7M
                    for (; shift >= 0; shift--)
95
9.58M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
3.19M
                  }
97
21.5M
                else
98
21.5M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
21.5M
                    curr++;
101
21.5M
                  }
102
24.7M
              }
103
15.1M
          }
104
4.27M
        else
105
4.27M
          {
106
4.27M
            count = 0;
107
4.27M
            decomposed_count = 0;
108
4.27M
          }
109
110
19.4M
        int i = 0;
111
19.4M
        for (;;)
112
40.9M
          {
113
40.9M
            ucs4_t uc;
114
40.9M
            int ccc;
115
116
40.9M
            if (s < s_end)
117
36.6M
              {
118
                /* Fetch the next character from the decomposition.  */
119
36.6M
                if (i == decomposed_count)
120
15.1M
                  break;
121
21.5M
                uc = decomposed[i];
122
21.5M
                ccc = uc_combining_class (uc);
123
21.5M
              }
124
4.27M
            else
125
4.27M
              {
126
                /* End of string reached.  */
127
4.27M
                uc = 0;
128
4.27M
                ccc = 0;
129
4.27M
              }
130
131
25.8M
            if (ccc == 0)
132
22.8M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
22.8M
                if (sortbuf_count > 1)
136
1.61M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.61M
                                                           sortbuf + sortbuf_count);
138
139
22.8M
                if (composer != NULL)
140
22.8M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
22.8M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
18.5M
                      {
163
21.5M
                        for (size_t j = 1; j < sortbuf_count; )
164
3.00M
                          {
165
3.00M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.63M
                              {
167
1.63M
                                ucs4_t combined =
168
1.63M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.63M
                                if (combined)
170
1.59M
                                  {
171
1.59M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.50M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
1.90M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.59M
                                    sortbuf_count--;
176
1.59M
                                    continue;
177
1.59M
                                  }
178
1.63M
                              }
179
1.40M
                            j++;
180
1.40M
                          }
181
18.5M
                        if (s < s_end && sortbuf_count == 1)
182
14.2M
                          {
183
14.2M
                            ucs4_t combined =
184
14.2M
                              composer (sortbuf[0].code, uc);
185
14.2M
                            if (combined)
186
7.62k
                              {
187
7.62k
                                uc = combined;
188
7.62k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
7.62k
                                sortbuf_count = 0;
193
7.62k
                              }
194
14.2M
                          }
195
18.5M
                      }
196
22.8M
                  }
197
198
42.7M
                for (size_t j = 0; j < sortbuf_count; j++)
199
19.9M
                  {
200
19.9M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
19.9M
                    if (length < allocated)
204
15.6M
                      {
205
15.6M
                        int ret =
206
15.6M
                          U_UCTOMB (result + length, muc, allocated - length);
207
15.6M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
15.6M
                        if (ret >= 0)
213
15.6M
                          {
214
15.6M
                            length += ret;
215
15.6M
                            goto done_appending;
216
15.6M
                          }
217
15.6M
                      }
218
4.28M
                    {
219
4.28M
                      size_t old_allocated = allocated;
220
4.28M
                      size_t new_allocated = 2 * old_allocated;
221
4.28M
                      if (new_allocated < 64)
222
4.27M
                        new_allocated = 64;
223
4.28M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
4.28M
                      {
226
4.28M
                        UNIT *larger_result;
227
4.28M
                        if (result == NULL)
228
4.27M
                          {
229
4.27M
                            larger_result =
230
4.27M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
4.27M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
4.27M
                          }
237
12.4k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
12.4k
                        else
249
12.4k
                          {
250
12.4k
                            larger_result =
251
12.4k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
12.4k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
12.4k
                          }
258
4.28M
                        result = larger_result;
259
4.28M
                        allocated = new_allocated;
260
4.28M
                        {
261
4.28M
                          int ret =
262
4.28M
                            U_UCTOMB (result + length, muc, allocated - length);
263
4.28M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
4.28M
                          if (ret < 0)
269
0
                            abort ();
270
4.28M
                          length += ret;
271
4.28M
                          goto done_appending;
272
4.28M
                        }
273
4.28M
                      }
274
4.28M
                    }
275
19.9M
                   done_appending: ;
276
19.9M
                  }
277
278
                /* sortbuf is now empty.  */
279
22.8M
                sortbuf_count = 0;
280
22.8M
              }
281
282
25.8M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
4.27M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
21.5M
            if (sortbuf_count == sortbuf_allocated)
288
1.08k
              {
289
1.08k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.08k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.08k
                struct ucs4_with_ccc *new_sortbuf =
293
1.08k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.08k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.08k
                memcpy (new_sortbuf, sortbuf,
300
1.08k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.08k
                if (sortbuf != sortbuf_preallocated)
302
1.08k
                  free (sortbuf);
303
1.08k
                sortbuf = new_sortbuf;
304
1.08k
              }
305
21.5M
            sortbuf[sortbuf_count].code = uc;
306
21.5M
            sortbuf[sortbuf_count].ccc = ccc;
307
21.5M
            sortbuf_count++;
308
309
21.5M
            i++;
310
21.5M
          }
311
312
19.4M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
4.27M
          break;
315
316
15.1M
        s += count;
317
15.1M
      }
318
4.27M
  }
319
320
4.27M
  if (length == 0)
321
0
    {
322
0
      if (result == NULL)
323
0
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
0
          result = (UNIT *) malloc (1);
326
0
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
0
        }
332
0
    }
333
4.27M
  else if (result != resultbuf && length < allocated)
334
4.27M
    {
335
      /* Shrink the allocated memory if possible.  */
336
4.27M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
4.27M
      if (memory != NULL)
338
4.27M
        result = memory;
339
4.27M
    }
340
341
4.27M
  if (sortbuf_count > 0)
342
0
    abort ();
343
4.27M
  if (sortbuf != sortbuf_preallocated)
344
4.27M
    free (sortbuf);
345
346
4.27M
  *lengthp = length;
347
4.27M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
4.27M
}
u32_normalize
Line
Count
Source
21
8.75M
{
22
8.75M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
8.75M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
8.75M
  UNIT *result;
27
8.75M
  size_t allocated;
28
8.75M
  if (resultbuf == NULL)
29
8.75M
    {
30
8.75M
      result = NULL;
31
8.75M
      allocated = 0;
32
8.75M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
8.75M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
8.75M
  #define SORTBUF_PREALLOCATED 64
42
8.75M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
8.75M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
8.75M
    sortbuf_preallocated;
45
8.75M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
8.75M
  size_t sortbuf_count = 0;
47
48
8.75M
  {
49
8.75M
    const UNIT *s_end = s + n;
50
51
8.75M
    for (;;)
52
21.2M
      {
53
21.2M
        int count;
54
21.2M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
21.2M
        int decomposed_count;
56
57
21.2M
        if (s < s_end)
58
12.4M
          {
59
            /* Fetch the next character.  */
60
12.4M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
12.4M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
25.0M
            for (int curr = 0; curr < decomposed_count; )
70
12.5M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
12.5M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
12.5M
                int curr_decomposed_count;
75
76
12.5M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
12.5M
                if (curr_decomposed_count >= 0)
78
34.5k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
34.5k
                    int shift = curr_decomposed_count - 1;
83
84
34.5k
                    if (shift < 0)
85
0
                      abort ();
86
34.5k
                    if (shift > 0)
87
34.1k
                      {
88
34.1k
                        decomposed_count += shift;
89
34.1k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
44.0k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
9.84k
                          decomposed[j + shift] = decomposed[j];
93
34.1k
                      }
94
103k
                    for (; shift >= 0; shift--)
95
68.7k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
34.5k
                  }
97
12.5M
                else
98
12.5M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
12.5M
                    curr++;
101
12.5M
                  }
102
12.5M
              }
103
12.4M
          }
104
8.75M
        else
105
8.75M
          {
106
8.75M
            count = 0;
107
8.75M
            decomposed_count = 0;
108
8.75M
          }
109
110
21.2M
        int i = 0;
111
21.2M
        for (;;)
112
33.7M
          {
113
33.7M
            ucs4_t uc;
114
33.7M
            int ccc;
115
116
33.7M
            if (s < s_end)
117
25.0M
              {
118
                /* Fetch the next character from the decomposition.  */
119
25.0M
                if (i == decomposed_count)
120
12.4M
                  break;
121
12.5M
                uc = decomposed[i];
122
12.5M
                ccc = uc_combining_class (uc);
123
12.5M
              }
124
8.75M
            else
125
8.75M
              {
126
                /* End of string reached.  */
127
8.75M
                uc = 0;
128
8.75M
                ccc = 0;
129
8.75M
              }
130
131
21.2M
            if (ccc == 0)
132
21.1M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
21.1M
                if (sortbuf_count > 1)
136
25.7k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
25.7k
                                                           sortbuf + sortbuf_count);
138
139
21.1M
                if (composer != NULL)
140
21.1M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
21.1M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
12.4M
                      {
163
12.5M
                        for (size_t j = 1; j < sortbuf_count; )
164
99.9k
                          {
165
99.9k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
37.2k
                              {
167
37.2k
                                ucs4_t combined =
168
37.2k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
37.2k
                                if (combined)
170
22.9k
                                  {
171
22.9k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
36.5k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
13.5k
                                      sortbuf[k - 1] = sortbuf[k];
175
22.9k
                                    sortbuf_count--;
176
22.9k
                                    continue;
177
22.9k
                                  }
178
37.2k
                              }
179
76.9k
                            j++;
180
76.9k
                          }
181
12.4M
                        if (s < s_end && sortbuf_count == 1)
182
3.74M
                          {
183
3.74M
                            ucs4_t combined =
184
3.74M
                              composer (sortbuf[0].code, uc);
185
3.74M
                            if (combined)
186
9.23k
                              {
187
9.23k
                                uc = combined;
188
9.23k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
9.23k
                                sortbuf_count = 0;
193
9.23k
                              }
194
3.74M
                          }
195
12.4M
                      }
196
21.1M
                  }
197
198
33.6M
                for (size_t j = 0; j < sortbuf_count; j++)
199
12.4M
                  {
200
12.4M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
12.4M
                    if (length < allocated)
204
3.82M
                      {
205
3.82M
                        int ret =
206
3.82M
                          U_UCTOMB (result + length, muc, allocated - length);
207
3.82M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
3.82M
                        if (ret >= 0)
213
3.82M
                          {
214
3.82M
                            length += ret;
215
3.82M
                            goto done_appending;
216
3.82M
                          }
217
3.82M
                      }
218
8.66M
                    {
219
8.66M
                      size_t old_allocated = allocated;
220
8.66M
                      size_t new_allocated = 2 * old_allocated;
221
8.66M
                      if (new_allocated < 64)
222
8.66M
                        new_allocated = 64;
223
8.66M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
8.66M
                      {
226
8.66M
                        UNIT *larger_result;
227
8.66M
                        if (result == NULL)
228
8.66M
                          {
229
8.66M
                            larger_result =
230
8.66M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
8.66M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
8.66M
                          }
237
3.02k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
3.02k
                        else
249
3.02k
                          {
250
3.02k
                            larger_result =
251
3.02k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
3.02k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
3.02k
                          }
258
8.66M
                        result = larger_result;
259
8.66M
                        allocated = new_allocated;
260
8.66M
                        {
261
8.66M
                          int ret =
262
8.66M
                            U_UCTOMB (result + length, muc, allocated - length);
263
8.66M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
8.66M
                          if (ret < 0)
269
0
                            abort ();
270
8.66M
                          length += ret;
271
8.66M
                          goto done_appending;
272
8.66M
                        }
273
8.66M
                      }
274
8.66M
                    }
275
12.4M
                   done_appending: ;
276
12.4M
                  }
277
278
                /* sortbuf is now empty.  */
279
21.1M
                sortbuf_count = 0;
280
21.1M
              }
281
282
21.2M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
8.75M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
12.5M
            if (sortbuf_count == sortbuf_allocated)
288
823
              {
289
823
                sortbuf_allocated = 2 * sortbuf_allocated;
290
823
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
823
                struct ucs4_with_ccc *new_sortbuf =
293
823
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
823
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
823
                memcpy (new_sortbuf, sortbuf,
300
823
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
823
                if (sortbuf != sortbuf_preallocated)
302
823
                  free (sortbuf);
303
823
                sortbuf = new_sortbuf;
304
823
              }
305
12.5M
            sortbuf[sortbuf_count].code = uc;
306
12.5M
            sortbuf[sortbuf_count].ccc = ccc;
307
12.5M
            sortbuf_count++;
308
309
12.5M
            i++;
310
12.5M
          }
311
312
21.2M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
8.75M
          break;
315
316
12.4M
        s += count;
317
12.4M
      }
318
8.75M
  }
319
320
8.75M
  if (length == 0)
321
89.5k
    {
322
89.5k
      if (result == NULL)
323
89.5k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
89.5k
          result = (UNIT *) malloc (1);
326
89.5k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
89.5k
        }
332
89.5k
    }
333
8.66M
  else if (result != resultbuf && length < allocated)
334
8.66M
    {
335
      /* Shrink the allocated memory if possible.  */
336
8.66M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
8.66M
      if (memory != NULL)
338
8.66M
        result = memory;
339
8.66M
    }
340
341
8.75M
  if (sortbuf_count > 0)
342
0
    abort ();
343
8.75M
  if (sortbuf != sortbuf_preallocated)
344
8.75M
    free (sortbuf);
345
346
8.75M
  *lengthp = length;
347
8.75M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
8.75M
}