Coverage Report

Created: 2025-12-05 06:16

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
12.5M
{
22
12.5M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
12.5M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
12.5M
  UNIT *result;
27
12.5M
  size_t length;
28
12.5M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
12.5M
  #define SORTBUF_PREALLOCATED 64
31
12.5M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
12.5M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
12.5M
  size_t sortbuf_allocated;
34
12.5M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
12.5M
  if (resultbuf == NULL)
38
12.5M
    {
39
12.5M
      result = NULL;
40
12.5M
      allocated = 0;
41
12.5M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
12.5M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
12.5M
  sortbuf = sortbuf_preallocated;
51
12.5M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
12.5M
  sortbuf_count = 0;
53
54
12.5M
  {
55
12.5M
    const UNIT *s_end = s + n;
56
57
12.5M
    for (;;)
58
39.3M
      {
59
39.3M
        int count;
60
39.3M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
39.3M
        int decomposed_count;
62
39.3M
        int i;
63
64
39.3M
        if (s < s_end)
65
26.7M
          {
66
            /* Fetch the next character.  */
67
26.7M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
26.7M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
63.0M
            for (int curr = 0; curr < decomposed_count; )
77
36.2M
              {
78
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
79
                   all elements are atomic.  */
80
36.2M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
81
36.2M
                int curr_decomposed_count;
82
83
36.2M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
84
36.2M
                if (curr_decomposed_count >= 0)
85
3.13M
                  {
86
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
87
                       decomposed[curr], making room.  It's not worth using
88
                       memcpy() here, since the counts are so small.  */
89
3.13M
                    int shift = curr_decomposed_count - 1;
90
91
3.13M
                    if (shift < 0)
92
0
                      abort ();
93
3.13M
                    if (shift > 0)
94
2.72M
                      {
95
2.72M
                        decomposed_count += shift;
96
2.72M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
97
0
                          abort ();
98
2.75M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
99
25.9k
                          decomposed[j + shift] = decomposed[j];
100
2.72M
                      }
101
12.6M
                    for (; shift >= 0; shift--)
102
9.48M
                      decomposed[curr + shift] = curr_decomposed[shift];
103
3.13M
                  }
104
33.1M
                else
105
33.1M
                  {
106
                    /* decomposed[curr] is atomic.  */
107
33.1M
                    curr++;
108
33.1M
                  }
109
36.2M
              }
110
26.7M
          }
111
12.5M
        else
112
12.5M
          {
113
12.5M
            count = 0;
114
12.5M
            decomposed_count = 0;
115
12.5M
          }
116
117
39.3M
        i = 0;
118
39.3M
        for (;;)
119
72.4M
          {
120
72.4M
            ucs4_t uc;
121
72.4M
            int ccc;
122
123
72.4M
            if (s < s_end)
124
59.8M
              {
125
                /* Fetch the next character from the decomposition.  */
126
59.8M
                if (i == decomposed_count)
127
26.7M
                  break;
128
33.1M
                uc = decomposed[i];
129
33.1M
                ccc = uc_combining_class (uc);
130
33.1M
              }
131
12.5M
            else
132
12.5M
              {
133
                /* End of string reached.  */
134
12.5M
                uc = 0;
135
12.5M
                ccc = 0;
136
12.5M
              }
137
138
45.6M
            if (ccc == 0)
139
42.6M
              {
140
                /* Apply the canonical ordering algorithm to the accumulated
141
                   sequence of characters.  */
142
42.6M
                if (sortbuf_count > 1)
143
1.63M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
144
1.63M
                                                           sortbuf + sortbuf_count);
145
146
42.6M
                if (composer != NULL)
147
42.6M
                  {
148
                    /* Attempt to combine decomposed characters, as specified
149
                       in the Unicode Standard Annex #15 "Unicode Normalization
150
                       Forms".  We need to check
151
                         1. whether the first accumulated character is a
152
                            "starter" (i.e. has ccc = 0).  This is usually the
153
                            case.  But when the string starts with a
154
                            non-starter, the sortbuf also starts with a
155
                            non-starter.  Btw, this check could also be
156
                            omitted, because the composition table has only
157
                            entries (code1, code2) for which code1 is a
158
                            starter; if the first accumulated character is not
159
                            a starter, no lookup will succeed.
160
                         2. If the sortbuf has more than one character, check
161
                            for each of these characters that are not "blocked"
162
                            from the starter (i.e. have a ccc that is higher
163
                            than the ccc of the previous character) whether it
164
                            can be combined with the first character.
165
                         3. If only one character is left in sortbuf, check
166
                            whether it can be combined with the next character
167
                            (also a starter).  */
168
42.6M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
169
30.1M
                      {
170
33.0M
                        for (size_t j = 1; j < sortbuf_count; )
171
2.95M
                          {
172
2.95M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
173
1.66M
                              {
174
1.66M
                                ucs4_t combined =
175
1.66M
                                  composer (sortbuf[0].code, sortbuf[j].code);
176
1.66M
                                if (combined)
177
1.61M
                                  {
178
1.61M
                                    sortbuf[0].code = combined;
179
                                    /* sortbuf[0].ccc = 0, still valid.  */
180
3.23M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
181
1.61M
                                      sortbuf[k - 1] = sortbuf[k];
182
1.61M
                                    sortbuf_count--;
183
1.61M
                                    continue;
184
1.61M
                                  }
185
1.66M
                              }
186
1.33M
                            j++;
187
1.33M
                          }
188
30.1M
                        if (s < s_end && sortbuf_count == 1)
189
17.6M
                          {
190
17.6M
                            ucs4_t combined =
191
17.6M
                              composer (sortbuf[0].code, uc);
192
17.6M
                            if (combined)
193
16.6k
                              {
194
16.6k
                                uc = combined;
195
16.6k
                                ccc = 0;
196
                                /* uc could be further combined with subsequent
197
                                   characters.  So don't put it into sortbuf[0] in
198
                                   this round, only in the next round.  */
199
16.6k
                                sortbuf_count = 0;
200
16.6k
                              }
201
17.6M
                          }
202
30.1M
                      }
203
42.6M
                  }
204
205
74.1M
                for (size_t j = 0; j < sortbuf_count; j++)
206
31.4M
                  {
207
31.4M
                    ucs4_t muc = sortbuf[j].code;
208
209
                    /* Append muc to the result accumulator.  */
210
31.4M
                    if (length < allocated)
211
19.0M
                      {
212
19.0M
                        int ret =
213
19.0M
                          U_UCTOMB (result + length, muc, allocated - length);
214
19.0M
                        if (ret == -1)
215
0
                          {
216
0
                            errno = EINVAL;
217
0
                            goto fail;
218
0
                          }
219
19.0M
                        if (ret >= 0)
220
19.0M
                          {
221
19.0M
                            length += ret;
222
19.0M
                            goto done_appending;
223
19.0M
                          }
224
19.0M
                      }
225
12.4M
                    {
226
12.4M
                      size_t old_allocated = allocated;
227
12.4M
                      size_t new_allocated = 2 * old_allocated;
228
12.4M
                      if (new_allocated < 64)
229
12.4M
                        new_allocated = 64;
230
12.4M
                      if (new_allocated < old_allocated) /* integer overflow? */
231
0
                        abort ();
232
12.4M
                      {
233
12.4M
                        UNIT *larger_result;
234
12.4M
                        if (result == NULL)
235
12.4M
                          {
236
12.4M
                            larger_result =
237
12.4M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
238
12.4M
                            if (larger_result == NULL)
239
0
                              {
240
0
                                errno = ENOMEM;
241
0
                                goto fail;
242
0
                              }
243
12.4M
                          }
244
15.5k
                        else if (result == resultbuf)
245
0
                          {
246
0
                            larger_result =
247
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
0
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
0
                            U_CPY (larger_result, resultbuf, length);
254
0
                          }
255
15.5k
                        else
256
15.5k
                          {
257
15.5k
                            larger_result =
258
15.5k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
259
15.5k
                            if (larger_result == NULL)
260
0
                              {
261
0
                                errno = ENOMEM;
262
0
                                goto fail;
263
0
                              }
264
15.5k
                          }
265
12.4M
                        result = larger_result;
266
12.4M
                        allocated = new_allocated;
267
12.4M
                        {
268
12.4M
                          int ret =
269
12.4M
                            U_UCTOMB (result + length, muc, allocated - length);
270
12.4M
                          if (ret == -1)
271
0
                            {
272
0
                              errno = EINVAL;
273
0
                              goto fail;
274
0
                            }
275
12.4M
                          if (ret < 0)
276
0
                            abort ();
277
12.4M
                          length += ret;
278
12.4M
                          goto done_appending;
279
12.4M
                        }
280
12.4M
                      }
281
12.4M
                    }
282
31.4M
                   done_appending: ;
283
31.4M
                  }
284
285
                /* sortbuf is now empty.  */
286
42.6M
                sortbuf_count = 0;
287
42.6M
              }
288
289
45.6M
            if (!(s < s_end))
290
              /* End of string reached.  */
291
12.5M
              break;
292
293
            /* Append (uc, ccc) to sortbuf.  */
294
33.1M
            if (sortbuf_count == sortbuf_allocated)
295
1.86k
              {
296
1.86k
                struct ucs4_with_ccc *new_sortbuf;
297
298
1.86k
                sortbuf_allocated = 2 * sortbuf_allocated;
299
1.86k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
300
0
                  abort ();
301
1.86k
                new_sortbuf =
302
1.86k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
303
1.86k
                if (new_sortbuf == NULL)
304
0
                  {
305
0
                    errno = ENOMEM;
306
0
                    goto fail;
307
0
                  }
308
1.86k
                memcpy (new_sortbuf, sortbuf,
309
1.86k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
310
1.86k
                if (sortbuf != sortbuf_preallocated)
311
1.86k
                  free (sortbuf);
312
1.86k
                sortbuf = new_sortbuf;
313
1.86k
              }
314
33.1M
            sortbuf[sortbuf_count].code = uc;
315
33.1M
            sortbuf[sortbuf_count].ccc = ccc;
316
33.1M
            sortbuf_count++;
317
318
33.1M
            i++;
319
33.1M
          }
320
321
39.3M
        if (!(s < s_end))
322
          /* End of string reached.  */
323
12.5M
          break;
324
325
26.7M
        s += count;
326
26.7M
      }
327
12.5M
  }
328
329
12.5M
  if (length == 0)
330
91.3k
    {
331
91.3k
      if (result == NULL)
332
91.3k
        {
333
          /* Return a non-NULL value.  NULL means error.  */
334
91.3k
          result = (UNIT *) malloc (1);
335
91.3k
          if (result == NULL)
336
0
            {
337
0
              errno = ENOMEM;
338
0
              goto fail;
339
0
            }
340
91.3k
        }
341
91.3k
    }
342
12.4M
  else if (result != resultbuf && length < allocated)
343
12.4M
    {
344
      /* Shrink the allocated memory if possible.  */
345
12.4M
      UNIT *memory;
346
347
12.4M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
348
12.4M
      if (memory != NULL)
349
12.4M
        result = memory;
350
12.4M
    }
351
352
12.5M
  if (sortbuf_count > 0)
353
0
    abort ();
354
12.5M
  if (sortbuf != sortbuf_preallocated)
355
12.5M
    free (sortbuf);
356
357
12.5M
  *lengthp = length;
358
12.5M
  return result;
359
360
0
 fail:
361
0
  {
362
0
    int saved_errno = errno;
363
0
    if (sortbuf != sortbuf_preallocated)
364
0
      free (sortbuf);
365
0
    if (result != resultbuf)
366
0
      free (result);
367
0
    errno = saved_errno;
368
0
  }
369
  return NULL;
370
12.5M
}
u8_normalize
Line
Count
Source
21
4.12M
{
22
4.12M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
4.12M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
4.12M
  UNIT *result;
27
4.12M
  size_t length;
28
4.12M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
4.12M
  #define SORTBUF_PREALLOCATED 64
31
4.12M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
4.12M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
4.12M
  size_t sortbuf_allocated;
34
4.12M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
4.12M
  if (resultbuf == NULL)
38
4.12M
    {
39
4.12M
      result = NULL;
40
4.12M
      allocated = 0;
41
4.12M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
4.12M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
4.12M
  sortbuf = sortbuf_preallocated;
51
4.12M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
4.12M
  sortbuf_count = 0;
53
54
4.12M
  {
55
4.12M
    const UNIT *s_end = s + n;
56
57
4.12M
    for (;;)
58
18.8M
      {
59
18.8M
        int count;
60
18.8M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
18.8M
        int decomposed_count;
62
18.8M
        int i;
63
64
18.8M
        if (s < s_end)
65
14.6M
          {
66
            /* Fetch the next character.  */
67
14.6M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
14.6M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
38.8M
            for (int curr = 0; curr < decomposed_count; )
77
24.1M
              {
78
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
79
                   all elements are atomic.  */
80
24.1M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
81
24.1M
                int curr_decomposed_count;
82
83
24.1M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
84
24.1M
                if (curr_decomposed_count >= 0)
85
3.09M
                  {
86
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
87
                       decomposed[curr], making room.  It's not worth using
88
                       memcpy() here, since the counts are so small.  */
89
3.09M
                    int shift = curr_decomposed_count - 1;
90
91
3.09M
                    if (shift < 0)
92
0
                      abort ();
93
3.09M
                    if (shift > 0)
94
2.69M
                      {
95
2.69M
                        decomposed_count += shift;
96
2.69M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
97
0
                          abort ();
98
2.70M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
99
16.1k
                          decomposed[j + shift] = decomposed[j];
100
2.69M
                      }
101
12.5M
                    for (; shift >= 0; shift--)
102
9.42M
                      decomposed[curr + shift] = curr_decomposed[shift];
103
3.09M
                  }
104
21.0M
                else
105
21.0M
                  {
106
                    /* decomposed[curr] is atomic.  */
107
21.0M
                    curr++;
108
21.0M
                  }
109
24.1M
              }
110
14.6M
          }
111
4.12M
        else
112
4.12M
          {
113
4.12M
            count = 0;
114
4.12M
            decomposed_count = 0;
115
4.12M
          }
116
117
18.8M
        i = 0;
118
18.8M
        for (;;)
119
39.8M
          {
120
39.8M
            ucs4_t uc;
121
39.8M
            int ccc;
122
123
39.8M
            if (s < s_end)
124
35.7M
              {
125
                /* Fetch the next character from the decomposition.  */
126
35.7M
                if (i == decomposed_count)
127
14.6M
                  break;
128
21.0M
                uc = decomposed[i];
129
21.0M
                ccc = uc_combining_class (uc);
130
21.0M
              }
131
4.12M
            else
132
4.12M
              {
133
                /* End of string reached.  */
134
4.12M
                uc = 0;
135
4.12M
                ccc = 0;
136
4.12M
              }
137
138
25.1M
            if (ccc == 0)
139
22.2M
              {
140
                /* Apply the canonical ordering algorithm to the accumulated
141
                   sequence of characters.  */
142
22.2M
                if (sortbuf_count > 1)
143
1.60M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
144
1.60M
                                                           sortbuf + sortbuf_count);
145
146
22.2M
                if (composer != NULL)
147
22.2M
                  {
148
                    /* Attempt to combine decomposed characters, as specified
149
                       in the Unicode Standard Annex #15 "Unicode Normalization
150
                       Forms".  We need to check
151
                         1. whether the first accumulated character is a
152
                            "starter" (i.e. has ccc = 0).  This is usually the
153
                            case.  But when the string starts with a
154
                            non-starter, the sortbuf also starts with a
155
                            non-starter.  Btw, this check could also be
156
                            omitted, because the composition table has only
157
                            entries (code1, code2) for which code1 is a
158
                            starter; if the first accumulated character is not
159
                            a starter, no lookup will succeed.
160
                         2. If the sortbuf has more than one character, check
161
                            for each of these characters that are not "blocked"
162
                            from the starter (i.e. have a ccc that is higher
163
                            than the ccc of the previous character) whether it
164
                            can be combined with the first character.
165
                         3. If only one character is left in sortbuf, check
166
                            whether it can be combined with the next character
167
                            (also a starter).  */
168
22.2M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
169
18.1M
                      {
170
21.0M
                        for (size_t j = 1; j < sortbuf_count; )
171
2.84M
                          {
172
2.84M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
173
1.62M
                              {
174
1.62M
                                ucs4_t combined =
175
1.62M
                                  composer (sortbuf[0].code, sortbuf[j].code);
176
1.62M
                                if (combined)
177
1.59M
                                  {
178
1.59M
                                    sortbuf[0].code = combined;
179
                                    /* sortbuf[0].ccc = 0, still valid.  */
180
3.19M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
181
1.60M
                                      sortbuf[k - 1] = sortbuf[k];
182
1.59M
                                    sortbuf_count--;
183
1.59M
                                    continue;
184
1.59M
                                  }
185
1.62M
                              }
186
1.25M
                            j++;
187
1.25M
                          }
188
18.1M
                        if (s < s_end && sortbuf_count == 1)
189
14.0M
                          {
190
14.0M
                            ucs4_t combined =
191
14.0M
                              composer (sortbuf[0].code, uc);
192
14.0M
                            if (combined)
193
7.42k
                              {
194
7.42k
                                uc = combined;
195
7.42k
                                ccc = 0;
196
                                /* uc could be further combined with subsequent
197
                                   characters.  So don't put it into sortbuf[0] in
198
                                   this round, only in the next round.  */
199
7.42k
                                sortbuf_count = 0;
200
7.42k
                              }
201
14.0M
                          }
202
18.1M
                      }
203
22.2M
                  }
204
205
41.6M
                for (size_t j = 0; j < sortbuf_count; j++)
206
19.4M
                  {
207
19.4M
                    ucs4_t muc = sortbuf[j].code;
208
209
                    /* Append muc to the result accumulator.  */
210
19.4M
                    if (length < allocated)
211
15.2M
                      {
212
15.2M
                        int ret =
213
15.2M
                          U_UCTOMB (result + length, muc, allocated - length);
214
15.2M
                        if (ret == -1)
215
0
                          {
216
0
                            errno = EINVAL;
217
0
                            goto fail;
218
0
                          }
219
15.2M
                        if (ret >= 0)
220
15.2M
                          {
221
15.2M
                            length += ret;
222
15.2M
                            goto done_appending;
223
15.2M
                          }
224
15.2M
                      }
225
4.13M
                    {
226
4.13M
                      size_t old_allocated = allocated;
227
4.13M
                      size_t new_allocated = 2 * old_allocated;
228
4.13M
                      if (new_allocated < 64)
229
4.12M
                        new_allocated = 64;
230
4.13M
                      if (new_allocated < old_allocated) /* integer overflow? */
231
0
                        abort ();
232
4.13M
                      {
233
4.13M
                        UNIT *larger_result;
234
4.13M
                        if (result == NULL)
235
4.12M
                          {
236
4.12M
                            larger_result =
237
4.12M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
238
4.12M
                            if (larger_result == NULL)
239
0
                              {
240
0
                                errno = ENOMEM;
241
0
                                goto fail;
242
0
                              }
243
4.12M
                          }
244
12.5k
                        else if (result == resultbuf)
245
0
                          {
246
0
                            larger_result =
247
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
0
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
0
                            U_CPY (larger_result, resultbuf, length);
254
0
                          }
255
12.5k
                        else
256
12.5k
                          {
257
12.5k
                            larger_result =
258
12.5k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
259
12.5k
                            if (larger_result == NULL)
260
0
                              {
261
0
                                errno = ENOMEM;
262
0
                                goto fail;
263
0
                              }
264
12.5k
                          }
265
4.13M
                        result = larger_result;
266
4.13M
                        allocated = new_allocated;
267
4.13M
                        {
268
4.13M
                          int ret =
269
4.13M
                            U_UCTOMB (result + length, muc, allocated - length);
270
4.13M
                          if (ret == -1)
271
0
                            {
272
0
                              errno = EINVAL;
273
0
                              goto fail;
274
0
                            }
275
4.13M
                          if (ret < 0)
276
0
                            abort ();
277
4.13M
                          length += ret;
278
4.13M
                          goto done_appending;
279
4.13M
                        }
280
4.13M
                      }
281
4.13M
                    }
282
19.4M
                   done_appending: ;
283
19.4M
                  }
284
285
                /* sortbuf is now empty.  */
286
22.2M
                sortbuf_count = 0;
287
22.2M
              }
288
289
25.1M
            if (!(s < s_end))
290
              /* End of string reached.  */
291
4.12M
              break;
292
293
            /* Append (uc, ccc) to sortbuf.  */
294
21.0M
            if (sortbuf_count == sortbuf_allocated)
295
1.04k
              {
296
1.04k
                struct ucs4_with_ccc *new_sortbuf;
297
298
1.04k
                sortbuf_allocated = 2 * sortbuf_allocated;
299
1.04k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
300
0
                  abort ();
301
1.04k
                new_sortbuf =
302
1.04k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
303
1.04k
                if (new_sortbuf == NULL)
304
0
                  {
305
0
                    errno = ENOMEM;
306
0
                    goto fail;
307
0
                  }
308
1.04k
                memcpy (new_sortbuf, sortbuf,
309
1.04k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
310
1.04k
                if (sortbuf != sortbuf_preallocated)
311
1.04k
                  free (sortbuf);
312
1.04k
                sortbuf = new_sortbuf;
313
1.04k
              }
314
21.0M
            sortbuf[sortbuf_count].code = uc;
315
21.0M
            sortbuf[sortbuf_count].ccc = ccc;
316
21.0M
            sortbuf_count++;
317
318
21.0M
            i++;
319
21.0M
          }
320
321
18.8M
        if (!(s < s_end))
322
          /* End of string reached.  */
323
4.12M
          break;
324
325
14.6M
        s += count;
326
14.6M
      }
327
4.12M
  }
328
329
4.12M
  if (length == 0)
330
0
    {
331
0
      if (result == NULL)
332
0
        {
333
          /* Return a non-NULL value.  NULL means error.  */
334
0
          result = (UNIT *) malloc (1);
335
0
          if (result == NULL)
336
0
            {
337
0
              errno = ENOMEM;
338
0
              goto fail;
339
0
            }
340
0
        }
341
0
    }
342
4.12M
  else if (result != resultbuf && length < allocated)
343
4.12M
    {
344
      /* Shrink the allocated memory if possible.  */
345
4.12M
      UNIT *memory;
346
347
4.12M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
348
4.12M
      if (memory != NULL)
349
4.12M
        result = memory;
350
4.12M
    }
351
352
4.12M
  if (sortbuf_count > 0)
353
0
    abort ();
354
4.12M
  if (sortbuf != sortbuf_preallocated)
355
4.12M
    free (sortbuf);
356
357
4.12M
  *lengthp = length;
358
4.12M
  return result;
359
360
0
 fail:
361
0
  {
362
0
    int saved_errno = errno;
363
0
    if (sortbuf != sortbuf_preallocated)
364
0
      free (sortbuf);
365
0
    if (result != resultbuf)
366
0
      free (result);
367
0
    errno = saved_errno;
368
0
  }
369
  return NULL;
370
4.12M
}
u32_normalize
Line
Count
Source
21
8.42M
{
22
8.42M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
8.42M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
8.42M
  UNIT *result;
27
8.42M
  size_t length;
28
8.42M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
8.42M
  #define SORTBUF_PREALLOCATED 64
31
8.42M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
8.42M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
8.42M
  size_t sortbuf_allocated;
34
8.42M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
8.42M
  if (resultbuf == NULL)
38
8.42M
    {
39
8.42M
      result = NULL;
40
8.42M
      allocated = 0;
41
8.42M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
8.42M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
8.42M
  sortbuf = sortbuf_preallocated;
51
8.42M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
8.42M
  sortbuf_count = 0;
53
54
8.42M
  {
55
8.42M
    const UNIT *s_end = s + n;
56
57
8.42M
    for (;;)
58
20.4M
      {
59
20.4M
        int count;
60
20.4M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
20.4M
        int decomposed_count;
62
20.4M
        int i;
63
64
20.4M
        if (s < s_end)
65
12.0M
          {
66
            /* Fetch the next character.  */
67
12.0M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
12.0M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
24.2M
            for (int curr = 0; curr < decomposed_count; )
77
12.1M
              {
78
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
79
                   all elements are atomic.  */
80
12.1M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
81
12.1M
                int curr_decomposed_count;
82
83
12.1M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
84
12.1M
                if (curr_decomposed_count >= 0)
85
34.7k
                  {
86
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
87
                       decomposed[curr], making room.  It's not worth using
88
                       memcpy() here, since the counts are so small.  */
89
34.7k
                    int shift = curr_decomposed_count - 1;
90
91
34.7k
                    if (shift < 0)
92
0
                      abort ();
93
34.7k
                    if (shift > 0)
94
34.3k
                      {
95
34.3k
                        decomposed_count += shift;
96
34.3k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
97
0
                          abort ();
98
44.1k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
99
9.76k
                          decomposed[j + shift] = decomposed[j];
100
34.3k
                      }
101
103k
                    for (; shift >= 0; shift--)
102
69.0k
                      decomposed[curr + shift] = curr_decomposed[shift];
103
34.7k
                  }
104
12.1M
                else
105
12.1M
                  {
106
                    /* decomposed[curr] is atomic.  */
107
12.1M
                    curr++;
108
12.1M
                  }
109
12.1M
              }
110
12.0M
          }
111
8.42M
        else
112
8.42M
          {
113
8.42M
            count = 0;
114
8.42M
            decomposed_count = 0;
115
8.42M
          }
116
117
20.4M
        i = 0;
118
20.4M
        for (;;)
119
32.6M
          {
120
32.6M
            ucs4_t uc;
121
32.6M
            int ccc;
122
123
32.6M
            if (s < s_end)
124
24.1M
              {
125
                /* Fetch the next character from the decomposition.  */
126
24.1M
                if (i == decomposed_count)
127
12.0M
                  break;
128
12.1M
                uc = decomposed[i];
129
12.1M
                ccc = uc_combining_class (uc);
130
12.1M
              }
131
8.42M
            else
132
8.42M
              {
133
                /* End of string reached.  */
134
8.42M
                uc = 0;
135
8.42M
                ccc = 0;
136
8.42M
              }
137
138
20.5M
            if (ccc == 0)
139
20.4M
              {
140
                /* Apply the canonical ordering algorithm to the accumulated
141
                   sequence of characters.  */
142
20.4M
                if (sortbuf_count > 1)
143
26.0k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
144
26.0k
                                                           sortbuf + sortbuf_count);
145
146
20.4M
                if (composer != NULL)
147
20.4M
                  {
148
                    /* Attempt to combine decomposed characters, as specified
149
                       in the Unicode Standard Annex #15 "Unicode Normalization
150
                       Forms".  We need to check
151
                         1. whether the first accumulated character is a
152
                            "starter" (i.e. has ccc = 0).  This is usually the
153
                            case.  But when the string starts with a
154
                            non-starter, the sortbuf also starts with a
155
                            non-starter.  Btw, this check could also be
156
                            omitted, because the composition table has only
157
                            entries (code1, code2) for which code1 is a
158
                            starter; if the first accumulated character is not
159
                            a starter, no lookup will succeed.
160
                         2. If the sortbuf has more than one character, check
161
                            for each of these characters that are not "blocked"
162
                            from the starter (i.e. have a ccc that is higher
163
                            than the ccc of the previous character) whether it
164
                            can be combined with the first character.
165
                         3. If only one character is left in sortbuf, check
166
                            whether it can be combined with the next character
167
                            (also a starter).  */
168
20.4M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
169
11.9M
                      {
170
12.0M
                        for (size_t j = 1; j < sortbuf_count; )
171
100k
                          {
172
100k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
173
37.5k
                              {
174
37.5k
                                ucs4_t combined =
175
37.5k
                                  composer (sortbuf[0].code, sortbuf[j].code);
176
37.5k
                                if (combined)
177
23.0k
                                  {
178
23.0k
                                    sortbuf[0].code = combined;
179
                                    /* sortbuf[0].ccc = 0, still valid.  */
180
36.7k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
181
13.6k
                                      sortbuf[k - 1] = sortbuf[k];
182
23.0k
                                    sortbuf_count--;
183
23.0k
                                    continue;
184
23.0k
                                  }
185
37.5k
                              }
186
77.8k
                            j++;
187
77.8k
                          }
188
11.9M
                        if (s < s_end && sortbuf_count == 1)
189
3.66M
                          {
190
3.66M
                            ucs4_t combined =
191
3.66M
                              composer (sortbuf[0].code, uc);
192
3.66M
                            if (combined)
193
9.25k
                              {
194
9.25k
                                uc = combined;
195
9.25k
                                ccc = 0;
196
                                /* uc could be further combined with subsequent
197
                                   characters.  So don't put it into sortbuf[0] in
198
                                   this round, only in the next round.  */
199
9.25k
                                sortbuf_count = 0;
200
9.25k
                              }
201
3.66M
                          }
202
11.9M
                      }
203
20.4M
                  }
204
205
32.4M
                for (size_t j = 0; j < sortbuf_count; j++)
206
12.0M
                  {
207
12.0M
                    ucs4_t muc = sortbuf[j].code;
208
209
                    /* Append muc to the result accumulator.  */
210
12.0M
                    if (length < allocated)
211
3.74M
                      {
212
3.74M
                        int ret =
213
3.74M
                          U_UCTOMB (result + length, muc, allocated - length);
214
3.74M
                        if (ret == -1)
215
0
                          {
216
0
                            errno = EINVAL;
217
0
                            goto fail;
218
0
                          }
219
3.74M
                        if (ret >= 0)
220
3.74M
                          {
221
3.74M
                            length += ret;
222
3.74M
                            goto done_appending;
223
3.74M
                          }
224
3.74M
                      }
225
8.33M
                    {
226
8.33M
                      size_t old_allocated = allocated;
227
8.33M
                      size_t new_allocated = 2 * old_allocated;
228
8.33M
                      if (new_allocated < 64)
229
8.32M
                        new_allocated = 64;
230
8.33M
                      if (new_allocated < old_allocated) /* integer overflow? */
231
0
                        abort ();
232
8.33M
                      {
233
8.33M
                        UNIT *larger_result;
234
8.33M
                        if (result == NULL)
235
8.32M
                          {
236
8.32M
                            larger_result =
237
8.32M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
238
8.32M
                            if (larger_result == NULL)
239
0
                              {
240
0
                                errno = ENOMEM;
241
0
                                goto fail;
242
0
                              }
243
8.32M
                          }
244
3.01k
                        else if (result == resultbuf)
245
0
                          {
246
0
                            larger_result =
247
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
0
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
0
                            U_CPY (larger_result, resultbuf, length);
254
0
                          }
255
3.01k
                        else
256
3.01k
                          {
257
3.01k
                            larger_result =
258
3.01k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
259
3.01k
                            if (larger_result == NULL)
260
0
                              {
261
0
                                errno = ENOMEM;
262
0
                                goto fail;
263
0
                              }
264
3.01k
                          }
265
8.33M
                        result = larger_result;
266
8.33M
                        allocated = new_allocated;
267
8.33M
                        {
268
8.33M
                          int ret =
269
8.33M
                            U_UCTOMB (result + length, muc, allocated - length);
270
8.33M
                          if (ret == -1)
271
0
                            {
272
0
                              errno = EINVAL;
273
0
                              goto fail;
274
0
                            }
275
8.33M
                          if (ret < 0)
276
0
                            abort ();
277
8.33M
                          length += ret;
278
8.33M
                          goto done_appending;
279
8.33M
                        }
280
8.33M
                      }
281
8.33M
                    }
282
12.0M
                   done_appending: ;
283
12.0M
                  }
284
285
                /* sortbuf is now empty.  */
286
20.4M
                sortbuf_count = 0;
287
20.4M
              }
288
289
20.5M
            if (!(s < s_end))
290
              /* End of string reached.  */
291
8.42M
              break;
292
293
            /* Append (uc, ccc) to sortbuf.  */
294
12.1M
            if (sortbuf_count == sortbuf_allocated)
295
826
              {
296
826
                struct ucs4_with_ccc *new_sortbuf;
297
298
826
                sortbuf_allocated = 2 * sortbuf_allocated;
299
826
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
300
0
                  abort ();
301
826
                new_sortbuf =
302
826
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
303
826
                if (new_sortbuf == NULL)
304
0
                  {
305
0
                    errno = ENOMEM;
306
0
                    goto fail;
307
0
                  }
308
826
                memcpy (new_sortbuf, sortbuf,
309
826
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
310
826
                if (sortbuf != sortbuf_preallocated)
311
826
                  free (sortbuf);
312
826
                sortbuf = new_sortbuf;
313
826
              }
314
12.1M
            sortbuf[sortbuf_count].code = uc;
315
12.1M
            sortbuf[sortbuf_count].ccc = ccc;
316
12.1M
            sortbuf_count++;
317
318
12.1M
            i++;
319
12.1M
          }
320
321
20.4M
        if (!(s < s_end))
322
          /* End of string reached.  */
323
8.42M
          break;
324
325
12.0M
        s += count;
326
12.0M
      }
327
8.42M
  }
328
329
8.42M
  if (length == 0)
330
91.3k
    {
331
91.3k
      if (result == NULL)
332
91.3k
        {
333
          /* Return a non-NULL value.  NULL means error.  */
334
91.3k
          result = (UNIT *) malloc (1);
335
91.3k
          if (result == NULL)
336
0
            {
337
0
              errno = ENOMEM;
338
0
              goto fail;
339
0
            }
340
91.3k
        }
341
91.3k
    }
342
8.32M
  else if (result != resultbuf && length < allocated)
343
8.32M
    {
344
      /* Shrink the allocated memory if possible.  */
345
8.32M
      UNIT *memory;
346
347
8.32M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
348
8.32M
      if (memory != NULL)
349
8.32M
        result = memory;
350
8.32M
    }
351
352
8.42M
  if (sortbuf_count > 0)
353
0
    abort ();
354
8.42M
  if (sortbuf != sortbuf_preallocated)
355
8.42M
    free (sortbuf);
356
357
8.42M
  *lengthp = length;
358
8.42M
  return result;
359
360
0
 fail:
361
0
  {
362
0
    int saved_errno = errno;
363
0
    if (sortbuf != sortbuf_preallocated)
364
0
      free (sortbuf);
365
0
    if (result != resultbuf)
366
0
      free (result);
367
0
    errno = saved_errno;
368
0
  }
369
  return NULL;
370
8.42M
}