Coverage Report

Created: 2026-04-06 06:30

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
9.20M
{
22
9.20M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
9.20M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
9.20M
  UNIT *result;
27
9.20M
  size_t allocated;
28
9.20M
  if (resultbuf == NULL)
29
9.20M
    {
30
9.20M
      result = NULL;
31
9.20M
      allocated = 0;
32
9.20M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
9.20M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
9.20M
  #define SORTBUF_PREALLOCATED 64
42
9.20M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
9.20M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
9.20M
    sortbuf_preallocated;
45
9.20M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
9.20M
  size_t sortbuf_count = 0;
47
48
9.20M
  {
49
9.20M
    const UNIT *s_end = s + n;
50
51
9.20M
    for (;;)
52
30.2M
      {
53
30.2M
        int count;
54
30.2M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
30.2M
        int decomposed_count;
56
57
30.2M
        if (s < s_end)
58
21.0M
          {
59
            /* Fetch the next character.  */
60
21.0M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
21.0M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
50.5M
            for (int curr = 0; curr < decomposed_count; )
70
29.5M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
29.5M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
29.5M
                int curr_decomposed_count;
75
76
29.5M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
29.5M
                if (curr_decomposed_count >= 0)
78
2.96M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
2.96M
                    int shift = curr_decomposed_count - 1;
83
84
2.96M
                    if (shift < 0)
85
0
                      abort ();
86
2.96M
                    if (shift > 0)
87
2.63M
                      {
88
2.63M
                        decomposed_count += shift;
89
2.63M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.66M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
26.7k
                          decomposed[j + shift] = decomposed[j];
93
2.63M
                      }
94
11.4M
                    for (; shift >= 0; shift--)
95
8.49M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
2.96M
                  }
97
26.5M
                else
98
26.5M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
26.5M
                    curr++;
101
26.5M
                  }
102
29.5M
              }
103
21.0M
          }
104
9.20M
        else
105
9.20M
          {
106
9.20M
            count = 0;
107
9.20M
            decomposed_count = 0;
108
9.20M
          }
109
110
30.2M
        int i = 0;
111
30.2M
        for (;;)
112
56.8M
          {
113
56.8M
            ucs4_t uc;
114
56.8M
            int ccc;
115
116
56.8M
            if (s < s_end)
117
47.5M
              {
118
                /* Fetch the next character from the decomposition.  */
119
47.5M
                if (i == decomposed_count)
120
21.0M
                  break;
121
26.5M
                uc = decomposed[i];
122
26.5M
                ccc = uc_combining_class (uc);
123
26.5M
              }
124
9.20M
            else
125
9.20M
              {
126
                /* End of string reached.  */
127
9.20M
                uc = 0;
128
9.20M
                ccc = 0;
129
9.20M
              }
130
131
35.7M
            if (ccc == 0)
132
32.7M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
32.7M
                if (sortbuf_count > 1)
136
1.53M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.53M
                                                           sortbuf + sortbuf_count);
138
139
32.7M
                if (composer != NULL)
140
32.7M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
32.7M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
23.5M
                      {
163
26.5M
                        for (size_t j = 1; j < sortbuf_count; )
164
3.03M
                          {
165
3.03M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.56M
                              {
167
1.56M
                                ucs4_t combined =
168
1.56M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.56M
                                if (combined)
170
1.51M
                                  {
171
1.51M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.38M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
1.86M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.51M
                                    sortbuf_count--;
176
1.51M
                                    continue;
177
1.51M
                                  }
178
1.56M
                              }
179
1.52M
                            j++;
180
1.52M
                          }
181
23.5M
                        if (s < s_end && sortbuf_count == 1)
182
14.3M
                          {
183
14.3M
                            ucs4_t combined =
184
14.3M
                              composer (sortbuf[0].code, uc);
185
14.3M
                            if (combined)
186
18.6k
                              {
187
18.6k
                                uc = combined;
188
18.6k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
18.6k
                                sortbuf_count = 0;
193
18.6k
                              }
194
14.3M
                          }
195
23.5M
                      }
196
32.7M
                  }
197
198
57.7M
                for (size_t j = 0; j < sortbuf_count; j++)
199
25.0M
                  {
200
25.0M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
25.0M
                    if (length < allocated)
204
15.8M
                      {
205
15.8M
                        int ret =
206
15.8M
                          U_UCTOMB (result + length, muc, allocated - length);
207
15.8M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
15.8M
                        if (ret >= 0)
213
15.8M
                          {
214
15.8M
                            length += ret;
215
15.8M
                            goto done_appending;
216
15.8M
                          }
217
15.8M
                      }
218
9.15M
                    {
219
9.15M
                      size_t old_allocated = allocated;
220
9.15M
                      size_t new_allocated = 2 * old_allocated;
221
9.15M
                      if (new_allocated < 64)
222
9.14M
                        new_allocated = 64;
223
9.15M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
9.15M
                      {
226
9.15M
                        UNIT *larger_result;
227
9.15M
                        if (result == NULL)
228
9.14M
                          {
229
9.14M
                            larger_result =
230
9.14M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
9.14M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
9.14M
                          }
237
14.8k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
14.8k
                        else
249
14.8k
                          {
250
14.8k
                            larger_result =
251
14.8k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
14.8k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
14.8k
                          }
258
9.15M
                        result = larger_result;
259
9.15M
                        allocated = new_allocated;
260
9.15M
                        {
261
9.15M
                          int ret =
262
9.15M
                            U_UCTOMB (result + length, muc, allocated - length);
263
9.15M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
9.15M
                          if (ret < 0)
269
0
                            abort ();
270
9.15M
                          length += ret;
271
9.15M
                          goto done_appending;
272
9.15M
                        }
273
9.15M
                      }
274
9.15M
                    }
275
25.0M
                   done_appending: ;
276
25.0M
                  }
277
278
                /* sortbuf is now empty.  */
279
32.7M
                sortbuf_count = 0;
280
32.7M
              }
281
282
35.7M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
9.20M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
26.5M
            if (sortbuf_count == sortbuf_allocated)
288
1.93k
              {
289
1.93k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.93k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.93k
                struct ucs4_with_ccc *new_sortbuf =
293
1.93k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.93k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.93k
                memcpy (new_sortbuf, sortbuf,
300
1.93k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.93k
                if (sortbuf != sortbuf_preallocated)
302
1.93k
                  free (sortbuf);
303
1.93k
                sortbuf = new_sortbuf;
304
1.93k
              }
305
26.5M
            sortbuf[sortbuf_count].code = uc;
306
26.5M
            sortbuf[sortbuf_count].ccc = ccc;
307
26.5M
            sortbuf_count++;
308
309
26.5M
            i++;
310
26.5M
          }
311
312
30.2M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
9.20M
          break;
315
316
21.0M
        s += count;
317
21.0M
      }
318
9.20M
  }
319
320
9.20M
  if (length == 0)
321
65.3k
    {
322
65.3k
      if (result == NULL)
323
65.3k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
65.3k
          result = (UNIT *) malloc (1);
326
65.3k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
65.3k
        }
332
65.3k
    }
333
9.14M
  else if (result != resultbuf && length < allocated)
334
9.13M
    {
335
      /* Shrink the allocated memory if possible.  */
336
9.13M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
9.13M
      if (memory != NULL)
338
9.13M
        result = memory;
339
9.13M
    }
340
341
9.20M
  if (sortbuf_count > 0)
342
0
    abort ();
343
9.20M
  if (sortbuf != sortbuf_preallocated)
344
9.20M
    free (sortbuf);
345
346
9.20M
  *lengthp = length;
347
9.20M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
9.20M
}
u8_normalize
Line
Count
Source
21
3.08M
{
22
3.08M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
3.08M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
3.08M
  UNIT *result;
27
3.08M
  size_t allocated;
28
3.08M
  if (resultbuf == NULL)
29
3.08M
    {
30
3.08M
      result = NULL;
31
3.08M
      allocated = 0;
32
3.08M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
3.08M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
3.08M
  #define SORTBUF_PREALLOCATED 64
42
3.08M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
3.08M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
3.08M
    sortbuf_preallocated;
45
3.08M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
3.08M
  size_t sortbuf_count = 0;
47
48
3.08M
  {
49
3.08M
    const UNIT *s_end = s + n;
50
51
3.08M
    for (;;)
52
15.2M
      {
53
15.2M
        int count;
54
15.2M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
15.2M
        int decomposed_count;
56
57
15.2M
        if (s < s_end)
58
12.1M
          {
59
            /* Fetch the next character.  */
60
12.1M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
12.1M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
32.7M
            for (int curr = 0; curr < decomposed_count; )
70
20.5M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
20.5M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
20.5M
                int curr_decomposed_count;
75
76
20.5M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
20.5M
                if (curr_decomposed_count >= 0)
78
2.92M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
2.92M
                    int shift = curr_decomposed_count - 1;
83
84
2.92M
                    if (shift < 0)
85
0
                      abort ();
86
2.92M
                    if (shift > 0)
87
2.60M
                      {
88
2.60M
                        decomposed_count += shift;
89
2.60M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.62M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
16.8k
                          decomposed[j + shift] = decomposed[j];
93
2.60M
                      }
94
11.3M
                    for (; shift >= 0; shift--)
95
8.42M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
2.92M
                  }
97
17.6M
                else
98
17.6M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
17.6M
                    curr++;
101
17.6M
                  }
102
20.5M
              }
103
12.1M
          }
104
3.08M
        else
105
3.08M
          {
106
3.08M
            count = 0;
107
3.08M
            decomposed_count = 0;
108
3.08M
          }
109
110
15.2M
        int i = 0;
111
15.2M
        for (;;)
112
32.8M
          {
113
32.8M
            ucs4_t uc;
114
32.8M
            int ccc;
115
116
32.8M
            if (s < s_end)
117
29.7M
              {
118
                /* Fetch the next character from the decomposition.  */
119
29.7M
                if (i == decomposed_count)
120
12.1M
                  break;
121
17.6M
                uc = decomposed[i];
122
17.6M
                ccc = uc_combining_class (uc);
123
17.6M
              }
124
3.08M
            else
125
3.08M
              {
126
                /* End of string reached.  */
127
3.08M
                uc = 0;
128
3.08M
                ccc = 0;
129
3.08M
              }
130
131
20.7M
            if (ccc == 0)
132
17.7M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
17.7M
                if (sortbuf_count > 1)
136
1.50M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.50M
                                                           sortbuf + sortbuf_count);
138
139
17.7M
                if (composer != NULL)
140
17.7M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
17.7M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
14.6M
                      {
163
17.6M
                        for (size_t j = 1; j < sortbuf_count; )
164
2.93M
                          {
165
2.93M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.52M
                              {
167
1.52M
                                ucs4_t combined =
168
1.52M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.52M
                                if (combined)
170
1.49M
                                  {
171
1.49M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.34M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
1.85M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.49M
                                    sortbuf_count--;
176
1.49M
                                    continue;
177
1.49M
                                  }
178
1.52M
                              }
179
1.44M
                            j++;
180
1.44M
                          }
181
14.6M
                        if (s < s_end && sortbuf_count == 1)
182
11.5M
                          {
183
11.5M
                            ucs4_t combined =
184
11.5M
                              composer (sortbuf[0].code, uc);
185
11.5M
                            if (combined)
186
8.31k
                              {
187
8.31k
                                uc = combined;
188
8.31k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
8.31k
                                sortbuf_count = 0;
193
8.31k
                              }
194
11.5M
                          }
195
14.6M
                      }
196
17.7M
                  }
197
198
33.9M
                for (size_t j = 0; j < sortbuf_count; j++)
199
16.1M
                  {
200
16.1M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
16.1M
                    if (length < allocated)
204
13.0M
                      {
205
13.0M
                        int ret =
206
13.0M
                          U_UCTOMB (result + length, muc, allocated - length);
207
13.0M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
13.0M
                        if (ret >= 0)
213
13.0M
                          {
214
13.0M
                            length += ret;
215
13.0M
                            goto done_appending;
216
13.0M
                          }
217
13.0M
                      }
218
3.09M
                    {
219
3.09M
                      size_t old_allocated = allocated;
220
3.09M
                      size_t new_allocated = 2 * old_allocated;
221
3.09M
                      if (new_allocated < 64)
222
3.08M
                        new_allocated = 64;
223
3.09M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
3.09M
                      {
226
3.09M
                        UNIT *larger_result;
227
3.09M
                        if (result == NULL)
228
3.08M
                          {
229
3.08M
                            larger_result =
230
3.08M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
3.08M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
3.08M
                          }
237
12.0k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
12.0k
                        else
249
12.0k
                          {
250
12.0k
                            larger_result =
251
12.0k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
12.0k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
12.0k
                          }
258
3.09M
                        result = larger_result;
259
3.09M
                        allocated = new_allocated;
260
3.09M
                        {
261
3.09M
                          int ret =
262
3.09M
                            U_UCTOMB (result + length, muc, allocated - length);
263
3.09M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
3.09M
                          if (ret < 0)
269
0
                            abort ();
270
3.09M
                          length += ret;
271
3.09M
                          goto done_appending;
272
3.09M
                        }
273
3.09M
                      }
274
3.09M
                    }
275
16.1M
                   done_appending: ;
276
16.1M
                  }
277
278
                /* sortbuf is now empty.  */
279
17.7M
                sortbuf_count = 0;
280
17.7M
              }
281
282
20.7M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
3.08M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
17.6M
            if (sortbuf_count == sortbuf_allocated)
288
1.11k
              {
289
1.11k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.11k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.11k
                struct ucs4_with_ccc *new_sortbuf =
293
1.11k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.11k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.11k
                memcpy (new_sortbuf, sortbuf,
300
1.11k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.11k
                if (sortbuf != sortbuf_preallocated)
302
1.11k
                  free (sortbuf);
303
1.11k
                sortbuf = new_sortbuf;
304
1.11k
              }
305
17.6M
            sortbuf[sortbuf_count].code = uc;
306
17.6M
            sortbuf[sortbuf_count].ccc = ccc;
307
17.6M
            sortbuf_count++;
308
309
17.6M
            i++;
310
17.6M
          }
311
312
15.2M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
3.08M
          break;
315
316
12.1M
        s += count;
317
12.1M
      }
318
3.08M
  }
319
320
3.08M
  if (length == 0)
321
0
    {
322
0
      if (result == NULL)
323
0
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
0
          result = (UNIT *) malloc (1);
326
0
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
0
        }
332
0
    }
333
3.08M
  else if (result != resultbuf && length < allocated)
334
3.08M
    {
335
      /* Shrink the allocated memory if possible.  */
336
3.08M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
3.08M
      if (memory != NULL)
338
3.08M
        result = memory;
339
3.08M
    }
340
341
3.08M
  if (sortbuf_count > 0)
342
0
    abort ();
343
3.08M
  if (sortbuf != sortbuf_preallocated)
344
3.08M
    free (sortbuf);
345
346
3.08M
  *lengthp = length;
347
3.08M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
3.08M
}
u32_normalize
Line
Count
Source
21
6.12M
{
22
6.12M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
6.12M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
6.12M
  UNIT *result;
27
6.12M
  size_t allocated;
28
6.12M
  if (resultbuf == NULL)
29
6.12M
    {
30
6.12M
      result = NULL;
31
6.12M
      allocated = 0;
32
6.12M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
6.12M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
6.12M
  #define SORTBUF_PREALLOCATED 64
42
6.12M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
6.12M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
6.12M
    sortbuf_preallocated;
45
6.12M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
6.12M
  size_t sortbuf_count = 0;
47
48
6.12M
  {
49
6.12M
    const UNIT *s_end = s + n;
50
51
6.12M
    for (;;)
52
15.0M
      {
53
15.0M
        int count;
54
15.0M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
15.0M
        int decomposed_count;
56
57
15.0M
        if (s < s_end)
58
8.89M
          {
59
            /* Fetch the next character.  */
60
8.89M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
8.89M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
17.8M
            for (int curr = 0; curr < decomposed_count; )
70
8.95M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
8.95M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
8.95M
                int curr_decomposed_count;
75
76
8.95M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
8.95M
                if (curr_decomposed_count >= 0)
78
32.4k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
32.4k
                    int shift = curr_decomposed_count - 1;
83
84
32.4k
                    if (shift < 0)
85
0
                      abort ();
86
32.4k
                    if (shift > 0)
87
32.1k
                      {
88
32.1k
                        decomposed_count += shift;
89
32.1k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
42.1k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
9.92k
                          decomposed[j + shift] = decomposed[j];
93
32.1k
                      }
94
97.1k
                    for (; shift >= 0; shift--)
95
64.6k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
32.4k
                  }
97
8.92M
                else
98
8.92M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
8.92M
                    curr++;
101
8.92M
                  }
102
8.95M
              }
103
8.89M
          }
104
6.12M
        else
105
6.12M
          {
106
6.12M
            count = 0;
107
6.12M
            decomposed_count = 0;
108
6.12M
          }
109
110
15.0M
        int i = 0;
111
15.0M
        for (;;)
112
23.9M
          {
113
23.9M
            ucs4_t uc;
114
23.9M
            int ccc;
115
116
23.9M
            if (s < s_end)
117
17.8M
              {
118
                /* Fetch the next character from the decomposition.  */
119
17.8M
                if (i == decomposed_count)
120
8.89M
                  break;
121
8.92M
                uc = decomposed[i];
122
8.92M
                ccc = uc_combining_class (uc);
123
8.92M
              }
124
6.12M
            else
125
6.12M
              {
126
                /* End of string reached.  */
127
6.12M
                uc = 0;
128
6.12M
                ccc = 0;
129
6.12M
              }
130
131
15.0M
            if (ccc == 0)
132
14.9M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
14.9M
                if (sortbuf_count > 1)
136
23.3k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
23.3k
                                                           sortbuf + sortbuf_count);
138
139
14.9M
                if (composer != NULL)
140
14.9M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
14.9M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
8.81M
                      {
163
8.91M
                        for (size_t j = 1; j < sortbuf_count; )
164
98.3k
                          {
165
98.3k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
33.9k
                              {
167
33.9k
                                ucs4_t combined =
168
33.9k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
33.9k
                                if (combined)
170
20.1k
                                  {
171
20.1k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
33.1k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
12.9k
                                      sortbuf[k - 1] = sortbuf[k];
175
20.1k
                                    sortbuf_count--;
176
20.1k
                                    continue;
177
20.1k
                                  }
178
33.9k
                              }
179
78.1k
                            j++;
180
78.1k
                          }
181
8.81M
                        if (s < s_end && sortbuf_count == 1)
182
2.75M
                          {
183
2.75M
                            ucs4_t combined =
184
2.75M
                              composer (sortbuf[0].code, uc);
185
2.75M
                            if (combined)
186
10.2k
                              {
187
10.2k
                                uc = combined;
188
10.2k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
10.2k
                                sortbuf_count = 0;
193
10.2k
                              }
194
2.75M
                          }
195
8.81M
                      }
196
14.9M
                  }
197
198
23.8M
                for (size_t j = 0; j < sortbuf_count; j++)
199
8.89M
                  {
200
8.89M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
8.89M
                    if (length < allocated)
204
2.83M
                      {
205
2.83M
                        int ret =
206
2.83M
                          U_UCTOMB (result + length, muc, allocated - length);
207
2.83M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
2.83M
                        if (ret >= 0)
213
2.83M
                          {
214
2.83M
                            length += ret;
215
2.83M
                            goto done_appending;
216
2.83M
                          }
217
2.83M
                      }
218
6.05M
                    {
219
6.05M
                      size_t old_allocated = allocated;
220
6.05M
                      size_t new_allocated = 2 * old_allocated;
221
6.05M
                      if (new_allocated < 64)
222
6.05M
                        new_allocated = 64;
223
6.05M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
6.05M
                      {
226
6.05M
                        UNIT *larger_result;
227
6.05M
                        if (result == NULL)
228
6.05M
                          {
229
6.05M
                            larger_result =
230
6.05M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
6.05M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
6.05M
                          }
237
2.75k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
2.75k
                        else
249
2.75k
                          {
250
2.75k
                            larger_result =
251
2.75k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
2.75k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
2.75k
                          }
258
6.05M
                        result = larger_result;
259
6.05M
                        allocated = new_allocated;
260
6.05M
                        {
261
6.05M
                          int ret =
262
6.05M
                            U_UCTOMB (result + length, muc, allocated - length);
263
6.05M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
6.05M
                          if (ret < 0)
269
0
                            abort ();
270
6.05M
                          length += ret;
271
6.05M
                          goto done_appending;
272
6.05M
                        }
273
6.05M
                      }
274
6.05M
                    }
275
8.89M
                   done_appending: ;
276
8.89M
                  }
277
278
                /* sortbuf is now empty.  */
279
14.9M
                sortbuf_count = 0;
280
14.9M
              }
281
282
15.0M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
6.12M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
8.92M
            if (sortbuf_count == sortbuf_allocated)
288
818
              {
289
818
                sortbuf_allocated = 2 * sortbuf_allocated;
290
818
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
818
                struct ucs4_with_ccc *new_sortbuf =
293
818
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
818
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
818
                memcpy (new_sortbuf, sortbuf,
300
818
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
818
                if (sortbuf != sortbuf_preallocated)
302
818
                  free (sortbuf);
303
818
                sortbuf = new_sortbuf;
304
818
              }
305
8.92M
            sortbuf[sortbuf_count].code = uc;
306
8.92M
            sortbuf[sortbuf_count].ccc = ccc;
307
8.92M
            sortbuf_count++;
308
309
8.92M
            i++;
310
8.92M
          }
311
312
15.0M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
6.12M
          break;
315
316
8.89M
        s += count;
317
8.89M
      }
318
6.12M
  }
319
320
6.12M
  if (length == 0)
321
65.3k
    {
322
65.3k
      if (result == NULL)
323
65.3k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
65.3k
          result = (UNIT *) malloc (1);
326
65.3k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
65.3k
        }
332
65.3k
    }
333
6.05M
  else if (result != resultbuf && length < allocated)
334
6.05M
    {
335
      /* Shrink the allocated memory if possible.  */
336
6.05M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
6.05M
      if (memory != NULL)
338
6.05M
        result = memory;
339
6.05M
    }
340
341
6.12M
  if (sortbuf_count > 0)
342
0
    abort ();
343
6.12M
  if (sortbuf != sortbuf_preallocated)
344
6.12M
    free (sortbuf);
345
346
6.12M
  *lengthp = length;
347
6.12M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
6.12M
}