Coverage Report

Created: 2026-03-31 06:55

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
9.20M
{
22
9.20M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
9.20M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
9.20M
  UNIT *result;
27
9.20M
  size_t allocated;
28
9.20M
  if (resultbuf == NULL)
29
9.20M
    {
30
9.20M
      result = NULL;
31
9.20M
      allocated = 0;
32
9.20M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
9.20M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
9.20M
  #define SORTBUF_PREALLOCATED 64
42
9.20M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
9.20M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
9.20M
    sortbuf_preallocated;
45
9.20M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
9.20M
  size_t sortbuf_count = 0;
47
48
9.20M
  {
49
9.20M
    const UNIT *s_end = s + n;
50
51
9.20M
    for (;;)
52
30.6M
      {
53
30.6M
        int count;
54
30.6M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
30.6M
        int decomposed_count;
56
57
30.6M
        if (s < s_end)
58
21.4M
          {
59
            /* Fetch the next character.  */
60
21.4M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
21.4M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
52.2M
            for (int curr = 0; curr < decomposed_count; )
70
30.8M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
30.8M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
30.8M
                int curr_decomposed_count;
75
76
30.8M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
30.8M
                if (curr_decomposed_count >= 0)
78
3.15M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
3.15M
                    int shift = curr_decomposed_count - 1;
83
84
3.15M
                    if (shift < 0)
85
0
                      abort ();
86
3.15M
                    if (shift > 0)
87
2.79M
                      {
88
2.79M
                        decomposed_count += shift;
89
2.79M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.82M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
27.6k
                          decomposed[j + shift] = decomposed[j];
93
2.79M
                      }
94
12.5M
                    for (; shift >= 0; shift--)
95
9.35M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
3.15M
                  }
97
27.6M
                else
98
27.6M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
27.6M
                    curr++;
101
27.6M
                  }
102
30.8M
              }
103
21.4M
          }
104
9.20M
        else
105
9.20M
          {
106
9.20M
            count = 0;
107
9.20M
            decomposed_count = 0;
108
9.20M
          }
109
110
30.6M
        int i = 0;
111
30.6M
        for (;;)
112
58.3M
          {
113
58.3M
            ucs4_t uc;
114
58.3M
            int ccc;
115
116
58.3M
            if (s < s_end)
117
49.1M
              {
118
                /* Fetch the next character from the decomposition.  */
119
49.1M
                if (i == decomposed_count)
120
21.4M
                  break;
121
27.6M
                uc = decomposed[i];
122
27.6M
                ccc = uc_combining_class (uc);
123
27.6M
              }
124
9.20M
            else
125
9.20M
              {
126
                /* End of string reached.  */
127
9.20M
                uc = 0;
128
9.20M
                ccc = 0;
129
9.20M
              }
130
131
36.8M
            if (ccc == 0)
132
33.6M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
33.6M
                if (sortbuf_count > 1)
136
1.58M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.58M
                                                           sortbuf + sortbuf_count);
138
139
33.6M
                if (composer != NULL)
140
33.6M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
33.6M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
24.4M
                      {
163
27.6M
                        for (size_t j = 1; j < sortbuf_count; )
164
3.20M
                          {
165
3.20M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.62M
                              {
167
1.62M
                                ucs4_t combined =
168
1.62M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.62M
                                if (combined)
170
1.56M
                                  {
171
1.56M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.74M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
2.17M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.56M
                                    sortbuf_count--;
176
1.56M
                                    continue;
177
1.56M
                                  }
178
1.62M
                              }
179
1.63M
                            j++;
180
1.63M
                          }
181
24.4M
                        if (s < s_end && sortbuf_count == 1)
182
15.2M
                          {
183
15.2M
                            ucs4_t combined =
184
15.2M
                              composer (sortbuf[0].code, uc);
185
15.2M
                            if (combined)
186
19.1k
                              {
187
19.1k
                                uc = combined;
188
19.1k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
19.1k
                                sortbuf_count = 0;
193
19.1k
                              }
194
15.2M
                          }
195
24.4M
                      }
196
33.6M
                  }
197
198
59.6M
                for (size_t j = 0; j < sortbuf_count; j++)
199
26.0M
                  {
200
26.0M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
26.0M
                    if (length < allocated)
204
16.9M
                      {
205
16.9M
                        int ret =
206
16.9M
                          U_UCTOMB (result + length, muc, allocated - length);
207
16.9M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
16.9M
                        if (ret >= 0)
213
16.9M
                          {
214
16.9M
                            length += ret;
215
16.9M
                            goto done_appending;
216
16.9M
                          }
217
16.9M
                      }
218
9.15M
                    {
219
9.15M
                      size_t old_allocated = allocated;
220
9.15M
                      size_t new_allocated = 2 * old_allocated;
221
9.15M
                      if (new_allocated < 64)
222
9.14M
                        new_allocated = 64;
223
9.15M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
9.15M
                      {
226
9.15M
                        UNIT *larger_result;
227
9.15M
                        if (result == NULL)
228
9.14M
                          {
229
9.14M
                            larger_result =
230
9.14M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
9.14M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
9.14M
                          }
237
15.0k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
15.0k
                        else
249
15.0k
                          {
250
15.0k
                            larger_result =
251
15.0k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
15.0k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
15.0k
                          }
258
9.15M
                        result = larger_result;
259
9.15M
                        allocated = new_allocated;
260
9.15M
                        {
261
9.15M
                          int ret =
262
9.15M
                            U_UCTOMB (result + length, muc, allocated - length);
263
9.15M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
9.15M
                          if (ret < 0)
269
0
                            abort ();
270
9.15M
                          length += ret;
271
9.15M
                          goto done_appending;
272
9.15M
                        }
273
9.15M
                      }
274
9.15M
                    }
275
26.0M
                   done_appending: ;
276
26.0M
                  }
277
278
                /* sortbuf is now empty.  */
279
33.6M
                sortbuf_count = 0;
280
33.6M
              }
281
282
36.8M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
9.20M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
27.6M
            if (sortbuf_count == sortbuf_allocated)
288
1.98k
              {
289
1.98k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.98k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.98k
                struct ucs4_with_ccc *new_sortbuf =
293
1.98k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.98k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.98k
                memcpy (new_sortbuf, sortbuf,
300
1.98k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.98k
                if (sortbuf != sortbuf_preallocated)
302
1.98k
                  free (sortbuf);
303
1.98k
                sortbuf = new_sortbuf;
304
1.98k
              }
305
27.6M
            sortbuf[sortbuf_count].code = uc;
306
27.6M
            sortbuf[sortbuf_count].ccc = ccc;
307
27.6M
            sortbuf_count++;
308
309
27.6M
            i++;
310
27.6M
          }
311
312
30.6M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
9.20M
          break;
315
316
21.4M
        s += count;
317
21.4M
      }
318
9.20M
  }
319
320
9.20M
  if (length == 0)
321
65.3k
    {
322
65.3k
      if (result == NULL)
323
65.3k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
65.3k
          result = (UNIT *) malloc (1);
326
65.3k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
65.3k
        }
332
65.3k
    }
333
9.14M
  else if (result != resultbuf && length < allocated)
334
9.13M
    {
335
      /* Shrink the allocated memory if possible.  */
336
9.13M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
9.13M
      if (memory != NULL)
338
9.13M
        result = memory;
339
9.13M
    }
340
341
9.20M
  if (sortbuf_count > 0)
342
0
    abort ();
343
9.20M
  if (sortbuf != sortbuf_preallocated)
344
9.20M
    free (sortbuf);
345
346
9.20M
  *lengthp = length;
347
9.20M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
9.20M
}
u8_normalize
Line
Count
Source
21
3.08M
{
22
3.08M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
3.08M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
3.08M
  UNIT *result;
27
3.08M
  size_t allocated;
28
3.08M
  if (resultbuf == NULL)
29
3.08M
    {
30
3.08M
      result = NULL;
31
3.08M
      allocated = 0;
32
3.08M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
3.08M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
3.08M
  #define SORTBUF_PREALLOCATED 64
42
3.08M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
3.08M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
3.08M
    sortbuf_preallocated;
45
3.08M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
3.08M
  size_t sortbuf_count = 0;
47
48
3.08M
  {
49
3.08M
    const UNIT *s_end = s + n;
50
51
3.08M
    for (;;)
52
15.6M
      {
53
15.6M
        int count;
54
15.6M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
15.6M
        int decomposed_count;
56
57
15.6M
        if (s < s_end)
58
12.5M
          {
59
            /* Fetch the next character.  */
60
12.5M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
12.5M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
34.3M
            for (int curr = 0; curr < decomposed_count; )
70
21.8M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
21.8M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
21.8M
                int curr_decomposed_count;
75
76
21.8M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
21.8M
                if (curr_decomposed_count >= 0)
78
3.11M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
3.11M
                    int shift = curr_decomposed_count - 1;
83
84
3.11M
                    if (shift < 0)
85
0
                      abort ();
86
3.11M
                    if (shift > 0)
87
2.76M
                      {
88
2.76M
                        decomposed_count += shift;
89
2.76M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.78M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
17.7k
                          decomposed[j + shift] = decomposed[j];
93
2.76M
                      }
94
12.4M
                    for (; shift >= 0; shift--)
95
9.29M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
3.11M
                  }
97
18.7M
                else
98
18.7M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
18.7M
                    curr++;
101
18.7M
                  }
102
21.8M
              }
103
12.5M
          }
104
3.08M
        else
105
3.08M
          {
106
3.08M
            count = 0;
107
3.08M
            decomposed_count = 0;
108
3.08M
          }
109
110
15.6M
        int i = 0;
111
15.6M
        for (;;)
112
34.3M
          {
113
34.3M
            ucs4_t uc;
114
34.3M
            int ccc;
115
116
34.3M
            if (s < s_end)
117
31.2M
              {
118
                /* Fetch the next character from the decomposition.  */
119
31.2M
                if (i == decomposed_count)
120
12.5M
                  break;
121
18.7M
                uc = decomposed[i];
122
18.7M
                ccc = uc_combining_class (uc);
123
18.7M
              }
124
3.08M
            else
125
3.08M
              {
126
                /* End of string reached.  */
127
3.08M
                uc = 0;
128
3.08M
                ccc = 0;
129
3.08M
              }
130
131
21.8M
            if (ccc == 0)
132
18.6M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
18.6M
                if (sortbuf_count > 1)
136
1.56M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.56M
                                                           sortbuf + sortbuf_count);
138
139
18.6M
                if (composer != NULL)
140
18.6M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
18.6M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
15.6M
                      {
163
18.7M
                        for (size_t j = 1; j < sortbuf_count; )
164
3.10M
                          {
165
3.10M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.58M
                              {
167
1.58M
                                ucs4_t combined =
168
1.58M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.58M
                                if (combined)
170
1.54M
                                  {
171
1.54M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.71M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
2.16M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.54M
                                    sortbuf_count--;
176
1.54M
                                    continue;
177
1.54M
                                  }
178
1.58M
                              }
179
1.55M
                            j++;
180
1.55M
                          }
181
15.6M
                        if (s < s_end && sortbuf_count == 1)
182
12.4M
                          {
183
12.4M
                            ucs4_t combined =
184
12.4M
                              composer (sortbuf[0].code, uc);
185
12.4M
                            if (combined)
186
8.87k
                              {
187
8.87k
                                uc = combined;
188
8.87k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
8.87k
                                sortbuf_count = 0;
193
8.87k
                              }
194
12.4M
                          }
195
15.6M
                      }
196
18.6M
                  }
197
198
35.8M
                for (size_t j = 0; j < sortbuf_count; j++)
199
17.1M
                  {
200
17.1M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
17.1M
                    if (length < allocated)
204
14.0M
                      {
205
14.0M
                        int ret =
206
14.0M
                          U_UCTOMB (result + length, muc, allocated - length);
207
14.0M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
14.0M
                        if (ret >= 0)
213
14.0M
                          {
214
14.0M
                            length += ret;
215
14.0M
                            goto done_appending;
216
14.0M
                          }
217
14.0M
                      }
218
3.09M
                    {
219
3.09M
                      size_t old_allocated = allocated;
220
3.09M
                      size_t new_allocated = 2 * old_allocated;
221
3.09M
                      if (new_allocated < 64)
222
3.08M
                        new_allocated = 64;
223
3.09M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
3.09M
                      {
226
3.09M
                        UNIT *larger_result;
227
3.09M
                        if (result == NULL)
228
3.08M
                          {
229
3.08M
                            larger_result =
230
3.08M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
3.08M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
3.08M
                          }
237
12.2k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
12.2k
                        else
249
12.2k
                          {
250
12.2k
                            larger_result =
251
12.2k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
12.2k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
12.2k
                          }
258
3.09M
                        result = larger_result;
259
3.09M
                        allocated = new_allocated;
260
3.09M
                        {
261
3.09M
                          int ret =
262
3.09M
                            U_UCTOMB (result + length, muc, allocated - length);
263
3.09M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
3.09M
                          if (ret < 0)
269
0
                            abort ();
270
3.09M
                          length += ret;
271
3.09M
                          goto done_appending;
272
3.09M
                        }
273
3.09M
                      }
274
3.09M
                    }
275
17.1M
                   done_appending: ;
276
17.1M
                  }
277
278
                /* sortbuf is now empty.  */
279
18.6M
                sortbuf_count = 0;
280
18.6M
              }
281
282
21.8M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
3.08M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
18.7M
            if (sortbuf_count == sortbuf_allocated)
288
1.16k
              {
289
1.16k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.16k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.16k
                struct ucs4_with_ccc *new_sortbuf =
293
1.16k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.16k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.16k
                memcpy (new_sortbuf, sortbuf,
300
1.16k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.16k
                if (sortbuf != sortbuf_preallocated)
302
1.16k
                  free (sortbuf);
303
1.16k
                sortbuf = new_sortbuf;
304
1.16k
              }
305
18.7M
            sortbuf[sortbuf_count].code = uc;
306
18.7M
            sortbuf[sortbuf_count].ccc = ccc;
307
18.7M
            sortbuf_count++;
308
309
18.7M
            i++;
310
18.7M
          }
311
312
15.6M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
3.08M
          break;
315
316
12.5M
        s += count;
317
12.5M
      }
318
3.08M
  }
319
320
3.08M
  if (length == 0)
321
0
    {
322
0
      if (result == NULL)
323
0
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
0
          result = (UNIT *) malloc (1);
326
0
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
0
        }
332
0
    }
333
3.08M
  else if (result != resultbuf && length < allocated)
334
3.08M
    {
335
      /* Shrink the allocated memory if possible.  */
336
3.08M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
3.08M
      if (memory != NULL)
338
3.08M
        result = memory;
339
3.08M
    }
340
341
3.08M
  if (sortbuf_count > 0)
342
0
    abort ();
343
3.08M
  if (sortbuf != sortbuf_preallocated)
344
3.08M
    free (sortbuf);
345
346
3.08M
  *lengthp = length;
347
3.08M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
3.08M
}
u32_normalize
Line
Count
Source
21
6.12M
{
22
6.12M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
6.12M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
6.12M
  UNIT *result;
27
6.12M
  size_t allocated;
28
6.12M
  if (resultbuf == NULL)
29
6.12M
    {
30
6.12M
      result = NULL;
31
6.12M
      allocated = 0;
32
6.12M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
6.12M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
6.12M
  #define SORTBUF_PREALLOCATED 64
42
6.12M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
6.12M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
6.12M
    sortbuf_preallocated;
45
6.12M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
6.12M
  size_t sortbuf_count = 0;
47
48
6.12M
  {
49
6.12M
    const UNIT *s_end = s + n;
50
51
6.12M
    for (;;)
52
15.0M
      {
53
15.0M
        int count;
54
15.0M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
15.0M
        int decomposed_count;
56
57
15.0M
        if (s < s_end)
58
8.89M
          {
59
            /* Fetch the next character.  */
60
8.89M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
8.89M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
17.8M
            for (int curr = 0; curr < decomposed_count; )
70
8.95M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
8.95M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
8.95M
                int curr_decomposed_count;
75
76
8.95M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
8.95M
                if (curr_decomposed_count >= 0)
78
32.4k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
32.4k
                    int shift = curr_decomposed_count - 1;
83
84
32.4k
                    if (shift < 0)
85
0
                      abort ();
86
32.4k
                    if (shift > 0)
87
32.1k
                      {
88
32.1k
                        decomposed_count += shift;
89
32.1k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
42.1k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
9.92k
                          decomposed[j + shift] = decomposed[j];
93
32.1k
                      }
94
97.0k
                    for (; shift >= 0; shift--)
95
64.6k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
32.4k
                  }
97
8.92M
                else
98
8.92M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
8.92M
                    curr++;
101
8.92M
                  }
102
8.95M
              }
103
8.89M
          }
104
6.12M
        else
105
6.12M
          {
106
6.12M
            count = 0;
107
6.12M
            decomposed_count = 0;
108
6.12M
          }
109
110
15.0M
        int i = 0;
111
15.0M
        for (;;)
112
23.9M
          {
113
23.9M
            ucs4_t uc;
114
23.9M
            int ccc;
115
116
23.9M
            if (s < s_end)
117
17.8M
              {
118
                /* Fetch the next character from the decomposition.  */
119
17.8M
                if (i == decomposed_count)
120
8.89M
                  break;
121
8.92M
                uc = decomposed[i];
122
8.92M
                ccc = uc_combining_class (uc);
123
8.92M
              }
124
6.12M
            else
125
6.12M
              {
126
                /* End of string reached.  */
127
6.12M
                uc = 0;
128
6.12M
                ccc = 0;
129
6.12M
              }
130
131
15.0M
            if (ccc == 0)
132
14.9M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
14.9M
                if (sortbuf_count > 1)
136
23.3k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
23.3k
                                                           sortbuf + sortbuf_count);
138
139
14.9M
                if (composer != NULL)
140
14.9M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
14.9M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
8.81M
                      {
163
8.91M
                        for (size_t j = 1; j < sortbuf_count; )
164
98.3k
                          {
165
98.3k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
33.9k
                              {
167
33.9k
                                ucs4_t combined =
168
33.9k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
33.9k
                                if (combined)
170
20.1k
                                  {
171
20.1k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
33.1k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
12.9k
                                      sortbuf[k - 1] = sortbuf[k];
175
20.1k
                                    sortbuf_count--;
176
20.1k
                                    continue;
177
20.1k
                                  }
178
33.9k
                              }
179
78.1k
                            j++;
180
78.1k
                          }
181
8.81M
                        if (s < s_end && sortbuf_count == 1)
182
2.75M
                          {
183
2.75M
                            ucs4_t combined =
184
2.75M
                              composer (sortbuf[0].code, uc);
185
2.75M
                            if (combined)
186
10.2k
                              {
187
10.2k
                                uc = combined;
188
10.2k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
10.2k
                                sortbuf_count = 0;
193
10.2k
                              }
194
2.75M
                          }
195
8.81M
                      }
196
14.9M
                  }
197
198
23.8M
                for (size_t j = 0; j < sortbuf_count; j++)
199
8.89M
                  {
200
8.89M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
8.89M
                    if (length < allocated)
204
2.83M
                      {
205
2.83M
                        int ret =
206
2.83M
                          U_UCTOMB (result + length, muc, allocated - length);
207
2.83M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
2.83M
                        if (ret >= 0)
213
2.83M
                          {
214
2.83M
                            length += ret;
215
2.83M
                            goto done_appending;
216
2.83M
                          }
217
2.83M
                      }
218
6.05M
                    {
219
6.05M
                      size_t old_allocated = allocated;
220
6.05M
                      size_t new_allocated = 2 * old_allocated;
221
6.05M
                      if (new_allocated < 64)
222
6.05M
                        new_allocated = 64;
223
6.05M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
6.05M
                      {
226
6.05M
                        UNIT *larger_result;
227
6.05M
                        if (result == NULL)
228
6.05M
                          {
229
6.05M
                            larger_result =
230
6.05M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
6.05M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
6.05M
                          }
237
2.75k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
2.75k
                        else
249
2.75k
                          {
250
2.75k
                            larger_result =
251
2.75k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
2.75k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
2.75k
                          }
258
6.05M
                        result = larger_result;
259
6.05M
                        allocated = new_allocated;
260
6.05M
                        {
261
6.05M
                          int ret =
262
6.05M
                            U_UCTOMB (result + length, muc, allocated - length);
263
6.05M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
6.05M
                          if (ret < 0)
269
0
                            abort ();
270
6.05M
                          length += ret;
271
6.05M
                          goto done_appending;
272
6.05M
                        }
273
6.05M
                      }
274
6.05M
                    }
275
8.89M
                   done_appending: ;
276
8.89M
                  }
277
278
                /* sortbuf is now empty.  */
279
14.9M
                sortbuf_count = 0;
280
14.9M
              }
281
282
15.0M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
6.12M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
8.92M
            if (sortbuf_count == sortbuf_allocated)
288
818
              {
289
818
                sortbuf_allocated = 2 * sortbuf_allocated;
290
818
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
818
                struct ucs4_with_ccc *new_sortbuf =
293
818
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
818
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
818
                memcpy (new_sortbuf, sortbuf,
300
818
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
818
                if (sortbuf != sortbuf_preallocated)
302
818
                  free (sortbuf);
303
818
                sortbuf = new_sortbuf;
304
818
              }
305
8.92M
            sortbuf[sortbuf_count].code = uc;
306
8.92M
            sortbuf[sortbuf_count].ccc = ccc;
307
8.92M
            sortbuf_count++;
308
309
8.92M
            i++;
310
8.92M
          }
311
312
15.0M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
6.12M
          break;
315
316
8.89M
        s += count;
317
8.89M
      }
318
6.12M
  }
319
320
6.12M
  if (length == 0)
321
65.3k
    {
322
65.3k
      if (result == NULL)
323
65.3k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
65.3k
          result = (UNIT *) malloc (1);
326
65.3k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
65.3k
        }
332
65.3k
    }
333
6.05M
  else if (result != resultbuf && length < allocated)
334
6.05M
    {
335
      /* Shrink the allocated memory if possible.  */
336
6.05M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
6.05M
      if (memory != NULL)
338
6.05M
        result = memory;
339
6.05M
    }
340
341
6.12M
  if (sortbuf_count > 0)
342
0
    abort ();
343
6.12M
  if (sortbuf != sortbuf_preallocated)
344
6.12M
    free (sortbuf);
345
346
6.12M
  *lengthp = length;
347
6.12M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
6.12M
}