Coverage Report

Created: 2026-02-05 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2026 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
7.34M
{
22
7.34M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
7.34M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
7.34M
  UNIT *result;
27
7.34M
  size_t allocated;
28
7.34M
  if (resultbuf == NULL)
29
7.34M
    {
30
7.34M
      result = NULL;
31
7.34M
      allocated = 0;
32
7.34M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
7.34M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
7.34M
  #define SORTBUF_PREALLOCATED 64
42
7.34M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
7.34M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
7.34M
    sortbuf_preallocated;
45
7.34M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
7.34M
  size_t sortbuf_count = 0;
47
48
7.34M
  {
49
7.34M
    const UNIT *s_end = s + n;
50
51
7.34M
    for (;;)
52
25.5M
      {
53
25.5M
        int count;
54
25.5M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
25.5M
        int decomposed_count;
56
57
25.5M
        if (s < s_end)
58
18.2M
          {
59
            /* Fetch the next character.  */
60
18.2M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
18.2M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
45.3M
            for (int curr = 0; curr < decomposed_count; )
70
27.1M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
27.1M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
27.1M
                int curr_decomposed_count;
75
76
27.1M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
27.1M
                if (curr_decomposed_count >= 0)
78
2.89M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
2.89M
                    int shift = curr_decomposed_count - 1;
83
84
2.89M
                    if (shift < 0)
85
0
                      abort ();
86
2.89M
                    if (shift > 0)
87
2.51M
                      {
88
2.51M
                        decomposed_count += shift;
89
2.51M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.54M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
27.0k
                          decomposed[j + shift] = decomposed[j];
93
2.51M
                      }
94
11.8M
                    for (; shift >= 0; shift--)
95
8.93M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
2.89M
                  }
97
24.2M
                else
98
24.2M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
24.2M
                    curr++;
101
24.2M
                  }
102
27.1M
              }
103
18.2M
          }
104
7.34M
        else
105
7.34M
          {
106
7.34M
            count = 0;
107
7.34M
            decomposed_count = 0;
108
7.34M
          }
109
110
25.5M
        int i = 0;
111
25.5M
        for (;;)
112
49.7M
          {
113
49.7M
            ucs4_t uc;
114
49.7M
            int ccc;
115
116
49.7M
            if (s < s_end)
117
42.4M
              {
118
                /* Fetch the next character from the decomposition.  */
119
42.4M
                if (i == decomposed_count)
120
18.2M
                  break;
121
24.2M
                uc = decomposed[i];
122
24.2M
                ccc = uc_combining_class (uc);
123
24.2M
              }
124
7.34M
            else
125
7.34M
              {
126
                /* End of string reached.  */
127
7.34M
                uc = 0;
128
7.34M
                ccc = 0;
129
7.34M
              }
130
131
31.5M
            if (ccc == 0)
132
28.7M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
28.7M
                if (sortbuf_count > 1)
136
1.44M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.44M
                                                           sortbuf + sortbuf_count);
138
139
28.7M
                if (composer != NULL)
140
28.7M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
28.7M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
21.3M
                      {
163
24.2M
                        for (size_t j = 1; j < sortbuf_count; )
164
2.82M
                          {
165
2.82M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.47M
                              {
167
1.47M
                                ucs4_t combined =
168
1.47M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.47M
                                if (combined)
170
1.43M
                                  {
171
1.43M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.30M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
1.87M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.43M
                                    sortbuf_count--;
176
1.43M
                                    continue;
177
1.43M
                                  }
178
1.47M
                              }
179
1.39M
                            j++;
180
1.39M
                          }
181
21.3M
                        if (s < s_end && sortbuf_count == 1)
182
14.0M
                          {
183
14.0M
                            ucs4_t combined =
184
14.0M
                              composer (sortbuf[0].code, uc);
185
14.0M
                            if (combined)
186
20.0k
                              {
187
20.0k
                                uc = combined;
188
20.0k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
20.0k
                                sortbuf_count = 0;
193
20.0k
                              }
194
14.0M
                          }
195
21.3M
                      }
196
28.7M
                  }
197
198
51.5M
                for (size_t j = 0; j < sortbuf_count; j++)
199
22.7M
                  {
200
22.7M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
22.7M
                    if (length < allocated)
204
15.5M
                      {
205
15.5M
                        int ret =
206
15.5M
                          U_UCTOMB (result + length, muc, allocated - length);
207
15.5M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
15.5M
                        if (ret >= 0)
213
15.5M
                          {
214
15.5M
                            length += ret;
215
15.5M
                            goto done_appending;
216
15.5M
                          }
217
15.5M
                      }
218
7.28M
                    {
219
7.28M
                      size_t old_allocated = allocated;
220
7.28M
                      size_t new_allocated = 2 * old_allocated;
221
7.28M
                      if (new_allocated < 64)
222
7.27M
                        new_allocated = 64;
223
7.28M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
7.28M
                      {
226
7.28M
                        UNIT *larger_result;
227
7.28M
                        if (result == NULL)
228
7.27M
                          {
229
7.27M
                            larger_result =
230
7.27M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
7.27M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
7.27M
                          }
237
14.5k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
14.5k
                        else
249
14.5k
                          {
250
14.5k
                            larger_result =
251
14.5k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
14.5k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
14.5k
                          }
258
7.28M
                        result = larger_result;
259
7.28M
                        allocated = new_allocated;
260
7.28M
                        {
261
7.28M
                          int ret =
262
7.28M
                            U_UCTOMB (result + length, muc, allocated - length);
263
7.28M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
7.28M
                          if (ret < 0)
269
0
                            abort ();
270
7.28M
                          length += ret;
271
7.28M
                          goto done_appending;
272
7.28M
                        }
273
7.28M
                      }
274
7.28M
                    }
275
22.7M
                   done_appending: ;
276
22.7M
                  }
277
278
                /* sortbuf is now empty.  */
279
28.7M
                sortbuf_count = 0;
280
28.7M
              }
281
282
31.5M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
7.34M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
24.2M
            if (sortbuf_count == sortbuf_allocated)
288
1.93k
              {
289
1.93k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.93k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.93k
                struct ucs4_with_ccc *new_sortbuf =
293
1.93k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.93k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.93k
                memcpy (new_sortbuf, sortbuf,
300
1.93k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.93k
                if (sortbuf != sortbuf_preallocated)
302
1.93k
                  free (sortbuf);
303
1.93k
                sortbuf = new_sortbuf;
304
1.93k
              }
305
24.2M
            sortbuf[sortbuf_count].code = uc;
306
24.2M
            sortbuf[sortbuf_count].ccc = ccc;
307
24.2M
            sortbuf_count++;
308
309
24.2M
            i++;
310
24.2M
          }
311
312
25.5M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
7.34M
          break;
315
316
18.2M
        s += count;
317
18.2M
      }
318
7.34M
  }
319
320
7.34M
  if (length == 0)
321
69.0k
    {
322
69.0k
      if (result == NULL)
323
69.0k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
69.0k
          result = (UNIT *) malloc (1);
326
69.0k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
69.0k
        }
332
69.0k
    }
333
7.27M
  else if (result != resultbuf && length < allocated)
334
7.27M
    {
335
      /* Shrink the allocated memory if possible.  */
336
7.27M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
7.27M
      if (memory != NULL)
338
7.27M
        result = memory;
339
7.27M
    }
340
341
7.34M
  if (sortbuf_count > 0)
342
0
    abort ();
343
7.34M
  if (sortbuf != sortbuf_preallocated)
344
7.34M
    free (sortbuf);
345
346
7.34M
  *lengthp = length;
347
7.34M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
7.34M
}
u8_normalize
Line
Count
Source
21
2.45M
{
22
2.45M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
2.45M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
2.45M
  UNIT *result;
27
2.45M
  size_t allocated;
28
2.45M
  if (resultbuf == NULL)
29
2.45M
    {
30
2.45M
      result = NULL;
31
2.45M
      allocated = 0;
32
2.45M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
2.45M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
2.45M
  #define SORTBUF_PREALLOCATED 64
42
2.45M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
2.45M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
2.45M
    sortbuf_preallocated;
45
2.45M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
2.45M
  size_t sortbuf_count = 0;
47
48
2.45M
  {
49
2.45M
    const UNIT *s_end = s + n;
50
51
2.45M
    for (;;)
52
13.4M
      {
53
13.4M
        int count;
54
13.4M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
13.4M
        int decomposed_count;
56
57
13.4M
        if (s < s_end)
58
11.0M
          {
59
            /* Fetch the next character.  */
60
11.0M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
11.0M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
30.8M
            for (int curr = 0; curr < decomposed_count; )
70
19.8M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
19.8M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
19.8M
                int curr_decomposed_count;
75
76
19.8M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
19.8M
                if (curr_decomposed_count >= 0)
78
2.86M
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
2.86M
                    int shift = curr_decomposed_count - 1;
83
84
2.86M
                    if (shift < 0)
85
0
                      abort ();
86
2.86M
                    if (shift > 0)
87
2.48M
                      {
88
2.48M
                        decomposed_count += shift;
89
2.48M
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
2.50M
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
16.4k
                          decomposed[j + shift] = decomposed[j];
93
2.48M
                      }
94
11.7M
                    for (; shift >= 0; shift--)
95
8.86M
                      decomposed[curr + shift] = curr_decomposed[shift];
96
2.86M
                  }
97
17.0M
                else
98
17.0M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
17.0M
                    curr++;
101
17.0M
                  }
102
19.8M
              }
103
11.0M
          }
104
2.45M
        else
105
2.45M
          {
106
2.45M
            count = 0;
107
2.45M
            decomposed_count = 0;
108
2.45M
          }
109
110
13.4M
        int i = 0;
111
13.4M
        for (;;)
112
30.4M
          {
113
30.4M
            ucs4_t uc;
114
30.4M
            int ccc;
115
116
30.4M
            if (s < s_end)
117
28.0M
              {
118
                /* Fetch the next character from the decomposition.  */
119
28.0M
                if (i == decomposed_count)
120
11.0M
                  break;
121
17.0M
                uc = decomposed[i];
122
17.0M
                ccc = uc_combining_class (uc);
123
17.0M
              }
124
2.45M
            else
125
2.45M
              {
126
                /* End of string reached.  */
127
2.45M
                uc = 0;
128
2.45M
                ccc = 0;
129
2.45M
              }
130
131
19.4M
            if (ccc == 0)
132
16.7M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
16.7M
                if (sortbuf_count > 1)
136
1.41M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
1.41M
                                                           sortbuf + sortbuf_count);
138
139
16.7M
                if (composer != NULL)
140
16.7M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
16.7M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
14.2M
                      {
163
16.9M
                        for (size_t j = 1; j < sortbuf_count; )
164
2.73M
                          {
165
2.73M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
1.43M
                              {
167
1.43M
                                ucs4_t combined =
168
1.43M
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
1.43M
                                if (combined)
170
1.41M
                                  {
171
1.41M
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
3.26M
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
1.85M
                                      sortbuf[k - 1] = sortbuf[k];
175
1.41M
                                    sortbuf_count--;
176
1.41M
                                    continue;
177
1.41M
                                  }
178
1.43M
                              }
179
1.32M
                            j++;
180
1.32M
                          }
181
14.2M
                        if (s < s_end && sortbuf_count == 1)
182
11.7M
                          {
183
11.7M
                            ucs4_t combined =
184
11.7M
                              composer (sortbuf[0].code, uc);
185
11.7M
                            if (combined)
186
8.37k
                              {
187
8.37k
                                uc = combined;
188
8.37k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
8.37k
                                sortbuf_count = 0;
193
8.37k
                              }
194
11.7M
                          }
195
14.2M
                      }
196
16.7M
                  }
197
198
32.3M
                for (size_t j = 0; j < sortbuf_count; j++)
199
15.5M
                  {
200
15.5M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
15.5M
                    if (length < allocated)
204
13.1M
                      {
205
13.1M
                        int ret =
206
13.1M
                          U_UCTOMB (result + length, muc, allocated - length);
207
13.1M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
13.1M
                        if (ret >= 0)
213
13.1M
                          {
214
13.1M
                            length += ret;
215
13.1M
                            goto done_appending;
216
13.1M
                          }
217
13.1M
                      }
218
2.46M
                    {
219
2.46M
                      size_t old_allocated = allocated;
220
2.46M
                      size_t new_allocated = 2 * old_allocated;
221
2.46M
                      if (new_allocated < 64)
222
2.45M
                        new_allocated = 64;
223
2.46M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
2.46M
                      {
226
2.46M
                        UNIT *larger_result;
227
2.46M
                        if (result == NULL)
228
2.45M
                          {
229
2.45M
                            larger_result =
230
2.45M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
2.45M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
2.45M
                          }
237
11.8k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
11.8k
                        else
249
11.8k
                          {
250
11.8k
                            larger_result =
251
11.8k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
11.8k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
11.8k
                          }
258
2.46M
                        result = larger_result;
259
2.46M
                        allocated = new_allocated;
260
2.46M
                        {
261
2.46M
                          int ret =
262
2.46M
                            U_UCTOMB (result + length, muc, allocated - length);
263
2.46M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
2.46M
                          if (ret < 0)
269
0
                            abort ();
270
2.46M
                          length += ret;
271
2.46M
                          goto done_appending;
272
2.46M
                        }
273
2.46M
                      }
274
2.46M
                    }
275
15.5M
                   done_appending: ;
276
15.5M
                  }
277
278
                /* sortbuf is now empty.  */
279
16.7M
                sortbuf_count = 0;
280
16.7M
              }
281
282
19.4M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
2.45M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
17.0M
            if (sortbuf_count == sortbuf_allocated)
288
1.12k
              {
289
1.12k
                sortbuf_allocated = 2 * sortbuf_allocated;
290
1.12k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
1.12k
                struct ucs4_with_ccc *new_sortbuf =
293
1.12k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
1.12k
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
1.12k
                memcpy (new_sortbuf, sortbuf,
300
1.12k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
1.12k
                if (sortbuf != sortbuf_preallocated)
302
1.12k
                  free (sortbuf);
303
1.12k
                sortbuf = new_sortbuf;
304
1.12k
              }
305
17.0M
            sortbuf[sortbuf_count].code = uc;
306
17.0M
            sortbuf[sortbuf_count].ccc = ccc;
307
17.0M
            sortbuf_count++;
308
309
17.0M
            i++;
310
17.0M
          }
311
312
13.4M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
2.45M
          break;
315
316
11.0M
        s += count;
317
11.0M
      }
318
2.45M
  }
319
320
2.45M
  if (length == 0)
321
0
    {
322
0
      if (result == NULL)
323
0
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
0
          result = (UNIT *) malloc (1);
326
0
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
0
        }
332
0
    }
333
2.45M
  else if (result != resultbuf && length < allocated)
334
2.45M
    {
335
      /* Shrink the allocated memory if possible.  */
336
2.45M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
2.45M
      if (memory != NULL)
338
2.45M
        result = memory;
339
2.45M
    }
340
341
2.45M
  if (sortbuf_count > 0)
342
0
    abort ();
343
2.45M
  if (sortbuf != sortbuf_preallocated)
344
2.45M
    free (sortbuf);
345
346
2.45M
  *lengthp = length;
347
2.45M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
2.45M
}
u32_normalize
Line
Count
Source
21
4.89M
{
22
4.89M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
4.89M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
4.89M
  UNIT *result;
27
4.89M
  size_t allocated;
28
4.89M
  if (resultbuf == NULL)
29
4.89M
    {
30
4.89M
      result = NULL;
31
4.89M
      allocated = 0;
32
4.89M
    }
33
0
  else
34
0
    {
35
0
      result = resultbuf;
36
0
      allocated = *lengthp;
37
0
    }
38
4.89M
  size_t length = 0;
39
40
  /* The buffer for sorting.  */
41
4.89M
  #define SORTBUF_PREALLOCATED 64
42
4.89M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
43
4.89M
  struct ucs4_with_ccc *sortbuf = /* array of size 2 * sortbuf_allocated */
44
4.89M
    sortbuf_preallocated;
45
4.89M
  size_t sortbuf_allocated = SORTBUF_PREALLOCATED;
46
4.89M
  size_t sortbuf_count = 0;
47
48
4.89M
  {
49
4.89M
    const UNIT *s_end = s + n;
50
51
4.89M
    for (;;)
52
12.0M
      {
53
12.0M
        int count;
54
12.0M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
55
12.0M
        int decomposed_count;
56
57
12.0M
        if (s < s_end)
58
7.19M
          {
59
            /* Fetch the next character.  */
60
7.19M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
61
7.19M
            decomposed_count = 1;
62
63
            /* Decompose it, recursively.
64
               It would be possible to precompute the recursive decomposition
65
               and store it in a table.  But this would significantly increase
66
               the size of the decomposition tables, because for example for
67
               U+1FC1 the recursive canonical decomposition and the recursive
68
               compatibility decomposition are different.  */
69
14.4M
            for (int curr = 0; curr < decomposed_count; )
70
7.26M
              {
71
                /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
72
                   all elements are atomic.  */
73
7.26M
                ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
74
7.26M
                int curr_decomposed_count;
75
76
7.26M
                curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
77
7.26M
                if (curr_decomposed_count >= 0)
78
33.5k
                  {
79
                    /* Move curr_decomposed[0..curr_decomposed_count-1] over
80
                       decomposed[curr], making room.  It's not worth using
81
                       memcpy() here, since the counts are so small.  */
82
33.5k
                    int shift = curr_decomposed_count - 1;
83
84
33.5k
                    if (shift < 0)
85
0
                      abort ();
86
33.5k
                    if (shift > 0)
87
33.2k
                      {
88
33.2k
                        decomposed_count += shift;
89
33.2k
                        if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
90
0
                          abort ();
91
43.9k
                        for (int j = decomposed_count - 1 - shift; j > curr; j--)
92
10.6k
                          decomposed[j + shift] = decomposed[j];
93
33.2k
                      }
94
100k
                    for (; shift >= 0; shift--)
95
66.8k
                      decomposed[curr + shift] = curr_decomposed[shift];
96
33.5k
                  }
97
7.23M
                else
98
7.23M
                  {
99
                    /* decomposed[curr] is atomic.  */
100
7.23M
                    curr++;
101
7.23M
                  }
102
7.26M
              }
103
7.19M
          }
104
4.89M
        else
105
4.89M
          {
106
4.89M
            count = 0;
107
4.89M
            decomposed_count = 0;
108
4.89M
          }
109
110
12.0M
        int i = 0;
111
12.0M
        for (;;)
112
19.3M
          {
113
19.3M
            ucs4_t uc;
114
19.3M
            int ccc;
115
116
19.3M
            if (s < s_end)
117
14.4M
              {
118
                /* Fetch the next character from the decomposition.  */
119
14.4M
                if (i == decomposed_count)
120
7.19M
                  break;
121
7.23M
                uc = decomposed[i];
122
7.23M
                ccc = uc_combining_class (uc);
123
7.23M
              }
124
4.89M
            else
125
4.89M
              {
126
                /* End of string reached.  */
127
4.89M
                uc = 0;
128
4.89M
                ccc = 0;
129
4.89M
              }
130
131
12.1M
            if (ccc == 0)
132
12.0M
              {
133
                /* Apply the canonical ordering algorithm to the accumulated
134
                   sequence of characters.  */
135
12.0M
                if (sortbuf_count > 1)
136
23.0k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
137
23.0k
                                                           sortbuf + sortbuf_count);
138
139
12.0M
                if (composer != NULL)
140
12.0M
                  {
141
                    /* Attempt to combine decomposed characters, as specified
142
                       in the Unicode Standard Annex #15 "Unicode Normalization
143
                       Forms".  We need to check
144
                         1. whether the first accumulated character is a
145
                            "starter" (i.e. has ccc = 0).  This is usually the
146
                            case.  But when the string starts with a
147
                            non-starter, the sortbuf also starts with a
148
                            non-starter.  Btw, this check could also be
149
                            omitted, because the composition table has only
150
                            entries (code1, code2) for which code1 is a
151
                            starter; if the first accumulated character is not
152
                            a starter, no lookup will succeed.
153
                         2. If the sortbuf has more than one character, check
154
                            for each of these characters that are not "blocked"
155
                            from the starter (i.e. have a ccc that is higher
156
                            than the ccc of the previous character) whether it
157
                            can be combined with the first character.
158
                         3. If only one character is left in sortbuf, check
159
                            whether it can be combined with the next character
160
                            (also a starter).  */
161
12.0M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
162
7.12M
                      {
163
7.22M
                        for (size_t j = 1; j < sortbuf_count; )
164
98.0k
                          {
165
98.0k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
166
33.7k
                              {
167
33.7k
                                ucs4_t combined =
168
33.7k
                                  composer (sortbuf[0].code, sortbuf[j].code);
169
33.7k
                                if (combined)
170
19.8k
                                  {
171
19.8k
                                    sortbuf[0].code = combined;
172
                                    /* sortbuf[0].ccc = 0, still valid.  */
173
33.3k
                                    for (size_t k = j + 1; k < sortbuf_count; k++)
174
13.5k
                                      sortbuf[k - 1] = sortbuf[k];
175
19.8k
                                    sortbuf_count--;
176
19.8k
                                    continue;
177
19.8k
                                  }
178
33.7k
                              }
179
78.2k
                            j++;
180
78.2k
                          }
181
7.12M
                        if (s < s_end && sortbuf_count == 1)
182
2.29M
                          {
183
2.29M
                            ucs4_t combined =
184
2.29M
                              composer (sortbuf[0].code, uc);
185
2.29M
                            if (combined)
186
11.7k
                              {
187
11.7k
                                uc = combined;
188
11.7k
                                ccc = 0;
189
                                /* uc could be further combined with subsequent
190
                                   characters.  So don't put it into sortbuf[0] in
191
                                   this round, only in the next round.  */
192
11.7k
                                sortbuf_count = 0;
193
11.7k
                              }
194
2.29M
                          }
195
7.12M
                      }
196
12.0M
                  }
197
198
19.2M
                for (size_t j = 0; j < sortbuf_count; j++)
199
7.20M
                  {
200
7.20M
                    ucs4_t muc = sortbuf[j].code;
201
202
                    /* Append muc to the result accumulator.  */
203
7.20M
                    if (length < allocated)
204
2.37M
                      {
205
2.37M
                        int ret =
206
2.37M
                          U_UCTOMB (result + length, muc, allocated - length);
207
2.37M
                        if (ret == -1)
208
0
                          {
209
0
                            errno = EINVAL;
210
0
                            goto fail;
211
0
                          }
212
2.37M
                        if (ret >= 0)
213
2.37M
                          {
214
2.37M
                            length += ret;
215
2.37M
                            goto done_appending;
216
2.37M
                          }
217
2.37M
                      }
218
4.82M
                    {
219
4.82M
                      size_t old_allocated = allocated;
220
4.82M
                      size_t new_allocated = 2 * old_allocated;
221
4.82M
                      if (new_allocated < 64)
222
4.82M
                        new_allocated = 64;
223
4.82M
                      if (new_allocated < old_allocated) /* integer overflow? */
224
0
                        abort ();
225
4.82M
                      {
226
4.82M
                        UNIT *larger_result;
227
4.82M
                        if (result == NULL)
228
4.82M
                          {
229
4.82M
                            larger_result =
230
4.82M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
231
4.82M
                            if (larger_result == NULL)
232
0
                              {
233
0
                                errno = ENOMEM;
234
0
                                goto fail;
235
0
                              }
236
4.82M
                          }
237
2.72k
                        else if (result == resultbuf)
238
0
                          {
239
0
                            larger_result =
240
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
241
0
                            if (larger_result == NULL)
242
0
                              {
243
0
                                errno = ENOMEM;
244
0
                                goto fail;
245
0
                              }
246
0
                            U_CPY (larger_result, resultbuf, length);
247
0
                          }
248
2.72k
                        else
249
2.72k
                          {
250
2.72k
                            larger_result =
251
2.72k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
252
2.72k
                            if (larger_result == NULL)
253
0
                              {
254
0
                                errno = ENOMEM;
255
0
                                goto fail;
256
0
                              }
257
2.72k
                          }
258
4.82M
                        result = larger_result;
259
4.82M
                        allocated = new_allocated;
260
4.82M
                        {
261
4.82M
                          int ret =
262
4.82M
                            U_UCTOMB (result + length, muc, allocated - length);
263
4.82M
                          if (ret == -1)
264
0
                            {
265
0
                              errno = EINVAL;
266
0
                              goto fail;
267
0
                            }
268
4.82M
                          if (ret < 0)
269
0
                            abort ();
270
4.82M
                          length += ret;
271
4.82M
                          goto done_appending;
272
4.82M
                        }
273
4.82M
                      }
274
4.82M
                    }
275
7.20M
                   done_appending: ;
276
7.20M
                  }
277
278
                /* sortbuf is now empty.  */
279
12.0M
                sortbuf_count = 0;
280
12.0M
              }
281
282
12.1M
            if (!(s < s_end))
283
              /* End of string reached.  */
284
4.89M
              break;
285
286
            /* Append (uc, ccc) to sortbuf.  */
287
7.23M
            if (sortbuf_count == sortbuf_allocated)
288
808
              {
289
808
                sortbuf_allocated = 2 * sortbuf_allocated;
290
808
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
291
0
                  abort ();
292
808
                struct ucs4_with_ccc *new_sortbuf =
293
808
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
294
808
                if (new_sortbuf == NULL)
295
0
                  {
296
0
                    errno = ENOMEM;
297
0
                    goto fail;
298
0
                  }
299
808
                memcpy (new_sortbuf, sortbuf,
300
808
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
301
808
                if (sortbuf != sortbuf_preallocated)
302
808
                  free (sortbuf);
303
808
                sortbuf = new_sortbuf;
304
808
              }
305
7.23M
            sortbuf[sortbuf_count].code = uc;
306
7.23M
            sortbuf[sortbuf_count].ccc = ccc;
307
7.23M
            sortbuf_count++;
308
309
7.23M
            i++;
310
7.23M
          }
311
312
12.0M
        if (!(s < s_end))
313
          /* End of string reached.  */
314
4.89M
          break;
315
316
7.19M
        s += count;
317
7.19M
      }
318
4.89M
  }
319
320
4.89M
  if (length == 0)
321
69.0k
    {
322
69.0k
      if (result == NULL)
323
69.0k
        {
324
          /* Return a non-NULL value.  NULL means error.  */
325
69.0k
          result = (UNIT *) malloc (1);
326
69.0k
          if (result == NULL)
327
0
            {
328
0
              errno = ENOMEM;
329
0
              goto fail;
330
0
            }
331
69.0k
        }
332
69.0k
    }
333
4.82M
  else if (result != resultbuf && length < allocated)
334
4.82M
    {
335
      /* Shrink the allocated memory if possible.  */
336
4.82M
      UNIT *memory = (UNIT *) realloc (result, length * sizeof (UNIT));
337
4.82M
      if (memory != NULL)
338
4.82M
        result = memory;
339
4.82M
    }
340
341
4.89M
  if (sortbuf_count > 0)
342
0
    abort ();
343
4.89M
  if (sortbuf != sortbuf_preallocated)
344
4.89M
    free (sortbuf);
345
346
4.89M
  *lengthp = length;
347
4.89M
  return result;
348
349
0
 fail:
350
0
  {
351
0
    int saved_errno = errno;
352
0
    if (sortbuf != sortbuf_preallocated)
353
0
      free (sortbuf);
354
0
    if (result != resultbuf)
355
0
      free (result);
356
0
    errno = saved_errno;
357
0
  }
358
  return NULL;
359
4.89M
}