Coverage Report

Created: 2026-02-09 06:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
39.4k
{
22
39.4k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
39.4k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
39.4k
  UNIT *result;
27
39.4k
  size_t length;
28
39.4k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
39.4k
  #define SORTBUF_PREALLOCATED 64
31
39.4k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
39.4k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
39.4k
  size_t sortbuf_allocated;
34
39.4k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
39.4k
  if (resultbuf == NULL)
38
39.4k
    {
39
39.4k
      result = NULL;
40
39.4k
      allocated = 0;
41
39.4k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
39.4k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
39.4k
  sortbuf = sortbuf_preallocated;
51
39.4k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
39.4k
  sortbuf_count = 0;
53
54
39.4k
  {
55
39.4k
    const UNIT *s_end = s + n;
56
57
39.4k
    for (;;)
58
370k
      {
59
370k
        int count;
60
370k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
370k
        int decomposed_count;
62
370k
        int i;
63
64
370k
        if (s < s_end)
65
331k
          {
66
            /* Fetch the next character.  */
67
331k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
331k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
331k
            {
77
331k
              int curr;
78
79
691k
              for (curr = 0; curr < decomposed_count; )
80
359k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
359k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
359k
                  int curr_decomposed_count;
85
86
359k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
359k
                  if (curr_decomposed_count >= 0)
88
14.2k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
14.2k
                      int shift = curr_decomposed_count - 1;
93
94
14.2k
                      if (shift < 0)
95
0
                        abort ();
96
14.2k
                      if (shift > 0)
97
14.0k
                        {
98
14.0k
                          int j;
99
100
14.0k
                          decomposed_count += shift;
101
14.0k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
19.7k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
5.71k
                            decomposed[j + shift] = decomposed[j];
105
14.0k
                        }
106
42.5k
                      for (; shift >= 0; shift--)
107
28.2k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
14.2k
                    }
109
345k
                  else
110
345k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
345k
                      curr++;
113
345k
                    }
114
359k
                }
115
331k
            }
116
331k
          }
117
39.4k
        else
118
39.4k
          {
119
39.4k
            count = 0;
120
39.4k
            decomposed_count = 0;
121
39.4k
          }
122
123
370k
        i = 0;
124
370k
        for (;;)
125
716k
          {
126
716k
            ucs4_t uc;
127
716k
            int ccc;
128
129
716k
            if (s < s_end)
130
676k
              {
131
                /* Fetch the next character from the decomposition.  */
132
676k
                if (i == decomposed_count)
133
331k
                  break;
134
345k
                uc = decomposed[i];
135
345k
                ccc = uc_combining_class (uc);
136
345k
              }
137
39.4k
            else
138
39.4k
              {
139
                /* End of string reached.  */
140
39.4k
                uc = 0;
141
39.4k
                ccc = 0;
142
39.4k
              }
143
144
384k
            if (ccc == 0)
145
331k
              {
146
331k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
331k
                if (sortbuf_count > 1)
151
11.4k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
11.4k
                                                           sortbuf + sortbuf_count);
153
154
331k
                if (composer != NULL)
155
331k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
331k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
291k
                      {
178
342k
                        for (j = 1; j < sortbuf_count; )
179
50.7k
                          {
180
50.7k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
24.5k
                              {
182
24.5k
                                ucs4_t combined =
183
24.5k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
24.5k
                                if (combined)
185
9.84k
                                  {
186
9.84k
                                    size_t k;
187
188
9.84k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
34.1k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
24.2k
                                      sortbuf[k - 1] = sortbuf[k];
192
9.84k
                                    sortbuf_count--;
193
9.84k
                                    continue;
194
9.84k
                                  }
195
24.5k
                              }
196
40.9k
                            j++;
197
40.9k
                          }
198
291k
                        if (s < s_end && sortbuf_count == 1)
199
246k
                          {
200
246k
                            ucs4_t combined =
201
246k
                              composer (sortbuf[0].code, uc);
202
246k
                            if (combined)
203
3.37k
                              {
204
3.37k
                                uc = combined;
205
3.37k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
3.37k
                                sortbuf_count = 0;
210
3.37k
                              }
211
246k
                          }
212
291k
                      }
213
331k
                  }
214
215
663k
                for (j = 0; j < sortbuf_count; j++)
216
332k
                  {
217
332k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
332k
                    if (length < allocated)
221
292k
                      {
222
292k
                        int ret =
223
292k
                          U_UCTOMB (result + length, muc, allocated - length);
224
292k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
292k
                        if (ret >= 0)
230
292k
                          {
231
292k
                            length += ret;
232
292k
                            goto done_appending;
233
292k
                          }
234
292k
                      }
235
39.9k
                    {
236
39.9k
                      size_t old_allocated = allocated;
237
39.9k
                      size_t new_allocated = 2 * old_allocated;
238
39.9k
                      if (new_allocated < 64)
239
38.8k
                        new_allocated = 64;
240
39.9k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
39.9k
                      {
243
39.9k
                        UNIT *larger_result;
244
39.9k
                        if (result == NULL)
245
38.8k
                          {
246
38.8k
                            larger_result =
247
38.8k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
38.8k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
38.8k
                          }
254
1.04k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
1.04k
                        else
266
1.04k
                          {
267
1.04k
                            larger_result =
268
1.04k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
1.04k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
1.04k
                          }
275
39.9k
                        result = larger_result;
276
39.9k
                        allocated = new_allocated;
277
39.9k
                        {
278
39.9k
                          int ret =
279
39.9k
                            U_UCTOMB (result + length, muc, allocated - length);
280
39.9k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
39.9k
                          if (ret < 0)
286
0
                            abort ();
287
39.9k
                          length += ret;
288
39.9k
                          goto done_appending;
289
39.9k
                        }
290
39.9k
                      }
291
39.9k
                    }
292
332k
                   done_appending: ;
293
332k
                  }
294
295
                /* sortbuf is now empty.  */
296
331k
                sortbuf_count = 0;
297
331k
              }
298
299
384k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
39.4k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
345k
            if (sortbuf_count == sortbuf_allocated)
305
180
              {
306
180
                struct ucs4_with_ccc *new_sortbuf;
307
308
180
                sortbuf_allocated = 2 * sortbuf_allocated;
309
180
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
180
                new_sortbuf =
312
180
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
180
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
180
                memcpy (new_sortbuf, sortbuf,
319
180
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
180
                if (sortbuf != sortbuf_preallocated)
321
180
                  free (sortbuf);
322
180
                sortbuf = new_sortbuf;
323
180
              }
324
345k
            sortbuf[sortbuf_count].code = uc;
325
345k
            sortbuf[sortbuf_count].ccc = ccc;
326
345k
            sortbuf_count++;
327
328
345k
            i++;
329
345k
          }
330
331
370k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
39.4k
          break;
334
335
331k
        s += count;
336
331k
      }
337
39.4k
  }
338
339
39.4k
  if (length == 0)
340
581
    {
341
581
      if (result == NULL)
342
581
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
581
          result = (UNIT *) malloc (1);
345
581
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
581
        }
351
581
    }
352
38.8k
  else if (result != resultbuf && length < allocated)
353
38.8k
    {
354
      /* Shrink the allocated memory if possible.  */
355
38.8k
      UNIT *memory;
356
357
38.8k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
38.8k
      if (memory != NULL)
359
38.8k
        result = memory;
360
38.8k
    }
361
362
39.4k
  if (sortbuf_count > 0)
363
0
    abort ();
364
39.4k
  if (sortbuf != sortbuf_preallocated)
365
39.4k
    free (sortbuf);
366
367
39.4k
  *lengthp = length;
368
39.4k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
39.4k
}
u16_normalize
Line
Count
Source
21
10.6k
{
22
10.6k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
10.6k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
10.6k
  UNIT *result;
27
10.6k
  size_t length;
28
10.6k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
10.6k
  #define SORTBUF_PREALLOCATED 64
31
10.6k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
10.6k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
10.6k
  size_t sortbuf_allocated;
34
10.6k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
10.6k
  if (resultbuf == NULL)
38
10.6k
    {
39
10.6k
      result = NULL;
40
10.6k
      allocated = 0;
41
10.6k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
10.6k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
10.6k
  sortbuf = sortbuf_preallocated;
51
10.6k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
10.6k
  sortbuf_count = 0;
53
54
10.6k
  {
55
10.6k
    const UNIT *s_end = s + n;
56
57
10.6k
    for (;;)
58
58.7k
      {
59
58.7k
        int count;
60
58.7k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
58.7k
        int decomposed_count;
62
58.7k
        int i;
63
64
58.7k
        if (s < s_end)
65
48.1k
          {
66
            /* Fetch the next character.  */
67
48.1k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
48.1k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
48.1k
            {
77
48.1k
              int curr;
78
79
96.3k
              for (curr = 0; curr < decomposed_count; )
80
48.1k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
48.1k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
48.1k
                  int curr_decomposed_count;
85
86
48.1k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
48.1k
                  if (curr_decomposed_count >= 0)
88
0
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
0
                      int shift = curr_decomposed_count - 1;
93
94
0
                      if (shift < 0)
95
0
                        abort ();
96
0
                      if (shift > 0)
97
0
                        {
98
0
                          int j;
99
100
0
                          decomposed_count += shift;
101
0
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
0
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
0
                            decomposed[j + shift] = decomposed[j];
105
0
                        }
106
0
                      for (; shift >= 0; shift--)
107
0
                        decomposed[curr + shift] = curr_decomposed[shift];
108
0
                    }
109
48.1k
                  else
110
48.1k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
48.1k
                      curr++;
113
48.1k
                    }
114
48.1k
                }
115
48.1k
            }
116
48.1k
          }
117
10.6k
        else
118
10.6k
          {
119
10.6k
            count = 0;
120
10.6k
            decomposed_count = 0;
121
10.6k
          }
122
123
58.7k
        i = 0;
124
58.7k
        for (;;)
125
106k
          {
126
106k
            ucs4_t uc;
127
106k
            int ccc;
128
129
106k
            if (s < s_end)
130
96.3k
              {
131
                /* Fetch the next character from the decomposition.  */
132
96.3k
                if (i == decomposed_count)
133
48.1k
                  break;
134
48.1k
                uc = decomposed[i];
135
48.1k
                ccc = uc_combining_class (uc);
136
48.1k
              }
137
10.6k
            else
138
10.6k
              {
139
                /* End of string reached.  */
140
10.6k
                uc = 0;
141
10.6k
                ccc = 0;
142
10.6k
              }
143
144
58.7k
            if (ccc == 0)
145
58.7k
              {
146
58.7k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
58.7k
                if (sortbuf_count > 1)
151
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
0
                                                           sortbuf + sortbuf_count);
153
154
58.7k
                if (composer != NULL)
155
58.7k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
58.7k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
48.1k
                      {
178
48.1k
                        for (j = 1; j < sortbuf_count; )
179
0
                          {
180
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
0
                              {
182
0
                                ucs4_t combined =
183
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
0
                                if (combined)
185
0
                                  {
186
0
                                    size_t k;
187
188
0
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
0
                                    for (k = j + 1; k < sortbuf_count; k++)
191
0
                                      sortbuf[k - 1] = sortbuf[k];
192
0
                                    sortbuf_count--;
193
0
                                    continue;
194
0
                                  }
195
0
                              }
196
0
                            j++;
197
0
                          }
198
48.1k
                        if (s < s_end && sortbuf_count == 1)
199
37.5k
                          {
200
37.5k
                            ucs4_t combined =
201
37.5k
                              composer (sortbuf[0].code, uc);
202
37.5k
                            if (combined)
203
0
                              {
204
0
                                uc = combined;
205
0
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
0
                                sortbuf_count = 0;
210
0
                              }
211
37.5k
                          }
212
48.1k
                      }
213
58.7k
                  }
214
215
106k
                for (j = 0; j < sortbuf_count; j++)
216
48.1k
                  {
217
48.1k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
48.1k
                    if (length < allocated)
221
37.5k
                      {
222
37.5k
                        int ret =
223
37.5k
                          U_UCTOMB (result + length, muc, allocated - length);
224
37.5k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
37.5k
                        if (ret >= 0)
230
37.5k
                          {
231
37.5k
                            length += ret;
232
37.5k
                            goto done_appending;
233
37.5k
                          }
234
37.5k
                      }
235
10.6k
                    {
236
10.6k
                      size_t old_allocated = allocated;
237
10.6k
                      size_t new_allocated = 2 * old_allocated;
238
10.6k
                      if (new_allocated < 64)
239
10.6k
                        new_allocated = 64;
240
10.6k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
10.6k
                      {
243
10.6k
                        UNIT *larger_result;
244
10.6k
                        if (result == NULL)
245
10.6k
                          {
246
10.6k
                            larger_result =
247
10.6k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
10.6k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
10.6k
                          }
254
0
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
0
                        else
266
0
                          {
267
0
                            larger_result =
268
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
0
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
0
                          }
275
10.6k
                        result = larger_result;
276
10.6k
                        allocated = new_allocated;
277
10.6k
                        {
278
10.6k
                          int ret =
279
10.6k
                            U_UCTOMB (result + length, muc, allocated - length);
280
10.6k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
10.6k
                          if (ret < 0)
286
0
                            abort ();
287
10.6k
                          length += ret;
288
10.6k
                          goto done_appending;
289
10.6k
                        }
290
10.6k
                      }
291
10.6k
                    }
292
48.1k
                   done_appending: ;
293
48.1k
                  }
294
295
                /* sortbuf is now empty.  */
296
58.7k
                sortbuf_count = 0;
297
58.7k
              }
298
299
58.7k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
10.6k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
48.1k
            if (sortbuf_count == sortbuf_allocated)
305
0
              {
306
0
                struct ucs4_with_ccc *new_sortbuf;
307
308
0
                sortbuf_allocated = 2 * sortbuf_allocated;
309
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
0
                new_sortbuf =
312
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
0
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
0
                memcpy (new_sortbuf, sortbuf,
319
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
0
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
0
                sortbuf = new_sortbuf;
323
0
              }
324
48.1k
            sortbuf[sortbuf_count].code = uc;
325
48.1k
            sortbuf[sortbuf_count].ccc = ccc;
326
48.1k
            sortbuf_count++;
327
328
48.1k
            i++;
329
48.1k
          }
330
331
58.7k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
10.6k
          break;
334
335
48.1k
        s += count;
336
48.1k
      }
337
10.6k
  }
338
339
10.6k
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
10.6k
  else if (result != resultbuf && length < allocated)
353
10.6k
    {
354
      /* Shrink the allocated memory if possible.  */
355
10.6k
      UNIT *memory;
356
357
10.6k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
10.6k
      if (memory != NULL)
359
10.6k
        result = memory;
360
10.6k
    }
361
362
10.6k
  if (sortbuf_count > 0)
363
0
    abort ();
364
10.6k
  if (sortbuf != sortbuf_preallocated)
365
10.6k
    free (sortbuf);
366
367
10.6k
  *lengthp = length;
368
10.6k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
10.6k
}
u32_normalize
Line
Count
Source
21
28.8k
{
22
28.8k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
28.8k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
28.8k
  UNIT *result;
27
28.8k
  size_t length;
28
28.8k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
28.8k
  #define SORTBUF_PREALLOCATED 64
31
28.8k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
28.8k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
28.8k
  size_t sortbuf_allocated;
34
28.8k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
28.8k
  if (resultbuf == NULL)
38
28.8k
    {
39
28.8k
      result = NULL;
40
28.8k
      allocated = 0;
41
28.8k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
28.8k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
28.8k
  sortbuf = sortbuf_preallocated;
51
28.8k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
28.8k
  sortbuf_count = 0;
53
54
28.8k
  {
55
28.8k
    const UNIT *s_end = s + n;
56
57
28.8k
    for (;;)
58
312k
      {
59
312k
        int count;
60
312k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
312k
        int decomposed_count;
62
312k
        int i;
63
64
312k
        if (s < s_end)
65
283k
          {
66
            /* Fetch the next character.  */
67
283k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
283k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
283k
            {
77
283k
              int curr;
78
79
594k
              for (curr = 0; curr < decomposed_count; )
80
311k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
311k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
311k
                  int curr_decomposed_count;
85
86
311k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
311k
                  if (curr_decomposed_count >= 0)
88
14.2k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
14.2k
                      int shift = curr_decomposed_count - 1;
93
94
14.2k
                      if (shift < 0)
95
0
                        abort ();
96
14.2k
                      if (shift > 0)
97
14.0k
                        {
98
14.0k
                          int j;
99
100
14.0k
                          decomposed_count += shift;
101
14.0k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
19.7k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
5.71k
                            decomposed[j + shift] = decomposed[j];
105
14.0k
                        }
106
42.5k
                      for (; shift >= 0; shift--)
107
28.2k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
14.2k
                    }
109
297k
                  else
110
297k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
297k
                      curr++;
113
297k
                    }
114
311k
                }
115
283k
            }
116
283k
          }
117
28.8k
        else
118
28.8k
          {
119
28.8k
            count = 0;
120
28.8k
            decomposed_count = 0;
121
28.8k
          }
122
123
312k
        i = 0;
124
312k
        for (;;)
125
609k
          {
126
609k
            ucs4_t uc;
127
609k
            int ccc;
128
129
609k
            if (s < s_end)
130
580k
              {
131
                /* Fetch the next character from the decomposition.  */
132
580k
                if (i == decomposed_count)
133
283k
                  break;
134
297k
                uc = decomposed[i];
135
297k
                ccc = uc_combining_class (uc);
136
297k
              }
137
28.8k
            else
138
28.8k
              {
139
                /* End of string reached.  */
140
28.8k
                uc = 0;
141
28.8k
                ccc = 0;
142
28.8k
              }
143
144
326k
            if (ccc == 0)
145
272k
              {
146
272k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
272k
                if (sortbuf_count > 1)
151
11.4k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
11.4k
                                                           sortbuf + sortbuf_count);
153
154
272k
                if (composer != NULL)
155
272k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
272k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
243k
                      {
178
294k
                        for (j = 1; j < sortbuf_count; )
179
50.7k
                          {
180
50.7k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
24.5k
                              {
182
24.5k
                                ucs4_t combined =
183
24.5k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
24.5k
                                if (combined)
185
9.84k
                                  {
186
9.84k
                                    size_t k;
187
188
9.84k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
34.1k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
24.2k
                                      sortbuf[k - 1] = sortbuf[k];
192
9.84k
                                    sortbuf_count--;
193
9.84k
                                    continue;
194
9.84k
                                  }
195
24.5k
                              }
196
40.9k
                            j++;
197
40.9k
                          }
198
243k
                        if (s < s_end && sortbuf_count == 1)
199
208k
                          {
200
208k
                            ucs4_t combined =
201
208k
                              composer (sortbuf[0].code, uc);
202
208k
                            if (combined)
203
3.37k
                              {
204
3.37k
                                uc = combined;
205
3.37k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
3.37k
                                sortbuf_count = 0;
210
3.37k
                              }
211
208k
                          }
212
243k
                      }
213
272k
                  }
214
215
556k
                for (j = 0; j < sortbuf_count; j++)
216
284k
                  {
217
284k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
284k
                    if (length < allocated)
221
254k
                      {
222
254k
                        int ret =
223
254k
                          U_UCTOMB (result + length, muc, allocated - length);
224
254k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
254k
                        if (ret >= 0)
230
254k
                          {
231
254k
                            length += ret;
232
254k
                            goto done_appending;
233
254k
                          }
234
254k
                      }
235
29.3k
                    {
236
29.3k
                      size_t old_allocated = allocated;
237
29.3k
                      size_t new_allocated = 2 * old_allocated;
238
29.3k
                      if (new_allocated < 64)
239
28.2k
                        new_allocated = 64;
240
29.3k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
29.3k
                      {
243
29.3k
                        UNIT *larger_result;
244
29.3k
                        if (result == NULL)
245
28.2k
                          {
246
28.2k
                            larger_result =
247
28.2k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
28.2k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
28.2k
                          }
254
1.04k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
1.04k
                        else
266
1.04k
                          {
267
1.04k
                            larger_result =
268
1.04k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
1.04k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
1.04k
                          }
275
29.3k
                        result = larger_result;
276
29.3k
                        allocated = new_allocated;
277
29.3k
                        {
278
29.3k
                          int ret =
279
29.3k
                            U_UCTOMB (result + length, muc, allocated - length);
280
29.3k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
29.3k
                          if (ret < 0)
286
0
                            abort ();
287
29.3k
                          length += ret;
288
29.3k
                          goto done_appending;
289
29.3k
                        }
290
29.3k
                      }
291
29.3k
                    }
292
284k
                   done_appending: ;
293
284k
                  }
294
295
                /* sortbuf is now empty.  */
296
272k
                sortbuf_count = 0;
297
272k
              }
298
299
326k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
28.8k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
297k
            if (sortbuf_count == sortbuf_allocated)
305
180
              {
306
180
                struct ucs4_with_ccc *new_sortbuf;
307
308
180
                sortbuf_allocated = 2 * sortbuf_allocated;
309
180
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
180
                new_sortbuf =
312
180
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
180
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
180
                memcpy (new_sortbuf, sortbuf,
319
180
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
180
                if (sortbuf != sortbuf_preallocated)
321
180
                  free (sortbuf);
322
180
                sortbuf = new_sortbuf;
323
180
              }
324
297k
            sortbuf[sortbuf_count].code = uc;
325
297k
            sortbuf[sortbuf_count].ccc = ccc;
326
297k
            sortbuf_count++;
327
328
297k
            i++;
329
297k
          }
330
331
312k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
28.8k
          break;
334
335
283k
        s += count;
336
283k
      }
337
28.8k
  }
338
339
28.8k
  if (length == 0)
340
581
    {
341
581
      if (result == NULL)
342
581
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
581
          result = (UNIT *) malloc (1);
345
581
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
581
        }
351
581
    }
352
28.2k
  else if (result != resultbuf && length < allocated)
353
28.1k
    {
354
      /* Shrink the allocated memory if possible.  */
355
28.1k
      UNIT *memory;
356
357
28.1k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
28.1k
      if (memory != NULL)
359
28.1k
        result = memory;
360
28.1k
    }
361
362
28.8k
  if (sortbuf_count > 0)
363
0
    abort ();
364
28.8k
  if (sortbuf != sortbuf_preallocated)
365
28.8k
    free (sortbuf);
366
367
28.8k
  *lengthp = length;
368
28.8k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
28.8k
}