Coverage Report

Created: 2025-12-31 06:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
40.0k
{
22
40.0k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
40.0k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
40.0k
  UNIT *result;
27
40.0k
  size_t length;
28
40.0k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
40.0k
  #define SORTBUF_PREALLOCATED 64
31
40.0k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
40.0k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
40.0k
  size_t sortbuf_allocated;
34
40.0k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
40.0k
  if (resultbuf == NULL)
38
40.0k
    {
39
40.0k
      result = NULL;
40
40.0k
      allocated = 0;
41
40.0k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
40.0k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
40.0k
  sortbuf = sortbuf_preallocated;
51
40.0k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
40.0k
  sortbuf_count = 0;
53
54
40.0k
  {
55
40.0k
    const UNIT *s_end = s + n;
56
57
40.0k
    for (;;)
58
379k
      {
59
379k
        int count;
60
379k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
379k
        int decomposed_count;
62
379k
        int i;
63
64
379k
        if (s < s_end)
65
339k
          {
66
            /* Fetch the next character.  */
67
339k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
339k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
339k
            {
77
339k
              int curr;
78
79
707k
              for (curr = 0; curr < decomposed_count; )
80
368k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
368k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
368k
                  int curr_decomposed_count;
85
86
368k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
368k
                  if (curr_decomposed_count >= 0)
88
14.5k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
14.5k
                      int shift = curr_decomposed_count - 1;
93
94
14.5k
                      if (shift < 0)
95
0
                        abort ();
96
14.5k
                      if (shift > 0)
97
14.3k
                        {
98
14.3k
                          int j;
99
100
14.3k
                          decomposed_count += shift;
101
14.3k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
20.1k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
5.85k
                            decomposed[j + shift] = decomposed[j];
105
14.3k
                        }
106
43.4k
                      for (; shift >= 0; shift--)
107
28.8k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
14.5k
                    }
109
353k
                  else
110
353k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
353k
                      curr++;
113
353k
                    }
114
368k
                }
115
339k
            }
116
339k
          }
117
40.0k
        else
118
40.0k
          {
119
40.0k
            count = 0;
120
40.0k
            decomposed_count = 0;
121
40.0k
          }
122
123
379k
        i = 0;
124
379k
        for (;;)
125
732k
          {
126
732k
            ucs4_t uc;
127
732k
            int ccc;
128
129
732k
            if (s < s_end)
130
692k
              {
131
                /* Fetch the next character from the decomposition.  */
132
692k
                if (i == decomposed_count)
133
339k
                  break;
134
353k
                uc = decomposed[i];
135
353k
                ccc = uc_combining_class (uc);
136
353k
              }
137
40.0k
            else
138
40.0k
              {
139
                /* End of string reached.  */
140
40.0k
                uc = 0;
141
40.0k
                ccc = 0;
142
40.0k
              }
143
144
393k
            if (ccc == 0)
145
338k
              {
146
338k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
338k
                if (sortbuf_count > 1)
151
11.3k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
11.3k
                                                           sortbuf + sortbuf_count);
153
154
338k
                if (composer != NULL)
155
338k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
338k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
297k
                      {
178
350k
                        for (j = 1; j < sortbuf_count; )
179
52.7k
                          {
180
52.7k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
24.5k
                              {
182
24.5k
                                ucs4_t combined =
183
24.5k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
24.5k
                                if (combined)
185
9.51k
                                  {
186
9.51k
                                    size_t k;
187
188
9.51k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
36.2k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
26.7k
                                      sortbuf[k - 1] = sortbuf[k];
192
9.51k
                                    sortbuf_count--;
193
9.51k
                                    continue;
194
9.51k
                                  }
195
24.5k
                              }
196
43.2k
                            j++;
197
43.2k
                          }
198
297k
                        if (s < s_end && sortbuf_count == 1)
199
251k
                          {
200
251k
                            ucs4_t combined =
201
251k
                              composer (sortbuf[0].code, uc);
202
251k
                            if (combined)
203
3.98k
                              {
204
3.98k
                                uc = combined;
205
3.98k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
3.98k
                                sortbuf_count = 0;
210
3.98k
                              }
211
251k
                          }
212
297k
                      }
213
338k
                  }
214
215
678k
                for (j = 0; j < sortbuf_count; j++)
216
339k
                  {
217
339k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
339k
                    if (length < allocated)
221
299k
                      {
222
299k
                        int ret =
223
299k
                          U_UCTOMB (result + length, muc, allocated - length);
224
299k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
299k
                        if (ret >= 0)
230
299k
                          {
231
299k
                            length += ret;
232
299k
                            goto done_appending;
233
299k
                          }
234
299k
                      }
235
40.5k
                    {
236
40.5k
                      size_t old_allocated = allocated;
237
40.5k
                      size_t new_allocated = 2 * old_allocated;
238
40.5k
                      if (new_allocated < 64)
239
39.4k
                        new_allocated = 64;
240
40.5k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
40.5k
                      {
243
40.5k
                        UNIT *larger_result;
244
40.5k
                        if (result == NULL)
245
39.4k
                          {
246
39.4k
                            larger_result =
247
39.4k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
39.4k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
39.4k
                          }
254
1.06k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
1.06k
                        else
266
1.06k
                          {
267
1.06k
                            larger_result =
268
1.06k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
1.06k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
1.06k
                          }
275
40.5k
                        result = larger_result;
276
40.5k
                        allocated = new_allocated;
277
40.5k
                        {
278
40.5k
                          int ret =
279
40.5k
                            U_UCTOMB (result + length, muc, allocated - length);
280
40.5k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
40.5k
                          if (ret < 0)
286
0
                            abort ();
287
40.5k
                          length += ret;
288
40.5k
                          goto done_appending;
289
40.5k
                        }
290
40.5k
                      }
291
40.5k
                    }
292
339k
                   done_appending: ;
293
339k
                  }
294
295
                /* sortbuf is now empty.  */
296
338k
                sortbuf_count = 0;
297
338k
              }
298
299
393k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
40.0k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
353k
            if (sortbuf_count == sortbuf_allocated)
305
185
              {
306
185
                struct ucs4_with_ccc *new_sortbuf;
307
308
185
                sortbuf_allocated = 2 * sortbuf_allocated;
309
185
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
185
                new_sortbuf =
312
185
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
185
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
185
                memcpy (new_sortbuf, sortbuf,
319
185
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
185
                if (sortbuf != sortbuf_preallocated)
321
185
                  free (sortbuf);
322
185
                sortbuf = new_sortbuf;
323
185
              }
324
353k
            sortbuf[sortbuf_count].code = uc;
325
353k
            sortbuf[sortbuf_count].ccc = ccc;
326
353k
            sortbuf_count++;
327
328
353k
            i++;
329
353k
          }
330
331
379k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
40.0k
          break;
334
335
339k
        s += count;
336
339k
      }
337
40.0k
  }
338
339
40.0k
  if (length == 0)
340
570
    {
341
570
      if (result == NULL)
342
570
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
570
          result = (UNIT *) malloc (1);
345
570
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
570
        }
351
570
    }
352
39.4k
  else if (result != resultbuf && length < allocated)
353
39.3k
    {
354
      /* Shrink the allocated memory if possible.  */
355
39.3k
      UNIT *memory;
356
357
39.3k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
39.3k
      if (memory != NULL)
359
39.3k
        result = memory;
360
39.3k
    }
361
362
40.0k
  if (sortbuf_count > 0)
363
0
    abort ();
364
40.0k
  if (sortbuf != sortbuf_preallocated)
365
40.0k
    free (sortbuf);
366
367
40.0k
  *lengthp = length;
368
40.0k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
40.0k
}
u16_normalize
Line
Count
Source
21
10.6k
{
22
10.6k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
10.6k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
10.6k
  UNIT *result;
27
10.6k
  size_t length;
28
10.6k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
10.6k
  #define SORTBUF_PREALLOCATED 64
31
10.6k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
10.6k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
10.6k
  size_t sortbuf_allocated;
34
10.6k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
10.6k
  if (resultbuf == NULL)
38
10.6k
    {
39
10.6k
      result = NULL;
40
10.6k
      allocated = 0;
41
10.6k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
10.6k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
10.6k
  sortbuf = sortbuf_preallocated;
51
10.6k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
10.6k
  sortbuf_count = 0;
53
54
10.6k
  {
55
10.6k
    const UNIT *s_end = s + n;
56
57
10.6k
    for (;;)
58
59.5k
      {
59
59.5k
        int count;
60
59.5k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
59.5k
        int decomposed_count;
62
59.5k
        int i;
63
64
59.5k
        if (s < s_end)
65
48.9k
          {
66
            /* Fetch the next character.  */
67
48.9k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
48.9k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
48.9k
            {
77
48.9k
              int curr;
78
79
97.8k
              for (curr = 0; curr < decomposed_count; )
80
48.9k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
48.9k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
48.9k
                  int curr_decomposed_count;
85
86
48.9k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
48.9k
                  if (curr_decomposed_count >= 0)
88
0
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
0
                      int shift = curr_decomposed_count - 1;
93
94
0
                      if (shift < 0)
95
0
                        abort ();
96
0
                      if (shift > 0)
97
0
                        {
98
0
                          int j;
99
100
0
                          decomposed_count += shift;
101
0
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
0
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
0
                            decomposed[j + shift] = decomposed[j];
105
0
                        }
106
0
                      for (; shift >= 0; shift--)
107
0
                        decomposed[curr + shift] = curr_decomposed[shift];
108
0
                    }
109
48.9k
                  else
110
48.9k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
48.9k
                      curr++;
113
48.9k
                    }
114
48.9k
                }
115
48.9k
            }
116
48.9k
          }
117
10.6k
        else
118
10.6k
          {
119
10.6k
            count = 0;
120
10.6k
            decomposed_count = 0;
121
10.6k
          }
122
123
59.5k
        i = 0;
124
59.5k
        for (;;)
125
108k
          {
126
108k
            ucs4_t uc;
127
108k
            int ccc;
128
129
108k
            if (s < s_end)
130
97.8k
              {
131
                /* Fetch the next character from the decomposition.  */
132
97.8k
                if (i == decomposed_count)
133
48.9k
                  break;
134
48.9k
                uc = decomposed[i];
135
48.9k
                ccc = uc_combining_class (uc);
136
48.9k
              }
137
10.6k
            else
138
10.6k
              {
139
                /* End of string reached.  */
140
10.6k
                uc = 0;
141
10.6k
                ccc = 0;
142
10.6k
              }
143
144
59.5k
            if (ccc == 0)
145
59.5k
              {
146
59.5k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
59.5k
                if (sortbuf_count > 1)
151
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
0
                                                           sortbuf + sortbuf_count);
153
154
59.5k
                if (composer != NULL)
155
59.5k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
59.5k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
48.9k
                      {
178
48.9k
                        for (j = 1; j < sortbuf_count; )
179
0
                          {
180
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
0
                              {
182
0
                                ucs4_t combined =
183
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
0
                                if (combined)
185
0
                                  {
186
0
                                    size_t k;
187
188
0
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
0
                                    for (k = j + 1; k < sortbuf_count; k++)
191
0
                                      sortbuf[k - 1] = sortbuf[k];
192
0
                                    sortbuf_count--;
193
0
                                    continue;
194
0
                                  }
195
0
                              }
196
0
                            j++;
197
0
                          }
198
48.9k
                        if (s < s_end && sortbuf_count == 1)
199
38.3k
                          {
200
38.3k
                            ucs4_t combined =
201
38.3k
                              composer (sortbuf[0].code, uc);
202
38.3k
                            if (combined)
203
0
                              {
204
0
                                uc = combined;
205
0
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
0
                                sortbuf_count = 0;
210
0
                              }
211
38.3k
                          }
212
48.9k
                      }
213
59.5k
                  }
214
215
108k
                for (j = 0; j < sortbuf_count; j++)
216
48.9k
                  {
217
48.9k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
48.9k
                    if (length < allocated)
221
38.3k
                      {
222
38.3k
                        int ret =
223
38.3k
                          U_UCTOMB (result + length, muc, allocated - length);
224
38.3k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
38.3k
                        if (ret >= 0)
230
38.3k
                          {
231
38.3k
                            length += ret;
232
38.3k
                            goto done_appending;
233
38.3k
                          }
234
38.3k
                      }
235
10.6k
                    {
236
10.6k
                      size_t old_allocated = allocated;
237
10.6k
                      size_t new_allocated = 2 * old_allocated;
238
10.6k
                      if (new_allocated < 64)
239
10.6k
                        new_allocated = 64;
240
10.6k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
10.6k
                      {
243
10.6k
                        UNIT *larger_result;
244
10.6k
                        if (result == NULL)
245
10.6k
                          {
246
10.6k
                            larger_result =
247
10.6k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
10.6k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
10.6k
                          }
254
0
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
0
                        else
266
0
                          {
267
0
                            larger_result =
268
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
0
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
0
                          }
275
10.6k
                        result = larger_result;
276
10.6k
                        allocated = new_allocated;
277
10.6k
                        {
278
10.6k
                          int ret =
279
10.6k
                            U_UCTOMB (result + length, muc, allocated - length);
280
10.6k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
10.6k
                          if (ret < 0)
286
0
                            abort ();
287
10.6k
                          length += ret;
288
10.6k
                          goto done_appending;
289
10.6k
                        }
290
10.6k
                      }
291
10.6k
                    }
292
48.9k
                   done_appending: ;
293
48.9k
                  }
294
295
                /* sortbuf is now empty.  */
296
59.5k
                sortbuf_count = 0;
297
59.5k
              }
298
299
59.5k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
10.6k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
48.9k
            if (sortbuf_count == sortbuf_allocated)
305
0
              {
306
0
                struct ucs4_with_ccc *new_sortbuf;
307
308
0
                sortbuf_allocated = 2 * sortbuf_allocated;
309
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
0
                new_sortbuf =
312
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
0
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
0
                memcpy (new_sortbuf, sortbuf,
319
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
0
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
0
                sortbuf = new_sortbuf;
323
0
              }
324
48.9k
            sortbuf[sortbuf_count].code = uc;
325
48.9k
            sortbuf[sortbuf_count].ccc = ccc;
326
48.9k
            sortbuf_count++;
327
328
48.9k
            i++;
329
48.9k
          }
330
331
59.5k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
10.6k
          break;
334
335
48.9k
        s += count;
336
48.9k
      }
337
10.6k
  }
338
339
10.6k
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
10.6k
  else if (result != resultbuf && length < allocated)
353
10.6k
    {
354
      /* Shrink the allocated memory if possible.  */
355
10.6k
      UNIT *memory;
356
357
10.6k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
10.6k
      if (memory != NULL)
359
10.6k
        result = memory;
360
10.6k
    }
361
362
10.6k
  if (sortbuf_count > 0)
363
0
    abort ();
364
10.6k
  if (sortbuf != sortbuf_preallocated)
365
10.6k
    free (sortbuf);
366
367
10.6k
  *lengthp = length;
368
10.6k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
10.6k
}
u32_normalize
Line
Count
Source
21
29.4k
{
22
29.4k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
29.4k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
29.4k
  UNIT *result;
27
29.4k
  size_t length;
28
29.4k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
29.4k
  #define SORTBUF_PREALLOCATED 64
31
29.4k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
29.4k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
29.4k
  size_t sortbuf_allocated;
34
29.4k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
29.4k
  if (resultbuf == NULL)
38
29.4k
    {
39
29.4k
      result = NULL;
40
29.4k
      allocated = 0;
41
29.4k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
29.4k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
29.4k
  sortbuf = sortbuf_preallocated;
51
29.4k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
29.4k
  sortbuf_count = 0;
53
54
29.4k
  {
55
29.4k
    const UNIT *s_end = s + n;
56
57
29.4k
    for (;;)
58
319k
      {
59
319k
        int count;
60
319k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
319k
        int decomposed_count;
62
319k
        int i;
63
64
319k
        if (s < s_end)
65
290k
          {
66
            /* Fetch the next character.  */
67
290k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
290k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
290k
            {
77
290k
              int curr;
78
79
609k
              for (curr = 0; curr < decomposed_count; )
80
319k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
319k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
319k
                  int curr_decomposed_count;
85
86
319k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
319k
                  if (curr_decomposed_count >= 0)
88
14.5k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
14.5k
                      int shift = curr_decomposed_count - 1;
93
94
14.5k
                      if (shift < 0)
95
0
                        abort ();
96
14.5k
                      if (shift > 0)
97
14.3k
                        {
98
14.3k
                          int j;
99
100
14.3k
                          decomposed_count += shift;
101
14.3k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
20.1k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
5.85k
                            decomposed[j + shift] = decomposed[j];
105
14.3k
                        }
106
43.4k
                      for (; shift >= 0; shift--)
107
28.8k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
14.5k
                    }
109
304k
                  else
110
304k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
304k
                      curr++;
113
304k
                    }
114
319k
                }
115
290k
            }
116
290k
          }
117
29.4k
        else
118
29.4k
          {
119
29.4k
            count = 0;
120
29.4k
            decomposed_count = 0;
121
29.4k
          }
122
123
319k
        i = 0;
124
319k
        for (;;)
125
624k
          {
126
624k
            ucs4_t uc;
127
624k
            int ccc;
128
129
624k
            if (s < s_end)
130
594k
              {
131
                /* Fetch the next character from the decomposition.  */
132
594k
                if (i == decomposed_count)
133
290k
                  break;
134
304k
                uc = decomposed[i];
135
304k
                ccc = uc_combining_class (uc);
136
304k
              }
137
29.4k
            else
138
29.4k
              {
139
                /* End of string reached.  */
140
29.4k
                uc = 0;
141
29.4k
                ccc = 0;
142
29.4k
              }
143
144
333k
            if (ccc == 0)
145
278k
              {
146
278k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
278k
                if (sortbuf_count > 1)
151
11.3k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
11.3k
                                                           sortbuf + sortbuf_count);
153
154
278k
                if (composer != NULL)
155
278k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
278k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
249k
                      {
178
301k
                        for (j = 1; j < sortbuf_count; )
179
52.7k
                          {
180
52.7k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
24.5k
                              {
182
24.5k
                                ucs4_t combined =
183
24.5k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
24.5k
                                if (combined)
185
9.51k
                                  {
186
9.51k
                                    size_t k;
187
188
9.51k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
36.2k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
26.7k
                                      sortbuf[k - 1] = sortbuf[k];
192
9.51k
                                    sortbuf_count--;
193
9.51k
                                    continue;
194
9.51k
                                  }
195
24.5k
                              }
196
43.2k
                            j++;
197
43.2k
                          }
198
249k
                        if (s < s_end && sortbuf_count == 1)
199
213k
                          {
200
213k
                            ucs4_t combined =
201
213k
                              composer (sortbuf[0].code, uc);
202
213k
                            if (combined)
203
3.98k
                              {
204
3.98k
                                uc = combined;
205
3.98k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
3.98k
                                sortbuf_count = 0;
210
3.98k
                              }
211
213k
                          }
212
249k
                      }
213
278k
                  }
214
215
569k
                for (j = 0; j < sortbuf_count; j++)
216
291k
                  {
217
291k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
291k
                    if (length < allocated)
221
261k
                      {
222
261k
                        int ret =
223
261k
                          U_UCTOMB (result + length, muc, allocated - length);
224
261k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
261k
                        if (ret >= 0)
230
261k
                          {
231
261k
                            length += ret;
232
261k
                            goto done_appending;
233
261k
                          }
234
261k
                      }
235
29.9k
                    {
236
29.9k
                      size_t old_allocated = allocated;
237
29.9k
                      size_t new_allocated = 2 * old_allocated;
238
29.9k
                      if (new_allocated < 64)
239
28.8k
                        new_allocated = 64;
240
29.9k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
29.9k
                      {
243
29.9k
                        UNIT *larger_result;
244
29.9k
                        if (result == NULL)
245
28.8k
                          {
246
28.8k
                            larger_result =
247
28.8k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
28.8k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
28.8k
                          }
254
1.06k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
1.06k
                        else
266
1.06k
                          {
267
1.06k
                            larger_result =
268
1.06k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
1.06k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
1.06k
                          }
275
29.9k
                        result = larger_result;
276
29.9k
                        allocated = new_allocated;
277
29.9k
                        {
278
29.9k
                          int ret =
279
29.9k
                            U_UCTOMB (result + length, muc, allocated - length);
280
29.9k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
29.9k
                          if (ret < 0)
286
0
                            abort ();
287
29.9k
                          length += ret;
288
29.9k
                          goto done_appending;
289
29.9k
                        }
290
29.9k
                      }
291
29.9k
                    }
292
291k
                   done_appending: ;
293
291k
                  }
294
295
                /* sortbuf is now empty.  */
296
278k
                sortbuf_count = 0;
297
278k
              }
298
299
333k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
29.4k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
304k
            if (sortbuf_count == sortbuf_allocated)
305
185
              {
306
185
                struct ucs4_with_ccc *new_sortbuf;
307
308
185
                sortbuf_allocated = 2 * sortbuf_allocated;
309
185
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
185
                new_sortbuf =
312
185
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
185
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
185
                memcpy (new_sortbuf, sortbuf,
319
185
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
185
                if (sortbuf != sortbuf_preallocated)
321
185
                  free (sortbuf);
322
185
                sortbuf = new_sortbuf;
323
185
              }
324
304k
            sortbuf[sortbuf_count].code = uc;
325
304k
            sortbuf[sortbuf_count].ccc = ccc;
326
304k
            sortbuf_count++;
327
328
304k
            i++;
329
304k
          }
330
331
319k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
29.4k
          break;
334
335
290k
        s += count;
336
290k
      }
337
29.4k
  }
338
339
29.4k
  if (length == 0)
340
570
    {
341
570
      if (result == NULL)
342
570
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
570
          result = (UNIT *) malloc (1);
345
570
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
570
        }
351
570
    }
352
28.8k
  else if (result != resultbuf && length < allocated)
353
28.7k
    {
354
      /* Shrink the allocated memory if possible.  */
355
28.7k
      UNIT *memory;
356
357
28.7k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
28.7k
      if (memory != NULL)
359
28.7k
        result = memory;
360
28.7k
    }
361
362
29.4k
  if (sortbuf_count > 0)
363
0
    abort ();
364
29.4k
  if (sortbuf != sortbuf_preallocated)
365
29.4k
    free (sortbuf);
366
367
29.4k
  *lengthp = length;
368
29.4k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
29.4k
}