Coverage Report

Created: 2025-11-16 06:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2025 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
38.3k
{
22
38.3k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
38.3k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
38.3k
  UNIT *result;
27
38.3k
  size_t length;
28
38.3k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
38.3k
  #define SORTBUF_PREALLOCATED 64
31
38.3k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
38.3k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
38.3k
  size_t sortbuf_allocated;
34
38.3k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
38.3k
  if (resultbuf == NULL)
38
38.3k
    {
39
38.3k
      result = NULL;
40
38.3k
      allocated = 0;
41
38.3k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
38.3k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
38.3k
  sortbuf = sortbuf_preallocated;
51
38.3k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
38.3k
  sortbuf_count = 0;
53
54
38.3k
  {
55
38.3k
    const UNIT *s_end = s + n;
56
57
38.3k
    for (;;)
58
371k
      {
59
371k
        int count;
60
371k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
371k
        int decomposed_count;
62
371k
        int i;
63
64
371k
        if (s < s_end)
65
332k
          {
66
            /* Fetch the next character.  */
67
332k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
332k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
332k
            {
77
332k
              int curr;
78
79
693k
              for (curr = 0; curr < decomposed_count; )
80
361k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
361k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
361k
                  int curr_decomposed_count;
85
86
361k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
361k
                  if (curr_decomposed_count >= 0)
88
14.2k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
14.2k
                      int shift = curr_decomposed_count - 1;
93
94
14.2k
                      if (shift < 0)
95
0
                        abort ();
96
14.2k
                      if (shift > 0)
97
14.0k
                        {
98
14.0k
                          int j;
99
100
14.0k
                          decomposed_count += shift;
101
14.0k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
19.5k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
5.50k
                            decomposed[j + shift] = decomposed[j];
105
14.0k
                        }
106
42.4k
                      for (; shift >= 0; shift--)
107
28.2k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
14.2k
                    }
109
346k
                  else
110
346k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
346k
                      curr++;
113
346k
                    }
114
361k
                }
115
332k
            }
116
332k
          }
117
38.3k
        else
118
38.3k
          {
119
38.3k
            count = 0;
120
38.3k
            decomposed_count = 0;
121
38.3k
          }
122
123
371k
        i = 0;
124
371k
        for (;;)
125
718k
          {
126
718k
            ucs4_t uc;
127
718k
            int ccc;
128
129
718k
            if (s < s_end)
130
679k
              {
131
                /* Fetch the next character from the decomposition.  */
132
679k
                if (i == decomposed_count)
133
332k
                  break;
134
346k
                uc = decomposed[i];
135
346k
                ccc = uc_combining_class (uc);
136
346k
              }
137
38.3k
            else
138
38.3k
              {
139
                /* End of string reached.  */
140
38.3k
                uc = 0;
141
38.3k
                ccc = 0;
142
38.3k
              }
143
144
385k
            if (ccc == 0)
145
329k
              {
146
329k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
329k
                if (sortbuf_count > 1)
151
11.8k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
11.8k
                                                           sortbuf + sortbuf_count);
153
154
329k
                if (composer != NULL)
155
329k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
329k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
290k
                      {
178
343k
                        for (j = 1; j < sortbuf_count; )
179
53.3k
                          {
180
53.3k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
25.1k
                              {
182
25.1k
                                ucs4_t combined =
183
25.1k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
25.1k
                                if (combined)
185
9.96k
                                  {
186
9.96k
                                    size_t k;
187
188
9.96k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
36.5k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
26.5k
                                      sortbuf[k - 1] = sortbuf[k];
192
9.96k
                                    sortbuf_count--;
193
9.96k
                                    continue;
194
9.96k
                                  }
195
25.1k
                              }
196
43.3k
                            j++;
197
43.3k
                          }
198
290k
                        if (s < s_end && sortbuf_count == 1)
199
246k
                          {
200
246k
                            ucs4_t combined =
201
246k
                              composer (sortbuf[0].code, uc);
202
246k
                            if (combined)
203
3.17k
                              {
204
3.17k
                                uc = combined;
205
3.17k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
3.17k
                                sortbuf_count = 0;
210
3.17k
                              }
211
246k
                          }
212
290k
                      }
213
329k
                  }
214
215
662k
                for (j = 0; j < sortbuf_count; j++)
216
333k
                  {
217
333k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
333k
                    if (length < allocated)
221
294k
                      {
222
294k
                        int ret =
223
294k
                          U_UCTOMB (result + length, muc, allocated - length);
224
294k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
294k
                        if (ret >= 0)
230
294k
                          {
231
294k
                            length += ret;
232
294k
                            goto done_appending;
233
294k
                          }
234
294k
                      }
235
38.8k
                    {
236
38.8k
                      size_t old_allocated = allocated;
237
38.8k
                      size_t new_allocated = 2 * old_allocated;
238
38.8k
                      if (new_allocated < 64)
239
37.8k
                        new_allocated = 64;
240
38.8k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
38.8k
                      {
243
38.8k
                        UNIT *larger_result;
244
38.8k
                        if (result == NULL)
245
37.8k
                          {
246
37.8k
                            larger_result =
247
37.8k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
37.8k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
37.8k
                          }
254
1.09k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
1.09k
                        else
266
1.09k
                          {
267
1.09k
                            larger_result =
268
1.09k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
1.09k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
1.09k
                          }
275
38.8k
                        result = larger_result;
276
38.8k
                        allocated = new_allocated;
277
38.8k
                        {
278
38.8k
                          int ret =
279
38.8k
                            U_UCTOMB (result + length, muc, allocated - length);
280
38.8k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
38.8k
                          if (ret < 0)
286
0
                            abort ();
287
38.8k
                          length += ret;
288
38.8k
                          goto done_appending;
289
38.8k
                        }
290
38.8k
                      }
291
38.8k
                    }
292
333k
                   done_appending: ;
293
333k
                  }
294
295
                /* sortbuf is now empty.  */
296
329k
                sortbuf_count = 0;
297
329k
              }
298
299
385k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
38.3k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
346k
            if (sortbuf_count == sortbuf_allocated)
305
179
              {
306
179
                struct ucs4_with_ccc *new_sortbuf;
307
308
179
                sortbuf_allocated = 2 * sortbuf_allocated;
309
179
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
179
                new_sortbuf =
312
179
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
179
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
179
                memcpy (new_sortbuf, sortbuf,
319
179
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
179
                if (sortbuf != sortbuf_preallocated)
321
179
                  free (sortbuf);
322
179
                sortbuf = new_sortbuf;
323
179
              }
324
346k
            sortbuf[sortbuf_count].code = uc;
325
346k
            sortbuf[sortbuf_count].ccc = ccc;
326
346k
            sortbuf_count++;
327
328
346k
            i++;
329
346k
          }
330
331
371k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
38.3k
          break;
334
335
332k
        s += count;
336
332k
      }
337
38.3k
  }
338
339
38.3k
  if (length == 0)
340
584
    {
341
584
      if (result == NULL)
342
584
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
584
          result = (UNIT *) malloc (1);
345
584
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
584
        }
351
584
    }
352
37.8k
  else if (result != resultbuf && length < allocated)
353
37.7k
    {
354
      /* Shrink the allocated memory if possible.  */
355
37.7k
      UNIT *memory;
356
357
37.7k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
37.7k
      if (memory != NULL)
359
37.7k
        result = memory;
360
37.7k
    }
361
362
38.3k
  if (sortbuf_count > 0)
363
0
    abort ();
364
38.3k
  if (sortbuf != sortbuf_preallocated)
365
38.3k
    free (sortbuf);
366
367
38.3k
  *lengthp = length;
368
38.3k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
38.3k
}
u16_normalize
Line
Count
Source
21
10.1k
{
22
10.1k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
10.1k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
10.1k
  UNIT *result;
27
10.1k
  size_t length;
28
10.1k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
10.1k
  #define SORTBUF_PREALLOCATED 64
31
10.1k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
10.1k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
10.1k
  size_t sortbuf_allocated;
34
10.1k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
10.1k
  if (resultbuf == NULL)
38
10.1k
    {
39
10.1k
      result = NULL;
40
10.1k
      allocated = 0;
41
10.1k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
10.1k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
10.1k
  sortbuf = sortbuf_preallocated;
51
10.1k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
10.1k
  sortbuf_count = 0;
53
54
10.1k
  {
55
10.1k
    const UNIT *s_end = s + n;
56
57
10.1k
    for (;;)
58
56.7k
      {
59
56.7k
        int count;
60
56.7k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
56.7k
        int decomposed_count;
62
56.7k
        int i;
63
64
56.7k
        if (s < s_end)
65
46.5k
          {
66
            /* Fetch the next character.  */
67
46.5k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
46.5k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
46.5k
            {
77
46.5k
              int curr;
78
79
93.1k
              for (curr = 0; curr < decomposed_count; )
80
46.5k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
46.5k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
46.5k
                  int curr_decomposed_count;
85
86
46.5k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
46.5k
                  if (curr_decomposed_count >= 0)
88
0
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
0
                      int shift = curr_decomposed_count - 1;
93
94
0
                      if (shift < 0)
95
0
                        abort ();
96
0
                      if (shift > 0)
97
0
                        {
98
0
                          int j;
99
100
0
                          decomposed_count += shift;
101
0
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
0
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
0
                            decomposed[j + shift] = decomposed[j];
105
0
                        }
106
0
                      for (; shift >= 0; shift--)
107
0
                        decomposed[curr + shift] = curr_decomposed[shift];
108
0
                    }
109
46.5k
                  else
110
46.5k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
46.5k
                      curr++;
113
46.5k
                    }
114
46.5k
                }
115
46.5k
            }
116
46.5k
          }
117
10.1k
        else
118
10.1k
          {
119
10.1k
            count = 0;
120
10.1k
            decomposed_count = 0;
121
10.1k
          }
122
123
56.7k
        i = 0;
124
56.7k
        for (;;)
125
103k
          {
126
103k
            ucs4_t uc;
127
103k
            int ccc;
128
129
103k
            if (s < s_end)
130
93.1k
              {
131
                /* Fetch the next character from the decomposition.  */
132
93.1k
                if (i == decomposed_count)
133
46.5k
                  break;
134
46.5k
                uc = decomposed[i];
135
46.5k
                ccc = uc_combining_class (uc);
136
46.5k
              }
137
10.1k
            else
138
10.1k
              {
139
                /* End of string reached.  */
140
10.1k
                uc = 0;
141
10.1k
                ccc = 0;
142
10.1k
              }
143
144
56.7k
            if (ccc == 0)
145
56.7k
              {
146
56.7k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
56.7k
                if (sortbuf_count > 1)
151
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
0
                                                           sortbuf + sortbuf_count);
153
154
56.7k
                if (composer != NULL)
155
56.7k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
56.7k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
46.5k
                      {
178
46.5k
                        for (j = 1; j < sortbuf_count; )
179
0
                          {
180
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
0
                              {
182
0
                                ucs4_t combined =
183
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
0
                                if (combined)
185
0
                                  {
186
0
                                    size_t k;
187
188
0
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
0
                                    for (k = j + 1; k < sortbuf_count; k++)
191
0
                                      sortbuf[k - 1] = sortbuf[k];
192
0
                                    sortbuf_count--;
193
0
                                    continue;
194
0
                                  }
195
0
                              }
196
0
                            j++;
197
0
                          }
198
46.5k
                        if (s < s_end && sortbuf_count == 1)
199
36.4k
                          {
200
36.4k
                            ucs4_t combined =
201
36.4k
                              composer (sortbuf[0].code, uc);
202
36.4k
                            if (combined)
203
0
                              {
204
0
                                uc = combined;
205
0
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
0
                                sortbuf_count = 0;
210
0
                              }
211
36.4k
                          }
212
46.5k
                      }
213
56.7k
                  }
214
215
103k
                for (j = 0; j < sortbuf_count; j++)
216
46.5k
                  {
217
46.5k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
46.5k
                    if (length < allocated)
221
36.4k
                      {
222
36.4k
                        int ret =
223
36.4k
                          U_UCTOMB (result + length, muc, allocated - length);
224
36.4k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
36.4k
                        if (ret >= 0)
230
36.4k
                          {
231
36.4k
                            length += ret;
232
36.4k
                            goto done_appending;
233
36.4k
                          }
234
36.4k
                      }
235
10.1k
                    {
236
10.1k
                      size_t old_allocated = allocated;
237
10.1k
                      size_t new_allocated = 2 * old_allocated;
238
10.1k
                      if (new_allocated < 64)
239
10.1k
                        new_allocated = 64;
240
10.1k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
10.1k
                      {
243
10.1k
                        UNIT *larger_result;
244
10.1k
                        if (result == NULL)
245
10.1k
                          {
246
10.1k
                            larger_result =
247
10.1k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
10.1k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
10.1k
                          }
254
0
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
0
                        else
266
0
                          {
267
0
                            larger_result =
268
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
0
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
0
                          }
275
10.1k
                        result = larger_result;
276
10.1k
                        allocated = new_allocated;
277
10.1k
                        {
278
10.1k
                          int ret =
279
10.1k
                            U_UCTOMB (result + length, muc, allocated - length);
280
10.1k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
10.1k
                          if (ret < 0)
286
0
                            abort ();
287
10.1k
                          length += ret;
288
10.1k
                          goto done_appending;
289
10.1k
                        }
290
10.1k
                      }
291
10.1k
                    }
292
46.5k
                   done_appending: ;
293
46.5k
                  }
294
295
                /* sortbuf is now empty.  */
296
56.7k
                sortbuf_count = 0;
297
56.7k
              }
298
299
56.7k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
10.1k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
46.5k
            if (sortbuf_count == sortbuf_allocated)
305
0
              {
306
0
                struct ucs4_with_ccc *new_sortbuf;
307
308
0
                sortbuf_allocated = 2 * sortbuf_allocated;
309
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
0
                new_sortbuf =
312
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
0
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
0
                memcpy (new_sortbuf, sortbuf,
319
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
0
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
0
                sortbuf = new_sortbuf;
323
0
              }
324
46.5k
            sortbuf[sortbuf_count].code = uc;
325
46.5k
            sortbuf[sortbuf_count].ccc = ccc;
326
46.5k
            sortbuf_count++;
327
328
46.5k
            i++;
329
46.5k
          }
330
331
56.7k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
10.1k
          break;
334
335
46.5k
        s += count;
336
46.5k
      }
337
10.1k
  }
338
339
10.1k
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
10.1k
  else if (result != resultbuf && length < allocated)
353
10.1k
    {
354
      /* Shrink the allocated memory if possible.  */
355
10.1k
      UNIT *memory;
356
357
10.1k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
10.1k
      if (memory != NULL)
359
10.1k
        result = memory;
360
10.1k
    }
361
362
10.1k
  if (sortbuf_count > 0)
363
0
    abort ();
364
10.1k
  if (sortbuf != sortbuf_preallocated)
365
10.1k
    free (sortbuf);
366
367
10.1k
  *lengthp = length;
368
10.1k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
10.1k
}
u32_normalize
Line
Count
Source
21
28.2k
{
22
28.2k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
28.2k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
28.2k
  UNIT *result;
27
28.2k
  size_t length;
28
28.2k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
28.2k
  #define SORTBUF_PREALLOCATED 64
31
28.2k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
28.2k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
28.2k
  size_t sortbuf_allocated;
34
28.2k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
28.2k
  if (resultbuf == NULL)
38
28.2k
    {
39
28.2k
      result = NULL;
40
28.2k
      allocated = 0;
41
28.2k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
28.2k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
28.2k
  sortbuf = sortbuf_preallocated;
51
28.2k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
28.2k
  sortbuf_count = 0;
53
54
28.2k
  {
55
28.2k
    const UNIT *s_end = s + n;
56
57
28.2k
    for (;;)
58
314k
      {
59
314k
        int count;
60
314k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
314k
        int decomposed_count;
62
314k
        int i;
63
64
314k
        if (s < s_end)
65
286k
          {
66
            /* Fetch the next character.  */
67
286k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
286k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
286k
            {
77
286k
              int curr;
78
79
600k
              for (curr = 0; curr < decomposed_count; )
80
314k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
314k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
314k
                  int curr_decomposed_count;
85
86
314k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
314k
                  if (curr_decomposed_count >= 0)
88
14.2k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
14.2k
                      int shift = curr_decomposed_count - 1;
93
94
14.2k
                      if (shift < 0)
95
0
                        abort ();
96
14.2k
                      if (shift > 0)
97
14.0k
                        {
98
14.0k
                          int j;
99
100
14.0k
                          decomposed_count += shift;
101
14.0k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
19.5k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
5.50k
                            decomposed[j + shift] = decomposed[j];
105
14.0k
                        }
106
42.4k
                      for (; shift >= 0; shift--)
107
28.2k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
14.2k
                    }
109
300k
                  else
110
300k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
300k
                      curr++;
113
300k
                    }
114
314k
                }
115
286k
            }
116
286k
          }
117
28.2k
        else
118
28.2k
          {
119
28.2k
            count = 0;
120
28.2k
            decomposed_count = 0;
121
28.2k
          }
122
123
314k
        i = 0;
124
314k
        for (;;)
125
614k
          {
126
614k
            ucs4_t uc;
127
614k
            int ccc;
128
129
614k
            if (s < s_end)
130
586k
              {
131
                /* Fetch the next character from the decomposition.  */
132
586k
                if (i == decomposed_count)
133
286k
                  break;
134
300k
                uc = decomposed[i];
135
300k
                ccc = uc_combining_class (uc);
136
300k
              }
137
28.2k
            else
138
28.2k
              {
139
                /* End of string reached.  */
140
28.2k
                uc = 0;
141
28.2k
                ccc = 0;
142
28.2k
              }
143
144
328k
            if (ccc == 0)
145
272k
              {
146
272k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
272k
                if (sortbuf_count > 1)
151
11.8k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
11.8k
                                                           sortbuf + sortbuf_count);
153
154
272k
                if (composer != NULL)
155
272k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
272k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
244k
                      {
178
297k
                        for (j = 1; j < sortbuf_count; )
179
53.3k
                          {
180
53.3k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
25.1k
                              {
182
25.1k
                                ucs4_t combined =
183
25.1k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
25.1k
                                if (combined)
185
9.96k
                                  {
186
9.96k
                                    size_t k;
187
188
9.96k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
36.5k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
26.5k
                                      sortbuf[k - 1] = sortbuf[k];
192
9.96k
                                    sortbuf_count--;
193
9.96k
                                    continue;
194
9.96k
                                  }
195
25.1k
                              }
196
43.3k
                            j++;
197
43.3k
                          }
198
244k
                        if (s < s_end && sortbuf_count == 1)
199
209k
                          {
200
209k
                            ucs4_t combined =
201
209k
                              composer (sortbuf[0].code, uc);
202
209k
                            if (combined)
203
3.17k
                              {
204
3.17k
                                uc = combined;
205
3.17k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
3.17k
                                sortbuf_count = 0;
210
3.17k
                              }
211
209k
                          }
212
244k
                      }
213
272k
                  }
214
215
559k
                for (j = 0; j < sortbuf_count; j++)
216
287k
                  {
217
287k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
287k
                    if (length < allocated)
221
258k
                      {
222
258k
                        int ret =
223
258k
                          U_UCTOMB (result + length, muc, allocated - length);
224
258k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
258k
                        if (ret >= 0)
230
258k
                          {
231
258k
                            length += ret;
232
258k
                            goto done_appending;
233
258k
                          }
234
258k
                      }
235
28.7k
                    {
236
28.7k
                      size_t old_allocated = allocated;
237
28.7k
                      size_t new_allocated = 2 * old_allocated;
238
28.7k
                      if (new_allocated < 64)
239
27.6k
                        new_allocated = 64;
240
28.7k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
28.7k
                      {
243
28.7k
                        UNIT *larger_result;
244
28.7k
                        if (result == NULL)
245
27.6k
                          {
246
27.6k
                            larger_result =
247
27.6k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
27.6k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
27.6k
                          }
254
1.09k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
1.09k
                        else
266
1.09k
                          {
267
1.09k
                            larger_result =
268
1.09k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
1.09k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
1.09k
                          }
275
28.7k
                        result = larger_result;
276
28.7k
                        allocated = new_allocated;
277
28.7k
                        {
278
28.7k
                          int ret =
279
28.7k
                            U_UCTOMB (result + length, muc, allocated - length);
280
28.7k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
28.7k
                          if (ret < 0)
286
0
                            abort ();
287
28.7k
                          length += ret;
288
28.7k
                          goto done_appending;
289
28.7k
                        }
290
28.7k
                      }
291
28.7k
                    }
292
287k
                   done_appending: ;
293
287k
                  }
294
295
                /* sortbuf is now empty.  */
296
272k
                sortbuf_count = 0;
297
272k
              }
298
299
328k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
28.2k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
300k
            if (sortbuf_count == sortbuf_allocated)
305
179
              {
306
179
                struct ucs4_with_ccc *new_sortbuf;
307
308
179
                sortbuf_allocated = 2 * sortbuf_allocated;
309
179
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
179
                new_sortbuf =
312
179
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
179
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
179
                memcpy (new_sortbuf, sortbuf,
319
179
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
179
                if (sortbuf != sortbuf_preallocated)
321
179
                  free (sortbuf);
322
179
                sortbuf = new_sortbuf;
323
179
              }
324
300k
            sortbuf[sortbuf_count].code = uc;
325
300k
            sortbuf[sortbuf_count].ccc = ccc;
326
300k
            sortbuf_count++;
327
328
300k
            i++;
329
300k
          }
330
331
314k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
28.2k
          break;
334
335
286k
        s += count;
336
286k
      }
337
28.2k
  }
338
339
28.2k
  if (length == 0)
340
584
    {
341
584
      if (result == NULL)
342
584
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
584
          result = (UNIT *) malloc (1);
345
584
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
584
        }
351
584
    }
352
27.6k
  else if (result != resultbuf && length < allocated)
353
27.5k
    {
354
      /* Shrink the allocated memory if possible.  */
355
27.5k
      UNIT *memory;
356
357
27.5k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
27.5k
      if (memory != NULL)
359
27.5k
        result = memory;
360
27.5k
    }
361
362
28.2k
  if (sortbuf_count > 0)
363
0
    abort ();
364
28.2k
  if (sortbuf != sortbuf_preallocated)
365
28.2k
    free (sortbuf);
366
367
28.2k
  *lengthp = length;
368
28.2k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
  return NULL;
380
28.2k
}