Coverage Report

Created: 2025-07-23 06:43

/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source (jump to first uncovered line)
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2024 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
40.3k
{
22
40.3k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
40.3k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
40.3k
  UNIT *result;
27
40.3k
  size_t length;
28
40.3k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
40.3k
  #define SORTBUF_PREALLOCATED 64
31
40.3k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
40.3k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
40.3k
  size_t sortbuf_allocated;
34
40.3k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
40.3k
  if (resultbuf == NULL)
38
40.3k
    {
39
40.3k
      result = NULL;
40
40.3k
      allocated = 0;
41
40.3k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
40.3k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
40.3k
  sortbuf = sortbuf_preallocated;
51
40.3k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
40.3k
  sortbuf_count = 0;
53
54
40.3k
  {
55
40.3k
    const UNIT *s_end = s + n;
56
57
40.3k
    for (;;)
58
378k
      {
59
378k
        int count;
60
378k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
378k
        int decomposed_count;
62
378k
        int i;
63
64
378k
        if (s < s_end)
65
338k
          {
66
            /* Fetch the next character.  */
67
338k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
338k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
338k
            {
77
338k
              int curr;
78
79
710k
              for (curr = 0; curr < decomposed_count; )
80
372k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
372k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
372k
                  int curr_decomposed_count;
85
86
372k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
372k
                  if (curr_decomposed_count >= 0)
88
17.4k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
17.4k
                      int shift = curr_decomposed_count - 1;
93
94
17.4k
                      if (shift < 0)
95
0
                        abort ();
96
17.4k
                      if (shift > 0)
97
17.2k
                        {
98
17.2k
                          int j;
99
100
17.2k
                          decomposed_count += shift;
101
17.2k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
22.8k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
5.63k
                            decomposed[j + shift] = decomposed[j];
105
17.2k
                        }
106
52.1k
                      for (; shift >= 0; shift--)
107
34.6k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
17.4k
                    }
109
355k
                  else
110
355k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
355k
                      curr++;
113
355k
                    }
114
372k
                }
115
338k
            }
116
338k
          }
117
40.3k
        else
118
40.3k
          {
119
40.3k
            count = 0;
120
40.3k
            decomposed_count = 0;
121
40.3k
          }
122
123
378k
        i = 0;
124
378k
        for (;;)
125
733k
          {
126
733k
            ucs4_t uc;
127
733k
            int ccc;
128
129
733k
            if (s < s_end)
130
693k
              {
131
                /* Fetch the next character from the decomposition.  */
132
693k
                if (i == decomposed_count)
133
338k
                  break;
134
355k
                uc = decomposed[i];
135
355k
                ccc = uc_combining_class (uc);
136
355k
              }
137
40.3k
            else
138
40.3k
              {
139
                /* End of string reached.  */
140
40.3k
                uc = 0;
141
40.3k
                ccc = 0;
142
40.3k
              }
143
144
395k
            if (ccc == 0)
145
338k
              {
146
338k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
338k
                if (sortbuf_count > 1)
151
14.8k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
14.8k
                                                           sortbuf + sortbuf_count);
153
154
338k
                if (composer != NULL)
155
338k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
338k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
298k
                      {
178
352k
                        for (j = 1; j < sortbuf_count; )
179
54.4k
                          {
180
54.4k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
27.6k
                              {
182
27.6k
                                ucs4_t combined =
183
27.6k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
27.6k
                                if (combined)
185
13.5k
                                  {
186
13.5k
                                    size_t k;
187
188
13.5k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
39.6k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
26.0k
                                      sortbuf[k - 1] = sortbuf[k];
192
13.5k
                                    sortbuf_count--;
193
13.5k
                                    continue;
194
13.5k
                                  }
195
27.6k
                              }
196
40.8k
                            j++;
197
40.8k
                          }
198
298k
                        if (s < s_end && sortbuf_count == 1)
199
251k
                          {
200
251k
                            ucs4_t combined =
201
251k
                              composer (sortbuf[0].code, uc);
202
251k
                            if (combined)
203
2.86k
                              {
204
2.86k
                                uc = combined;
205
2.86k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
2.86k
                                sortbuf_count = 0;
210
2.86k
                              }
211
251k
                          }
212
298k
                      }
213
338k
                  }
214
215
677k
                for (j = 0; j < sortbuf_count; j++)
216
338k
                  {
217
338k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
338k
                    if (length < allocated)
221
298k
                      {
222
298k
                        int ret =
223
298k
                          U_UCTOMB (result + length, muc, allocated - length);
224
298k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
298k
                        if (ret >= 0)
230
298k
                          {
231
298k
                            length += ret;
232
298k
                            goto done_appending;
233
298k
                          }
234
298k
                      }
235
40.8k
                    {
236
40.8k
                      size_t old_allocated = allocated;
237
40.8k
                      size_t new_allocated = 2 * old_allocated;
238
40.8k
                      if (new_allocated < 64)
239
39.7k
                        new_allocated = 64;
240
40.8k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
40.8k
                      {
243
40.8k
                        UNIT *larger_result;
244
40.8k
                        if (result == NULL)
245
39.7k
                          {
246
39.7k
                            larger_result =
247
39.7k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
39.7k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
39.7k
                          }
254
1.07k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
1.07k
                        else
266
1.07k
                          {
267
1.07k
                            larger_result =
268
1.07k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
1.07k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
1.07k
                          }
275
40.8k
                        result = larger_result;
276
40.8k
                        allocated = new_allocated;
277
40.8k
                        {
278
40.8k
                          int ret =
279
40.8k
                            U_UCTOMB (result + length, muc, allocated - length);
280
40.8k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
40.8k
                          if (ret < 0)
286
0
                            abort ();
287
40.8k
                          length += ret;
288
40.8k
                          goto done_appending;
289
40.8k
                        }
290
40.8k
                      }
291
40.8k
                    }
292
338k
                   done_appending: ;
293
338k
                  }
294
295
                /* sortbuf is now empty.  */
296
338k
                sortbuf_count = 0;
297
338k
              }
298
299
395k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
40.3k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
355k
            if (sortbuf_count == sortbuf_allocated)
305
159
              {
306
159
                struct ucs4_with_ccc *new_sortbuf;
307
308
159
                sortbuf_allocated = 2 * sortbuf_allocated;
309
159
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
159
                new_sortbuf =
312
159
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
159
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
159
                memcpy (new_sortbuf, sortbuf,
319
159
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
159
                if (sortbuf != sortbuf_preallocated)
321
55
                  free (sortbuf);
322
159
                sortbuf = new_sortbuf;
323
159
              }
324
355k
            sortbuf[sortbuf_count].code = uc;
325
355k
            sortbuf[sortbuf_count].ccc = ccc;
326
355k
            sortbuf_count++;
327
328
355k
            i++;
329
355k
          }
330
331
378k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
40.3k
          break;
334
335
338k
        s += count;
336
338k
      }
337
40.3k
  }
338
339
40.3k
  if (length == 0)
340
599
    {
341
599
      if (result == NULL)
342
599
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
599
          result = (UNIT *) malloc (1);
345
599
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
599
        }
351
599
    }
352
39.7k
  else if (result != resultbuf && length < allocated)
353
39.6k
    {
354
      /* Shrink the allocated memory if possible.  */
355
39.6k
      UNIT *memory;
356
357
39.6k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
39.6k
      if (memory != NULL)
359
39.6k
        result = memory;
360
39.6k
    }
361
362
40.3k
  if (sortbuf_count > 0)
363
0
    abort ();
364
40.3k
  if (sortbuf != sortbuf_preallocated)
365
104
    free (sortbuf);
366
367
40.3k
  *lengthp = length;
368
40.3k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
40.3k
}
u16_normalize
Line
Count
Source
21
11.2k
{
22
11.2k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
11.2k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
11.2k
  UNIT *result;
27
11.2k
  size_t length;
28
11.2k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
11.2k
  #define SORTBUF_PREALLOCATED 64
31
11.2k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
11.2k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
11.2k
  size_t sortbuf_allocated;
34
11.2k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
11.2k
  if (resultbuf == NULL)
38
11.2k
    {
39
11.2k
      result = NULL;
40
11.2k
      allocated = 0;
41
11.2k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
11.2k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
11.2k
  sortbuf = sortbuf_preallocated;
51
11.2k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
11.2k
  sortbuf_count = 0;
53
54
11.2k
  {
55
11.2k
    const UNIT *s_end = s + n;
56
57
11.2k
    for (;;)
58
61.8k
      {
59
61.8k
        int count;
60
61.8k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
61.8k
        int decomposed_count;
62
61.8k
        int i;
63
64
61.8k
        if (s < s_end)
65
50.6k
          {
66
            /* Fetch the next character.  */
67
50.6k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
50.6k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
50.6k
            {
77
50.6k
              int curr;
78
79
101k
              for (curr = 0; curr < decomposed_count; )
80
50.6k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
50.6k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
50.6k
                  int curr_decomposed_count;
85
86
50.6k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
50.6k
                  if (curr_decomposed_count >= 0)
88
0
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
0
                      int shift = curr_decomposed_count - 1;
93
94
0
                      if (shift < 0)
95
0
                        abort ();
96
0
                      if (shift > 0)
97
0
                        {
98
0
                          int j;
99
100
0
                          decomposed_count += shift;
101
0
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
0
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
0
                            decomposed[j + shift] = decomposed[j];
105
0
                        }
106
0
                      for (; shift >= 0; shift--)
107
0
                        decomposed[curr + shift] = curr_decomposed[shift];
108
0
                    }
109
50.6k
                  else
110
50.6k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
50.6k
                      curr++;
113
50.6k
                    }
114
50.6k
                }
115
50.6k
            }
116
50.6k
          }
117
11.2k
        else
118
11.2k
          {
119
11.2k
            count = 0;
120
11.2k
            decomposed_count = 0;
121
11.2k
          }
122
123
61.8k
        i = 0;
124
61.8k
        for (;;)
125
112k
          {
126
112k
            ucs4_t uc;
127
112k
            int ccc;
128
129
112k
            if (s < s_end)
130
101k
              {
131
                /* Fetch the next character from the decomposition.  */
132
101k
                if (i == decomposed_count)
133
50.6k
                  break;
134
50.6k
                uc = decomposed[i];
135
50.6k
                ccc = uc_combining_class (uc);
136
50.6k
              }
137
11.2k
            else
138
11.2k
              {
139
                /* End of string reached.  */
140
11.2k
                uc = 0;
141
11.2k
                ccc = 0;
142
11.2k
              }
143
144
61.8k
            if (ccc == 0)
145
61.8k
              {
146
61.8k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
61.8k
                if (sortbuf_count > 1)
151
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
0
                                                           sortbuf + sortbuf_count);
153
154
61.8k
                if (composer != NULL)
155
61.8k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
61.8k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
50.6k
                      {
178
50.6k
                        for (j = 1; j < sortbuf_count; )
179
0
                          {
180
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
0
                              {
182
0
                                ucs4_t combined =
183
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
0
                                if (combined)
185
0
                                  {
186
0
                                    size_t k;
187
188
0
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
0
                                    for (k = j + 1; k < sortbuf_count; k++)
191
0
                                      sortbuf[k - 1] = sortbuf[k];
192
0
                                    sortbuf_count--;
193
0
                                    continue;
194
0
                                  }
195
0
                              }
196
0
                            j++;
197
0
                          }
198
50.6k
                        if (s < s_end && sortbuf_count == 1)
199
39.4k
                          {
200
39.4k
                            ucs4_t combined =
201
39.4k
                              composer (sortbuf[0].code, uc);
202
39.4k
                            if (combined)
203
0
                              {
204
0
                                uc = combined;
205
0
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
0
                                sortbuf_count = 0;
210
0
                              }
211
39.4k
                          }
212
50.6k
                      }
213
61.8k
                  }
214
215
112k
                for (j = 0; j < sortbuf_count; j++)
216
50.6k
                  {
217
50.6k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
50.6k
                    if (length < allocated)
221
39.4k
                      {
222
39.4k
                        int ret =
223
39.4k
                          U_UCTOMB (result + length, muc, allocated - length);
224
39.4k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
39.4k
                        if (ret >= 0)
230
39.4k
                          {
231
39.4k
                            length += ret;
232
39.4k
                            goto done_appending;
233
39.4k
                          }
234
39.4k
                      }
235
11.2k
                    {
236
11.2k
                      size_t old_allocated = allocated;
237
11.2k
                      size_t new_allocated = 2 * old_allocated;
238
11.2k
                      if (new_allocated < 64)
239
11.2k
                        new_allocated = 64;
240
11.2k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
11.2k
                      {
243
11.2k
                        UNIT *larger_result;
244
11.2k
                        if (result == NULL)
245
11.2k
                          {
246
11.2k
                            larger_result =
247
11.2k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
11.2k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
11.2k
                          }
254
0
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
0
                        else
266
0
                          {
267
0
                            larger_result =
268
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
0
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
0
                          }
275
11.2k
                        result = larger_result;
276
11.2k
                        allocated = new_allocated;
277
11.2k
                        {
278
11.2k
                          int ret =
279
11.2k
                            U_UCTOMB (result + length, muc, allocated - length);
280
11.2k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
11.2k
                          if (ret < 0)
286
0
                            abort ();
287
11.2k
                          length += ret;
288
11.2k
                          goto done_appending;
289
11.2k
                        }
290
11.2k
                      }
291
11.2k
                    }
292
50.6k
                   done_appending: ;
293
50.6k
                  }
294
295
                /* sortbuf is now empty.  */
296
61.8k
                sortbuf_count = 0;
297
61.8k
              }
298
299
61.8k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
11.2k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
50.6k
            if (sortbuf_count == sortbuf_allocated)
305
0
              {
306
0
                struct ucs4_with_ccc *new_sortbuf;
307
308
0
                sortbuf_allocated = 2 * sortbuf_allocated;
309
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
0
                new_sortbuf =
312
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
0
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
0
                memcpy (new_sortbuf, sortbuf,
319
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
0
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
0
                sortbuf = new_sortbuf;
323
0
              }
324
50.6k
            sortbuf[sortbuf_count].code = uc;
325
50.6k
            sortbuf[sortbuf_count].ccc = ccc;
326
50.6k
            sortbuf_count++;
327
328
50.6k
            i++;
329
50.6k
          }
330
331
61.8k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
11.2k
          break;
334
335
50.6k
        s += count;
336
50.6k
      }
337
11.2k
  }
338
339
11.2k
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
11.2k
  else if (result != resultbuf && length < allocated)
353
11.2k
    {
354
      /* Shrink the allocated memory if possible.  */
355
11.2k
      UNIT *memory;
356
357
11.2k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
11.2k
      if (memory != NULL)
359
11.2k
        result = memory;
360
11.2k
    }
361
362
11.2k
  if (sortbuf_count > 0)
363
0
    abort ();
364
11.2k
  if (sortbuf != sortbuf_preallocated)
365
0
    free (sortbuf);
366
367
11.2k
  *lengthp = length;
368
11.2k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
11.2k
}
u32_normalize
Line
Count
Source
21
29.1k
{
22
29.1k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
29.1k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
29.1k
  UNIT *result;
27
29.1k
  size_t length;
28
29.1k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
29.1k
  #define SORTBUF_PREALLOCATED 64
31
29.1k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
29.1k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
29.1k
  size_t sortbuf_allocated;
34
29.1k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
29.1k
  if (resultbuf == NULL)
38
29.1k
    {
39
29.1k
      result = NULL;
40
29.1k
      allocated = 0;
41
29.1k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
29.1k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
29.1k
  sortbuf = sortbuf_preallocated;
51
29.1k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
29.1k
  sortbuf_count = 0;
53
54
29.1k
  {
55
29.1k
    const UNIT *s_end = s + n;
56
57
29.1k
    for (;;)
58
316k
      {
59
316k
        int count;
60
316k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
316k
        int decomposed_count;
62
316k
        int i;
63
64
316k
        if (s < s_end)
65
287k
          {
66
            /* Fetch the next character.  */
67
287k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
287k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
287k
            {
77
287k
              int curr;
78
79
609k
              for (curr = 0; curr < decomposed_count; )
80
322k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
322k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
322k
                  int curr_decomposed_count;
85
86
322k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
322k
                  if (curr_decomposed_count >= 0)
88
17.4k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
17.4k
                      int shift = curr_decomposed_count - 1;
93
94
17.4k
                      if (shift < 0)
95
0
                        abort ();
96
17.4k
                      if (shift > 0)
97
17.2k
                        {
98
17.2k
                          int j;
99
100
17.2k
                          decomposed_count += shift;
101
17.2k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
22.8k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
5.63k
                            decomposed[j + shift] = decomposed[j];
105
17.2k
                        }
106
52.1k
                      for (; shift >= 0; shift--)
107
34.6k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
17.4k
                    }
109
304k
                  else
110
304k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
304k
                      curr++;
113
304k
                    }
114
322k
                }
115
287k
            }
116
287k
          }
117
29.1k
        else
118
29.1k
          {
119
29.1k
            count = 0;
120
29.1k
            decomposed_count = 0;
121
29.1k
          }
122
123
316k
        i = 0;
124
316k
        for (;;)
125
621k
          {
126
621k
            ucs4_t uc;
127
621k
            int ccc;
128
129
621k
            if (s < s_end)
130
592k
              {
131
                /* Fetch the next character from the decomposition.  */
132
592k
                if (i == decomposed_count)
133
287k
                  break;
134
304k
                uc = decomposed[i];
135
304k
                ccc = uc_combining_class (uc);
136
304k
              }
137
29.1k
            else
138
29.1k
              {
139
                /* End of string reached.  */
140
29.1k
                uc = 0;
141
29.1k
                ccc = 0;
142
29.1k
              }
143
144
333k
            if (ccc == 0)
145
277k
              {
146
277k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
277k
                if (sortbuf_count > 1)
151
14.8k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
14.8k
                                                           sortbuf + sortbuf_count);
153
154
277k
                if (composer != NULL)
155
277k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
277k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
247k
                      {
178
302k
                        for (j = 1; j < sortbuf_count; )
179
54.4k
                          {
180
54.4k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
27.6k
                              {
182
27.6k
                                ucs4_t combined =
183
27.6k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
27.6k
                                if (combined)
185
13.5k
                                  {
186
13.5k
                                    size_t k;
187
188
13.5k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
39.6k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
26.0k
                                      sortbuf[k - 1] = sortbuf[k];
192
13.5k
                                    sortbuf_count--;
193
13.5k
                                    continue;
194
13.5k
                                  }
195
27.6k
                              }
196
40.8k
                            j++;
197
40.8k
                          }
198
247k
                        if (s < s_end && sortbuf_count == 1)
199
212k
                          {
200
212k
                            ucs4_t combined =
201
212k
                              composer (sortbuf[0].code, uc);
202
212k
                            if (combined)
203
2.86k
                              {
204
2.86k
                                uc = combined;
205
2.86k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
2.86k
                                sortbuf_count = 0;
210
2.86k
                              }
211
212k
                          }
212
247k
                      }
213
277k
                  }
214
215
565k
                for (j = 0; j < sortbuf_count; j++)
216
288k
                  {
217
288k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
288k
                    if (length < allocated)
221
258k
                      {
222
258k
                        int ret =
223
258k
                          U_UCTOMB (result + length, muc, allocated - length);
224
258k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
258k
                        if (ret >= 0)
230
258k
                          {
231
258k
                            length += ret;
232
258k
                            goto done_appending;
233
258k
                          }
234
258k
                      }
235
29.6k
                    {
236
29.6k
                      size_t old_allocated = allocated;
237
29.6k
                      size_t new_allocated = 2 * old_allocated;
238
29.6k
                      if (new_allocated < 64)
239
28.5k
                        new_allocated = 64;
240
29.6k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
29.6k
                      {
243
29.6k
                        UNIT *larger_result;
244
29.6k
                        if (result == NULL)
245
28.5k
                          {
246
28.5k
                            larger_result =
247
28.5k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
28.5k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
28.5k
                          }
254
1.07k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
1.07k
                        else
266
1.07k
                          {
267
1.07k
                            larger_result =
268
1.07k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
1.07k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
1.07k
                          }
275
29.6k
                        result = larger_result;
276
29.6k
                        allocated = new_allocated;
277
29.6k
                        {
278
29.6k
                          int ret =
279
29.6k
                            U_UCTOMB (result + length, muc, allocated - length);
280
29.6k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
29.6k
                          if (ret < 0)
286
0
                            abort ();
287
29.6k
                          length += ret;
288
29.6k
                          goto done_appending;
289
29.6k
                        }
290
29.6k
                      }
291
29.6k
                    }
292
288k
                   done_appending: ;
293
288k
                  }
294
295
                /* sortbuf is now empty.  */
296
277k
                sortbuf_count = 0;
297
277k
              }
298
299
333k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
29.1k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
304k
            if (sortbuf_count == sortbuf_allocated)
305
159
              {
306
159
                struct ucs4_with_ccc *new_sortbuf;
307
308
159
                sortbuf_allocated = 2 * sortbuf_allocated;
309
159
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
159
                new_sortbuf =
312
159
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
159
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
159
                memcpy (new_sortbuf, sortbuf,
319
159
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
159
                if (sortbuf != sortbuf_preallocated)
321
55
                  free (sortbuf);
322
159
                sortbuf = new_sortbuf;
323
159
              }
324
304k
            sortbuf[sortbuf_count].code = uc;
325
304k
            sortbuf[sortbuf_count].ccc = ccc;
326
304k
            sortbuf_count++;
327
328
304k
            i++;
329
304k
          }
330
331
316k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
29.1k
          break;
334
335
287k
        s += count;
336
287k
      }
337
29.1k
  }
338
339
29.1k
  if (length == 0)
340
599
    {
341
599
      if (result == NULL)
342
599
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
599
          result = (UNIT *) malloc (1);
345
599
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
599
        }
351
599
    }
352
28.5k
  else if (result != resultbuf && length < allocated)
353
28.4k
    {
354
      /* Shrink the allocated memory if possible.  */
355
28.4k
      UNIT *memory;
356
357
28.4k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
28.4k
      if (memory != NULL)
359
28.4k
        result = memory;
360
28.4k
    }
361
362
29.1k
  if (sortbuf_count > 0)
363
0
    abort ();
364
29.1k
  if (sortbuf != sortbuf_preallocated)
365
104
    free (sortbuf);
366
367
29.1k
  *lengthp = length;
368
29.1k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
29.1k
}