Coverage Report

Created: 2024-11-25 06:31

/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source (jump to first uncovered line)
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2024 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
24.0k
{
22
24.0k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
24.0k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
24.0k
  UNIT *result;
27
24.0k
  size_t length;
28
24.0k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
24.0k
  #define SORTBUF_PREALLOCATED 64
31
24.0k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
24.0k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
24.0k
  size_t sortbuf_allocated;
34
24.0k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
24.0k
  if (resultbuf == NULL)
38
24.0k
    {
39
24.0k
      result = NULL;
40
24.0k
      allocated = 0;
41
24.0k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
24.0k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
24.0k
  sortbuf = sortbuf_preallocated;
51
24.0k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
24.0k
  sortbuf_count = 0;
53
54
24.0k
  {
55
24.0k
    const UNIT *s_end = s + n;
56
57
24.0k
    for (;;)
58
151k
      {
59
151k
        int count;
60
151k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
151k
        int decomposed_count;
62
151k
        int i;
63
64
151k
        if (s < s_end)
65
127k
          {
66
            /* Fetch the next character.  */
67
127k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
127k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
127k
            {
77
127k
              int curr;
78
79
255k
              for (curr = 0; curr < decomposed_count; )
80
127k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
127k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
127k
                  int curr_decomposed_count;
85
86
127k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
127k
                  if (curr_decomposed_count >= 0)
88
0
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
0
                      int shift = curr_decomposed_count - 1;
93
94
0
                      if (shift < 0)
95
0
                        abort ();
96
0
                      if (shift > 0)
97
0
                        {
98
0
                          int j;
99
100
0
                          decomposed_count += shift;
101
0
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
0
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
0
                            decomposed[j + shift] = decomposed[j];
105
0
                        }
106
0
                      for (; shift >= 0; shift--)
107
0
                        decomposed[curr + shift] = curr_decomposed[shift];
108
0
                    }
109
127k
                  else
110
127k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
127k
                      curr++;
113
127k
                    }
114
127k
                }
115
127k
            }
116
127k
          }
117
24.0k
        else
118
24.0k
          {
119
24.0k
            count = 0;
120
24.0k
            decomposed_count = 0;
121
24.0k
          }
122
123
151k
        i = 0;
124
151k
        for (;;)
125
279k
          {
126
279k
            ucs4_t uc;
127
279k
            int ccc;
128
129
279k
            if (s < s_end)
130
255k
              {
131
                /* Fetch the next character from the decomposition.  */
132
255k
                if (i == decomposed_count)
133
127k
                  break;
134
127k
                uc = decomposed[i];
135
127k
                ccc = uc_combining_class (uc);
136
127k
              }
137
24.0k
            else
138
24.0k
              {
139
                /* End of string reached.  */
140
24.0k
                uc = 0;
141
24.0k
                ccc = 0;
142
24.0k
              }
143
144
151k
            if (ccc == 0)
145
151k
              {
146
151k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
151k
                if (sortbuf_count > 1)
151
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
0
                                                           sortbuf + sortbuf_count);
153
154
151k
                if (composer != NULL)
155
151k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
151k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
127k
                      {
178
127k
                        for (j = 1; j < sortbuf_count; )
179
0
                          {
180
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
0
                              {
182
0
                                ucs4_t combined =
183
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
0
                                if (combined)
185
0
                                  {
186
0
                                    size_t k;
187
188
0
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
0
                                    for (k = j + 1; k < sortbuf_count; k++)
191
0
                                      sortbuf[k - 1] = sortbuf[k];
192
0
                                    sortbuf_count--;
193
0
                                    continue;
194
0
                                  }
195
0
                              }
196
0
                            j++;
197
0
                          }
198
127k
                        if (s < s_end && sortbuf_count == 1)
199
103k
                          {
200
103k
                            ucs4_t combined =
201
103k
                              composer (sortbuf[0].code, uc);
202
103k
                            if (combined)
203
0
                              {
204
0
                                uc = combined;
205
0
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
0
                                sortbuf_count = 0;
210
0
                              }
211
103k
                          }
212
127k
                      }
213
151k
                  }
214
215
279k
                for (j = 0; j < sortbuf_count; j++)
216
127k
                  {
217
127k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
127k
                    if (length < allocated)
221
103k
                      {
222
103k
                        int ret =
223
103k
                          U_UCTOMB (result + length, muc, allocated - length);
224
103k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
103k
                        if (ret >= 0)
230
103k
                          {
231
103k
                            length += ret;
232
103k
                            goto done_appending;
233
103k
                          }
234
103k
                      }
235
24.0k
                    {
236
24.0k
                      size_t old_allocated = allocated;
237
24.0k
                      size_t new_allocated = 2 * old_allocated;
238
24.0k
                      if (new_allocated < 64)
239
24.0k
                        new_allocated = 64;
240
24.0k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
24.0k
                      {
243
24.0k
                        UNIT *larger_result;
244
24.0k
                        if (result == NULL)
245
24.0k
                          {
246
24.0k
                            larger_result =
247
24.0k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
24.0k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
24.0k
                          }
254
0
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
0
                        else
266
0
                          {
267
0
                            larger_result =
268
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
0
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
0
                          }
275
24.0k
                        result = larger_result;
276
24.0k
                        allocated = new_allocated;
277
24.0k
                        {
278
24.0k
                          int ret =
279
24.0k
                            U_UCTOMB (result + length, muc, allocated - length);
280
24.0k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
24.0k
                          if (ret < 0)
286
0
                            abort ();
287
24.0k
                          length += ret;
288
24.0k
                          goto done_appending;
289
24.0k
                        }
290
24.0k
                      }
291
24.0k
                    }
292
127k
                   done_appending: ;
293
127k
                  }
294
295
                /* sortbuf is now empty.  */
296
151k
                sortbuf_count = 0;
297
151k
              }
298
299
151k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
24.0k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
127k
            if (sortbuf_count == sortbuf_allocated)
305
0
              {
306
0
                struct ucs4_with_ccc *new_sortbuf;
307
308
0
                sortbuf_allocated = 2 * sortbuf_allocated;
309
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
0
                new_sortbuf =
312
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
0
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
0
                memcpy (new_sortbuf, sortbuf,
319
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
0
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
0
                sortbuf = new_sortbuf;
323
0
              }
324
127k
            sortbuf[sortbuf_count].code = uc;
325
127k
            sortbuf[sortbuf_count].ccc = ccc;
326
127k
            sortbuf_count++;
327
328
127k
            i++;
329
127k
          }
330
331
151k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
24.0k
          break;
334
335
127k
        s += count;
336
127k
      }
337
24.0k
  }
338
339
24.0k
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
24.0k
  else if (result != resultbuf && length < allocated)
353
24.0k
    {
354
      /* Shrink the allocated memory if possible.  */
355
24.0k
      UNIT *memory;
356
357
24.0k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
24.0k
      if (memory != NULL)
359
24.0k
        result = memory;
360
24.0k
    }
361
362
24.0k
  if (sortbuf_count > 0)
363
0
    abort ();
364
24.0k
  if (sortbuf != sortbuf_preallocated)
365
0
    free (sortbuf);
366
367
24.0k
  *lengthp = length;
368
24.0k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
24.0k
}
u16_normalize
Line
Count
Source
21
10.4k
{
22
10.4k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
10.4k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
10.4k
  UNIT *result;
27
10.4k
  size_t length;
28
10.4k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
10.4k
  #define SORTBUF_PREALLOCATED 64
31
10.4k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
10.4k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
10.4k
  size_t sortbuf_allocated;
34
10.4k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
10.4k
  if (resultbuf == NULL)
38
10.4k
    {
39
10.4k
      result = NULL;
40
10.4k
      allocated = 0;
41
10.4k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
10.4k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
10.4k
  sortbuf = sortbuf_preallocated;
51
10.4k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
10.4k
  sortbuf_count = 0;
53
54
10.4k
  {
55
10.4k
    const UNIT *s_end = s + n;
56
57
10.4k
    for (;;)
58
57.4k
      {
59
57.4k
        int count;
60
57.4k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
57.4k
        int decomposed_count;
62
57.4k
        int i;
63
64
57.4k
        if (s < s_end)
65
47.0k
          {
66
            /* Fetch the next character.  */
67
47.0k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
47.0k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
47.0k
            {
77
47.0k
              int curr;
78
79
94.1k
              for (curr = 0; curr < decomposed_count; )
80
47.0k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
47.0k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
47.0k
                  int curr_decomposed_count;
85
86
47.0k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
47.0k
                  if (curr_decomposed_count >= 0)
88
0
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
0
                      int shift = curr_decomposed_count - 1;
93
94
0
                      if (shift < 0)
95
0
                        abort ();
96
0
                      if (shift > 0)
97
0
                        {
98
0
                          int j;
99
100
0
                          decomposed_count += shift;
101
0
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
0
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
0
                            decomposed[j + shift] = decomposed[j];
105
0
                        }
106
0
                      for (; shift >= 0; shift--)
107
0
                        decomposed[curr + shift] = curr_decomposed[shift];
108
0
                    }
109
47.0k
                  else
110
47.0k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
47.0k
                      curr++;
113
47.0k
                    }
114
47.0k
                }
115
47.0k
            }
116
47.0k
          }
117
10.4k
        else
118
10.4k
          {
119
10.4k
            count = 0;
120
10.4k
            decomposed_count = 0;
121
10.4k
          }
122
123
57.4k
        i = 0;
124
57.4k
        for (;;)
125
104k
          {
126
104k
            ucs4_t uc;
127
104k
            int ccc;
128
129
104k
            if (s < s_end)
130
94.1k
              {
131
                /* Fetch the next character from the decomposition.  */
132
94.1k
                if (i == decomposed_count)
133
47.0k
                  break;
134
47.0k
                uc = decomposed[i];
135
47.0k
                ccc = uc_combining_class (uc);
136
47.0k
              }
137
10.4k
            else
138
10.4k
              {
139
                /* End of string reached.  */
140
10.4k
                uc = 0;
141
10.4k
                ccc = 0;
142
10.4k
              }
143
144
57.4k
            if (ccc == 0)
145
57.4k
              {
146
57.4k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
57.4k
                if (sortbuf_count > 1)
151
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
0
                                                           sortbuf + sortbuf_count);
153
154
57.4k
                if (composer != NULL)
155
57.4k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
57.4k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
47.0k
                      {
178
47.0k
                        for (j = 1; j < sortbuf_count; )
179
0
                          {
180
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
0
                              {
182
0
                                ucs4_t combined =
183
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
0
                                if (combined)
185
0
                                  {
186
0
                                    size_t k;
187
188
0
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
0
                                    for (k = j + 1; k < sortbuf_count; k++)
191
0
                                      sortbuf[k - 1] = sortbuf[k];
192
0
                                    sortbuf_count--;
193
0
                                    continue;
194
0
                                  }
195
0
                              }
196
0
                            j++;
197
0
                          }
198
47.0k
                        if (s < s_end && sortbuf_count == 1)
199
36.6k
                          {
200
36.6k
                            ucs4_t combined =
201
36.6k
                              composer (sortbuf[0].code, uc);
202
36.6k
                            if (combined)
203
0
                              {
204
0
                                uc = combined;
205
0
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
0
                                sortbuf_count = 0;
210
0
                              }
211
36.6k
                          }
212
47.0k
                      }
213
57.4k
                  }
214
215
104k
                for (j = 0; j < sortbuf_count; j++)
216
47.0k
                  {
217
47.0k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
47.0k
                    if (length < allocated)
221
36.6k
                      {
222
36.6k
                        int ret =
223
36.6k
                          U_UCTOMB (result + length, muc, allocated - length);
224
36.6k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
36.6k
                        if (ret >= 0)
230
36.6k
                          {
231
36.6k
                            length += ret;
232
36.6k
                            goto done_appending;
233
36.6k
                          }
234
36.6k
                      }
235
10.4k
                    {
236
10.4k
                      size_t old_allocated = allocated;
237
10.4k
                      size_t new_allocated = 2 * old_allocated;
238
10.4k
                      if (new_allocated < 64)
239
10.4k
                        new_allocated = 64;
240
10.4k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
10.4k
                      {
243
10.4k
                        UNIT *larger_result;
244
10.4k
                        if (result == NULL)
245
10.4k
                          {
246
10.4k
                            larger_result =
247
10.4k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
10.4k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
10.4k
                          }
254
0
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
0
                        else
266
0
                          {
267
0
                            larger_result =
268
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
0
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
0
                          }
275
10.4k
                        result = larger_result;
276
10.4k
                        allocated = new_allocated;
277
10.4k
                        {
278
10.4k
                          int ret =
279
10.4k
                            U_UCTOMB (result + length, muc, allocated - length);
280
10.4k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
10.4k
                          if (ret < 0)
286
0
                            abort ();
287
10.4k
                          length += ret;
288
10.4k
                          goto done_appending;
289
10.4k
                        }
290
10.4k
                      }
291
10.4k
                    }
292
47.0k
                   done_appending: ;
293
47.0k
                  }
294
295
                /* sortbuf is now empty.  */
296
57.4k
                sortbuf_count = 0;
297
57.4k
              }
298
299
57.4k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
10.4k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
47.0k
            if (sortbuf_count == sortbuf_allocated)
305
0
              {
306
0
                struct ucs4_with_ccc *new_sortbuf;
307
308
0
                sortbuf_allocated = 2 * sortbuf_allocated;
309
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
0
                new_sortbuf =
312
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
0
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
0
                memcpy (new_sortbuf, sortbuf,
319
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
0
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
0
                sortbuf = new_sortbuf;
323
0
              }
324
47.0k
            sortbuf[sortbuf_count].code = uc;
325
47.0k
            sortbuf[sortbuf_count].ccc = ccc;
326
47.0k
            sortbuf_count++;
327
328
47.0k
            i++;
329
47.0k
          }
330
331
57.4k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
10.4k
          break;
334
335
47.0k
        s += count;
336
47.0k
      }
337
10.4k
  }
338
339
10.4k
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
10.4k
  else if (result != resultbuf && length < allocated)
353
10.4k
    {
354
      /* Shrink the allocated memory if possible.  */
355
10.4k
      UNIT *memory;
356
357
10.4k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
10.4k
      if (memory != NULL)
359
10.4k
        result = memory;
360
10.4k
    }
361
362
10.4k
  if (sortbuf_count > 0)
363
0
    abort ();
364
10.4k
  if (sortbuf != sortbuf_preallocated)
365
0
    free (sortbuf);
366
367
10.4k
  *lengthp = length;
368
10.4k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
10.4k
}
u32_normalize
Line
Count
Source
21
13.6k
{
22
13.6k
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
13.6k
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
13.6k
  UNIT *result;
27
13.6k
  size_t length;
28
13.6k
  size_t allocated;
29
  /* The buffer for sorting.  */
30
13.6k
  #define SORTBUF_PREALLOCATED 64
31
13.6k
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
13.6k
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
13.6k
  size_t sortbuf_allocated;
34
13.6k
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
13.6k
  if (resultbuf == NULL)
38
13.6k
    {
39
13.6k
      result = NULL;
40
13.6k
      allocated = 0;
41
13.6k
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
13.6k
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
13.6k
  sortbuf = sortbuf_preallocated;
51
13.6k
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
13.6k
  sortbuf_count = 0;
53
54
13.6k
  {
55
13.6k
    const UNIT *s_end = s + n;
56
57
13.6k
    for (;;)
58
94.2k
      {
59
94.2k
        int count;
60
94.2k
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
94.2k
        int decomposed_count;
62
94.2k
        int i;
63
64
94.2k
        if (s < s_end)
65
80.6k
          {
66
            /* Fetch the next character.  */
67
80.6k
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
80.6k
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
80.6k
            {
77
80.6k
              int curr;
78
79
161k
              for (curr = 0; curr < decomposed_count; )
80
80.6k
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
80.6k
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
80.6k
                  int curr_decomposed_count;
85
86
80.6k
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
80.6k
                  if (curr_decomposed_count >= 0)
88
0
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
0
                      int shift = curr_decomposed_count - 1;
93
94
0
                      if (shift < 0)
95
0
                        abort ();
96
0
                      if (shift > 0)
97
0
                        {
98
0
                          int j;
99
100
0
                          decomposed_count += shift;
101
0
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
0
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
0
                            decomposed[j + shift] = decomposed[j];
105
0
                        }
106
0
                      for (; shift >= 0; shift--)
107
0
                        decomposed[curr + shift] = curr_decomposed[shift];
108
0
                    }
109
80.6k
                  else
110
80.6k
                    {
111
                      /* decomposed[curr] is atomic.  */
112
80.6k
                      curr++;
113
80.6k
                    }
114
80.6k
                }
115
80.6k
            }
116
80.6k
          }
117
13.6k
        else
118
13.6k
          {
119
13.6k
            count = 0;
120
13.6k
            decomposed_count = 0;
121
13.6k
          }
122
123
94.2k
        i = 0;
124
94.2k
        for (;;)
125
174k
          {
126
174k
            ucs4_t uc;
127
174k
            int ccc;
128
129
174k
            if (s < s_end)
130
161k
              {
131
                /* Fetch the next character from the decomposition.  */
132
161k
                if (i == decomposed_count)
133
80.6k
                  break;
134
80.6k
                uc = decomposed[i];
135
80.6k
                ccc = uc_combining_class (uc);
136
80.6k
              }
137
13.6k
            else
138
13.6k
              {
139
                /* End of string reached.  */
140
13.6k
                uc = 0;
141
13.6k
                ccc = 0;
142
13.6k
              }
143
144
94.2k
            if (ccc == 0)
145
94.2k
              {
146
94.2k
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
94.2k
                if (sortbuf_count > 1)
151
0
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
0
                                                           sortbuf + sortbuf_count);
153
154
94.2k
                if (composer != NULL)
155
94.2k
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
94.2k
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
80.6k
                      {
178
80.6k
                        for (j = 1; j < sortbuf_count; )
179
0
                          {
180
0
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
0
                              {
182
0
                                ucs4_t combined =
183
0
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
0
                                if (combined)
185
0
                                  {
186
0
                                    size_t k;
187
188
0
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
0
                                    for (k = j + 1; k < sortbuf_count; k++)
191
0
                                      sortbuf[k - 1] = sortbuf[k];
192
0
                                    sortbuf_count--;
193
0
                                    continue;
194
0
                                  }
195
0
                              }
196
0
                            j++;
197
0
                          }
198
80.6k
                        if (s < s_end && sortbuf_count == 1)
199
67.0k
                          {
200
67.0k
                            ucs4_t combined =
201
67.0k
                              composer (sortbuf[0].code, uc);
202
67.0k
                            if (combined)
203
0
                              {
204
0
                                uc = combined;
205
0
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
0
                                sortbuf_count = 0;
210
0
                              }
211
67.0k
                          }
212
80.6k
                      }
213
94.2k
                  }
214
215
174k
                for (j = 0; j < sortbuf_count; j++)
216
80.6k
                  {
217
80.6k
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
80.6k
                    if (length < allocated)
221
67.0k
                      {
222
67.0k
                        int ret =
223
67.0k
                          U_UCTOMB (result + length, muc, allocated - length);
224
67.0k
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
67.0k
                        if (ret >= 0)
230
67.0k
                          {
231
67.0k
                            length += ret;
232
67.0k
                            goto done_appending;
233
67.0k
                          }
234
67.0k
                      }
235
13.6k
                    {
236
13.6k
                      size_t old_allocated = allocated;
237
13.6k
                      size_t new_allocated = 2 * old_allocated;
238
13.6k
                      if (new_allocated < 64)
239
13.6k
                        new_allocated = 64;
240
13.6k
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
13.6k
                      {
243
13.6k
                        UNIT *larger_result;
244
13.6k
                        if (result == NULL)
245
13.6k
                          {
246
13.6k
                            larger_result =
247
13.6k
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
13.6k
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
13.6k
                          }
254
0
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
0
                        else
266
0
                          {
267
0
                            larger_result =
268
0
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
0
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
0
                          }
275
13.6k
                        result = larger_result;
276
13.6k
                        allocated = new_allocated;
277
13.6k
                        {
278
13.6k
                          int ret =
279
13.6k
                            U_UCTOMB (result + length, muc, allocated - length);
280
13.6k
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
13.6k
                          if (ret < 0)
286
0
                            abort ();
287
13.6k
                          length += ret;
288
13.6k
                          goto done_appending;
289
13.6k
                        }
290
13.6k
                      }
291
13.6k
                    }
292
80.6k
                   done_appending: ;
293
80.6k
                  }
294
295
                /* sortbuf is now empty.  */
296
94.2k
                sortbuf_count = 0;
297
94.2k
              }
298
299
94.2k
            if (!(s < s_end))
300
              /* End of string reached.  */
301
13.6k
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
80.6k
            if (sortbuf_count == sortbuf_allocated)
305
0
              {
306
0
                struct ucs4_with_ccc *new_sortbuf;
307
308
0
                sortbuf_allocated = 2 * sortbuf_allocated;
309
0
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
0
                new_sortbuf =
312
0
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
0
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
0
                memcpy (new_sortbuf, sortbuf,
319
0
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
0
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
0
                sortbuf = new_sortbuf;
323
0
              }
324
80.6k
            sortbuf[sortbuf_count].code = uc;
325
80.6k
            sortbuf[sortbuf_count].ccc = ccc;
326
80.6k
            sortbuf_count++;
327
328
80.6k
            i++;
329
80.6k
          }
330
331
94.2k
        if (!(s < s_end))
332
          /* End of string reached.  */
333
13.6k
          break;
334
335
80.6k
        s += count;
336
80.6k
      }
337
13.6k
  }
338
339
13.6k
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
13.6k
  else if (result != resultbuf && length < allocated)
353
13.6k
    {
354
      /* Shrink the allocated memory if possible.  */
355
13.6k
      UNIT *memory;
356
357
13.6k
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
13.6k
      if (memory != NULL)
359
13.6k
        result = memory;
360
13.6k
    }
361
362
13.6k
  if (sortbuf_count > 0)
363
0
    abort ();
364
13.6k
  if (sortbuf != sortbuf_preallocated)
365
0
    free (sortbuf);
366
367
13.6k
  *lengthp = length;
368
13.6k
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
13.6k
}