Coverage Report

Created: 2024-10-06 07:21

/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source (jump to first uncovered line)
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2024 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
24.9M
{
22
24.9M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
24.9M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
24.9M
  UNIT *result;
27
24.9M
  size_t length;
28
24.9M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
24.9M
  #define SORTBUF_PREALLOCATED 64
31
24.9M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
24.9M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
24.9M
  size_t sortbuf_allocated;
34
24.9M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
24.9M
  if (resultbuf == NULL)
38
24.9M
    {
39
24.9M
      result = NULL;
40
24.9M
      allocated = 0;
41
24.9M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
24.9M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
24.9M
  sortbuf = sortbuf_preallocated;
51
24.9M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
24.9M
  sortbuf_count = 0;
53
54
24.9M
  {
55
24.9M
    const UNIT *s_end = s + n;
56
57
24.9M
    for (;;)
58
71.5M
      {
59
71.5M
        int count;
60
71.5M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
71.5M
        int decomposed_count;
62
71.5M
        int i;
63
64
71.5M
        if (s < s_end)
65
46.6M
          {
66
            /* Fetch the next character.  */
67
46.6M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
46.6M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
46.6M
            {
77
46.6M
              int curr;
78
79
102M
              for (curr = 0; curr < decomposed_count; )
80
55.8M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
55.8M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
55.8M
                  int curr_decomposed_count;
85
86
55.8M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
55.8M
                  if (curr_decomposed_count >= 0)
88
3.25M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.25M
                      int shift = curr_decomposed_count - 1;
93
94
3.25M
                      if (shift < 0)
95
0
                        abort ();
96
3.25M
                      if (shift > 0)
97
2.57M
                        {
98
2.57M
                          int j;
99
100
2.57M
                          decomposed_count += shift;
101
2.57M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.60M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
27.6k
                            decomposed[j + shift] = decomposed[j];
105
2.57M
                        }
106
12.4M
                      for (; shift >= 0; shift--)
107
9.24M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.25M
                    }
109
52.6M
                  else
110
52.6M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
52.6M
                      curr++;
113
52.6M
                    }
114
55.8M
                }
115
46.6M
            }
116
46.6M
          }
117
24.9M
        else
118
24.9M
          {
119
24.9M
            count = 0;
120
24.9M
            decomposed_count = 0;
121
24.9M
          }
122
123
71.5M
        i = 0;
124
71.5M
        for (;;)
125
124M
          {
126
124M
            ucs4_t uc;
127
124M
            int ccc;
128
129
124M
            if (s < s_end)
130
99.2M
              {
131
                /* Fetch the next character from the decomposition.  */
132
99.2M
                if (i == decomposed_count)
133
46.6M
                  break;
134
52.6M
                uc = decomposed[i];
135
52.6M
                ccc = uc_combining_class (uc);
136
52.6M
              }
137
24.9M
            else
138
24.9M
              {
139
                /* End of string reached.  */
140
24.9M
                uc = 0;
141
24.9M
                ccc = 0;
142
24.9M
              }
143
144
77.5M
            if (ccc == 0)
145
74.9M
              {
146
74.9M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
74.9M
                if (sortbuf_count > 1)
151
1.43M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.43M
                                                           sortbuf + sortbuf_count);
153
154
74.9M
                if (composer != NULL)
155
74.9M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
74.9M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
49.9M
                      {
178
52.5M
                        for (j = 1; j < sortbuf_count; )
179
2.63M
                          {
180
2.63M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.46M
                              {
182
1.46M
                                ucs4_t combined =
183
1.46M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.46M
                                if (combined)
185
1.42M
                                  {
186
1.42M
                                    size_t k;
187
188
1.42M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
2.94M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.52M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.42M
                                    sortbuf_count--;
193
1.42M
                                    continue;
194
1.42M
                                  }
195
1.46M
                              }
196
1.21M
                            j++;
197
1.21M
                          }
198
49.9M
                        if (s < s_end && sortbuf_count == 1)
199
25.1M
                          {
200
25.1M
                            ucs4_t combined =
201
25.1M
                              composer (sortbuf[0].code, uc);
202
25.1M
                            if (combined)
203
16.8k
                              {
204
16.8k
                                uc = combined;
205
16.8k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
16.8k
                                sortbuf_count = 0;
210
16.8k
                              }
211
25.1M
                          }
212
49.9M
                      }
213
74.9M
                  }
214
215
126M
                for (j = 0; j < sortbuf_count; j++)
216
51.1M
                  {
217
51.1M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
51.1M
                    if (length < allocated)
221
26.4M
                      {
222
26.4M
                        int ret =
223
26.4M
                          U_UCTOMB (result + length, muc, allocated - length);
224
26.4M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
26.4M
                        if (ret >= 0)
230
26.4M
                          {
231
26.4M
                            length += ret;
232
26.4M
                            goto done_appending;
233
26.4M
                          }
234
26.4M
                      }
235
24.7M
                    {
236
24.7M
                      size_t old_allocated = allocated;
237
24.7M
                      size_t new_allocated = 2 * old_allocated;
238
24.7M
                      if (new_allocated < 64)
239
24.7M
                        new_allocated = 64;
240
24.7M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
24.7M
                      {
243
24.7M
                        UNIT *larger_result;
244
24.7M
                        if (result == NULL)
245
24.7M
                          {
246
24.7M
                            larger_result =
247
24.7M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
24.7M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
24.7M
                          }
254
16.3k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
16.3k
                        else
266
16.3k
                          {
267
16.3k
                            larger_result =
268
16.3k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
16.3k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
16.3k
                          }
275
24.7M
                        result = larger_result;
276
24.7M
                        allocated = new_allocated;
277
24.7M
                        {
278
24.7M
                          int ret =
279
24.7M
                            U_UCTOMB (result + length, muc, allocated - length);
280
24.7M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
24.7M
                          if (ret < 0)
286
0
                            abort ();
287
24.7M
                          length += ret;
288
24.7M
                          goto done_appending;
289
24.7M
                        }
290
24.7M
                      }
291
24.7M
                    }
292
51.1M
                   done_appending: ;
293
51.1M
                  }
294
295
                /* sortbuf is now empty.  */
296
74.9M
                sortbuf_count = 0;
297
74.9M
              }
298
299
77.5M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
24.9M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
52.6M
            if (sortbuf_count == sortbuf_allocated)
305
1.87k
              {
306
1.87k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.87k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.87k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.87k
                new_sortbuf =
312
1.87k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.87k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.87k
                memcpy (new_sortbuf, sortbuf,
319
1.87k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.87k
                if (sortbuf != sortbuf_preallocated)
321
545
                  free (sortbuf);
322
1.87k
                sortbuf = new_sortbuf;
323
1.87k
              }
324
52.6M
            sortbuf[sortbuf_count].code = uc;
325
52.6M
            sortbuf[sortbuf_count].ccc = ccc;
326
52.6M
            sortbuf_count++;
327
328
52.6M
            i++;
329
52.6M
          }
330
331
71.5M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
24.9M
          break;
334
335
46.6M
        s += count;
336
46.6M
      }
337
24.9M
  }
338
339
24.9M
  if (length == 0)
340
227k
    {
341
227k
      if (result == NULL)
342
227k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
227k
          result = (UNIT *) malloc (1);
345
227k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
227k
        }
351
227k
    }
352
24.7M
  else if (result != resultbuf && length < allocated)
353
24.7M
    {
354
      /* Shrink the allocated memory if possible.  */
355
24.7M
      UNIT *memory;
356
357
24.7M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
24.7M
      if (memory != NULL)
359
24.7M
        result = memory;
360
24.7M
    }
361
362
24.9M
  if (sortbuf_count > 0)
363
0
    abort ();
364
24.9M
  if (sortbuf != sortbuf_preallocated)
365
1.33k
    free (sortbuf);
366
367
24.9M
  *lengthp = length;
368
24.9M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
24.9M
}
u8_normalize
Line
Count
Source
21
8.16M
{
22
8.16M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
8.16M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
8.16M
  UNIT *result;
27
8.16M
  size_t length;
28
8.16M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
8.16M
  #define SORTBUF_PREALLOCATED 64
31
8.16M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
8.16M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
8.16M
  size_t sortbuf_allocated;
34
8.16M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
8.16M
  if (resultbuf == NULL)
38
8.16M
    {
39
8.16M
      result = NULL;
40
8.16M
      allocated = 0;
41
8.16M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
8.16M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
8.16M
  sortbuf = sortbuf_preallocated;
51
8.16M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
8.16M
  sortbuf_count = 0;
53
54
8.16M
  {
55
8.16M
    const UNIT *s_end = s + n;
56
57
8.16M
    for (;;)
58
31.2M
      {
59
31.2M
        int count;
60
31.2M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
31.2M
        int decomposed_count;
62
31.2M
        int i;
63
64
31.2M
        if (s < s_end)
65
23.0M
          {
66
            /* Fetch the next character.  */
67
23.0M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
23.0M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
23.0M
            {
77
23.0M
              int curr;
78
79
55.2M
              for (curr = 0; curr < decomposed_count; )
80
32.2M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
32.2M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
32.2M
                  int curr_decomposed_count;
85
86
32.2M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
32.2M
                  if (curr_decomposed_count >= 0)
88
3.21M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
3.21M
                      int shift = curr_decomposed_count - 1;
93
94
3.21M
                      if (shift < 0)
95
0
                        abort ();
96
3.21M
                      if (shift > 0)
97
2.54M
                        {
98
2.54M
                          int j;
99
100
2.54M
                          decomposed_count += shift;
101
2.54M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
2.55M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
16.3k
                            decomposed[j + shift] = decomposed[j];
105
2.54M
                        }
106
12.3M
                      for (; shift >= 0; shift--)
107
9.17M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
3.21M
                    }
109
29.0M
                  else
110
29.0M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
29.0M
                      curr++;
113
29.0M
                    }
114
32.2M
                }
115
23.0M
            }
116
23.0M
          }
117
8.16M
        else
118
8.16M
          {
119
8.16M
            count = 0;
120
8.16M
            decomposed_count = 0;
121
8.16M
          }
122
123
31.2M
        i = 0;
124
31.2M
        for (;;)
125
60.2M
          {
126
60.2M
            ucs4_t uc;
127
60.2M
            int ccc;
128
129
60.2M
            if (s < s_end)
130
52.0M
              {
131
                /* Fetch the next character from the decomposition.  */
132
52.0M
                if (i == decomposed_count)
133
23.0M
                  break;
134
29.0M
                uc = decomposed[i];
135
29.0M
                ccc = uc_combining_class (uc);
136
29.0M
              }
137
8.16M
            else
138
8.16M
              {
139
                /* End of string reached.  */
140
8.16M
                uc = 0;
141
8.16M
                ccc = 0;
142
8.16M
              }
143
144
37.1M
            if (ccc == 0)
145
34.6M
              {
146
34.6M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
34.6M
                if (sortbuf_count > 1)
151
1.40M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
1.40M
                                                           sortbuf + sortbuf_count);
153
154
34.6M
                if (composer != NULL)
155
34.6M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
34.6M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
26.4M
                      {
178
29.0M
                        for (j = 1; j < sortbuf_count; )
179
2.52M
                          {
180
2.52M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
1.42M
                              {
182
1.42M
                                ucs4_t combined =
183
1.42M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
1.42M
                                if (combined)
185
1.39M
                                  {
186
1.39M
                                    size_t k;
187
188
1.39M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
2.90M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.50M
                                      sortbuf[k - 1] = sortbuf[k];
192
1.39M
                                    sortbuf_count--;
193
1.39M
                                    continue;
194
1.39M
                                  }
195
1.42M
                              }
196
1.12M
                            j++;
197
1.12M
                          }
198
26.4M
                        if (s < s_end && sortbuf_count == 1)
199
18.2M
                          {
200
18.2M
                            ucs4_t combined =
201
18.2M
                              composer (sortbuf[0].code, uc);
202
18.2M
                            if (combined)
203
6.96k
                              {
204
6.96k
                                uc = combined;
205
6.96k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
6.96k
                                sortbuf_count = 0;
210
6.96k
                              }
211
18.2M
                          }
212
26.4M
                      }
213
34.6M
                  }
214
215
62.2M
                for (j = 0; j < sortbuf_count; j++)
216
27.6M
                  {
217
27.6M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
27.6M
                    if (length < allocated)
221
19.4M
                      {
222
19.4M
                        int ret =
223
19.4M
                          U_UCTOMB (result + length, muc, allocated - length);
224
19.4M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
19.4M
                        if (ret >= 0)
230
19.4M
                          {
231
19.4M
                            length += ret;
232
19.4M
                            goto done_appending;
233
19.4M
                          }
234
19.4M
                      }
235
8.17M
                    {
236
8.17M
                      size_t old_allocated = allocated;
237
8.17M
                      size_t new_allocated = 2 * old_allocated;
238
8.17M
                      if (new_allocated < 64)
239
8.16M
                        new_allocated = 64;
240
8.17M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
8.17M
                      {
243
8.17M
                        UNIT *larger_result;
244
8.17M
                        if (result == NULL)
245
8.16M
                          {
246
8.16M
                            larger_result =
247
8.16M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
8.16M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
8.16M
                          }
254
13.1k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
13.1k
                        else
266
13.1k
                          {
267
13.1k
                            larger_result =
268
13.1k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
13.1k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
13.1k
                          }
275
8.17M
                        result = larger_result;
276
8.17M
                        allocated = new_allocated;
277
8.17M
                        {
278
8.17M
                          int ret =
279
8.17M
                            U_UCTOMB (result + length, muc, allocated - length);
280
8.17M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
8.17M
                          if (ret < 0)
286
0
                            abort ();
287
8.17M
                          length += ret;
288
8.17M
                          goto done_appending;
289
8.17M
                        }
290
8.17M
                      }
291
8.17M
                    }
292
27.6M
                   done_appending: ;
293
27.6M
                  }
294
295
                /* sortbuf is now empty.  */
296
34.6M
                sortbuf_count = 0;
297
34.6M
              }
298
299
37.1M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
8.16M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
29.0M
            if (sortbuf_count == sortbuf_allocated)
305
1.00k
              {
306
1.00k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.00k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.00k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.00k
                new_sortbuf =
312
1.00k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.00k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.00k
                memcpy (new_sortbuf, sortbuf,
319
1.00k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.00k
                if (sortbuf != sortbuf_preallocated)
321
545
                  free (sortbuf);
322
1.00k
                sortbuf = new_sortbuf;
323
1.00k
              }
324
29.0M
            sortbuf[sortbuf_count].code = uc;
325
29.0M
            sortbuf[sortbuf_count].ccc = ccc;
326
29.0M
            sortbuf_count++;
327
328
29.0M
            i++;
329
29.0M
          }
330
331
31.2M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
8.16M
          break;
334
335
23.0M
        s += count;
336
23.0M
      }
337
8.16M
  }
338
339
8.16M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
8.16M
  else if (result != resultbuf && length < allocated)
353
8.16M
    {
354
      /* Shrink the allocated memory if possible.  */
355
8.16M
      UNIT *memory;
356
357
8.16M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
8.16M
      if (memory != NULL)
359
8.16M
        result = memory;
360
8.16M
    }
361
362
8.16M
  if (sortbuf_count > 0)
363
0
    abort ();
364
8.16M
  if (sortbuf != sortbuf_preallocated)
365
459
    free (sortbuf);
366
367
8.16M
  *lengthp = length;
368
8.16M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
8.16M
}
u32_normalize
Line
Count
Source
21
16.7M
{
22
16.7M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
16.7M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
16.7M
  UNIT *result;
27
16.7M
  size_t length;
28
16.7M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
16.7M
  #define SORTBUF_PREALLOCATED 64
31
16.7M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
16.7M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
16.7M
  size_t sortbuf_allocated;
34
16.7M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
16.7M
  if (resultbuf == NULL)
38
16.7M
    {
39
16.7M
      result = NULL;
40
16.7M
      allocated = 0;
41
16.7M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
16.7M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
16.7M
  sortbuf = sortbuf_preallocated;
51
16.7M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
16.7M
  sortbuf_count = 0;
53
54
16.7M
  {
55
16.7M
    const UNIT *s_end = s + n;
56
57
16.7M
    for (;;)
58
40.3M
      {
59
40.3M
        int count;
60
40.3M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
40.3M
        int decomposed_count;
62
40.3M
        int i;
63
64
40.3M
        if (s < s_end)
65
23.5M
          {
66
            /* Fetch the next character.  */
67
23.5M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
23.5M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
23.5M
            {
77
23.5M
              int curr;
78
79
47.1M
              for (curr = 0; curr < decomposed_count; )
80
23.6M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
23.6M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
23.6M
                  int curr_decomposed_count;
85
86
23.6M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
23.6M
                  if (curr_decomposed_count >= 0)
88
37.4k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
37.4k
                      int shift = curr_decomposed_count - 1;
93
94
37.4k
                      if (shift < 0)
95
0
                        abort ();
96
37.4k
                      if (shift > 0)
97
37.2k
                        {
98
37.2k
                          int j;
99
100
37.2k
                          decomposed_count += shift;
101
37.2k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
48.5k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
11.3k
                            decomposed[j + shift] = decomposed[j];
105
37.2k
                        }
106
112k
                      for (; shift >= 0; shift--)
107
74.7k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
37.4k
                    }
109
23.5M
                  else
110
23.5M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
23.5M
                      curr++;
113
23.5M
                    }
114
23.6M
                }
115
23.5M
            }
116
23.5M
          }
117
16.7M
        else
118
16.7M
          {
119
16.7M
            count = 0;
120
16.7M
            decomposed_count = 0;
121
16.7M
          }
122
123
40.3M
        i = 0;
124
40.3M
        for (;;)
125
63.9M
          {
126
63.9M
            ucs4_t uc;
127
63.9M
            int ccc;
128
129
63.9M
            if (s < s_end)
130
47.1M
              {
131
                /* Fetch the next character from the decomposition.  */
132
47.1M
                if (i == decomposed_count)
133
23.5M
                  break;
134
23.5M
                uc = decomposed[i];
135
23.5M
                ccc = uc_combining_class (uc);
136
23.5M
              }
137
16.7M
            else
138
16.7M
              {
139
                /* End of string reached.  */
140
16.7M
                uc = 0;
141
16.7M
                ccc = 0;
142
16.7M
              }
143
144
40.3M
            if (ccc == 0)
145
40.2M
              {
146
40.2M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
40.2M
                if (sortbuf_count > 1)
151
30.7k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
30.7k
                                                           sortbuf + sortbuf_count);
153
154
40.2M
                if (composer != NULL)
155
40.2M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
40.2M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
23.4M
                      {
178
23.5M
                        for (j = 1; j < sortbuf_count; )
179
109k
                          {
180
109k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
44.3k
                              {
182
44.3k
                                ucs4_t combined =
183
44.3k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
44.3k
                                if (combined)
185
24.8k
                                  {
186
24.8k
                                    size_t k;
187
188
24.8k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
37.8k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
12.9k
                                      sortbuf[k - 1] = sortbuf[k];
192
24.8k
                                    sortbuf_count--;
193
24.8k
                                    continue;
194
24.8k
                                  }
195
44.3k
                              }
196
84.7k
                            j++;
197
84.7k
                          }
198
23.4M
                        if (s < s_end && sortbuf_count == 1)
199
6.89M
                          {
200
6.89M
                            ucs4_t combined =
201
6.89M
                              composer (sortbuf[0].code, uc);
202
6.89M
                            if (combined)
203
9.93k
                              {
204
9.93k
                                uc = combined;
205
9.93k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
9.93k
                                sortbuf_count = 0;
210
9.93k
                              }
211
6.89M
                          }
212
23.4M
                      }
213
40.2M
                  }
214
215
63.8M
                for (j = 0; j < sortbuf_count; j++)
216
23.5M
                  {
217
23.5M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
23.5M
                    if (length < allocated)
221
6.98M
                      {
222
6.98M
                        int ret =
223
6.98M
                          U_UCTOMB (result + length, muc, allocated - length);
224
6.98M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
6.98M
                        if (ret >= 0)
230
6.98M
                          {
231
6.98M
                            length += ret;
232
6.98M
                            goto done_appending;
233
6.98M
                          }
234
6.98M
                      }
235
16.5M
                    {
236
16.5M
                      size_t old_allocated = allocated;
237
16.5M
                      size_t new_allocated = 2 * old_allocated;
238
16.5M
                      if (new_allocated < 64)
239
16.5M
                        new_allocated = 64;
240
16.5M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
16.5M
                      {
243
16.5M
                        UNIT *larger_result;
244
16.5M
                        if (result == NULL)
245
16.5M
                          {
246
16.5M
                            larger_result =
247
16.5M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
16.5M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
16.5M
                          }
254
3.27k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
3.27k
                        else
266
3.27k
                          {
267
3.27k
                            larger_result =
268
3.27k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
3.27k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
3.27k
                          }
275
16.5M
                        result = larger_result;
276
16.5M
                        allocated = new_allocated;
277
16.5M
                        {
278
16.5M
                          int ret =
279
16.5M
                            U_UCTOMB (result + length, muc, allocated - length);
280
16.5M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
16.5M
                          if (ret < 0)
286
0
                            abort ();
287
16.5M
                          length += ret;
288
16.5M
                          goto done_appending;
289
16.5M
                        }
290
16.5M
                      }
291
16.5M
                    }
292
23.5M
                   done_appending: ;
293
23.5M
                  }
294
295
                /* sortbuf is now empty.  */
296
40.2M
                sortbuf_count = 0;
297
40.2M
              }
298
299
40.3M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
16.7M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
23.5M
            if (sortbuf_count == sortbuf_allocated)
305
873
              {
306
873
                struct ucs4_with_ccc *new_sortbuf;
307
308
873
                sortbuf_allocated = 2 * sortbuf_allocated;
309
873
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
873
                new_sortbuf =
312
873
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
873
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
873
                memcpy (new_sortbuf, sortbuf,
319
873
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
873
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
873
                sortbuf = new_sortbuf;
323
873
              }
324
23.5M
            sortbuf[sortbuf_count].code = uc;
325
23.5M
            sortbuf[sortbuf_count].ccc = ccc;
326
23.5M
            sortbuf_count++;
327
328
23.5M
            i++;
329
23.5M
          }
330
331
40.3M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
16.7M
          break;
334
335
23.5M
        s += count;
336
23.5M
      }
337
16.7M
  }
338
339
16.7M
  if (length == 0)
340
227k
    {
341
227k
      if (result == NULL)
342
227k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
227k
          result = (UNIT *) malloc (1);
345
227k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
227k
        }
351
227k
    }
352
16.5M
  else if (result != resultbuf && length < allocated)
353
16.5M
    {
354
      /* Shrink the allocated memory if possible.  */
355
16.5M
      UNIT *memory;
356
357
16.5M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
16.5M
      if (memory != NULL)
359
16.5M
        result = memory;
360
16.5M
    }
361
362
16.7M
  if (sortbuf_count > 0)
363
0
    abort ();
364
16.7M
  if (sortbuf != sortbuf_preallocated)
365
873
    free (sortbuf);
366
367
16.7M
  *lengthp = length;
368
16.7M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
16.7M
}