Coverage Report

Created: 2023-06-07 07:18

/src/libunistring/lib/uninorm/u-normalize-internal.h
Line
Count
Source (jump to first uncovered line)
1
/* Decomposition and composition of Unicode strings.
2
   Copyright (C) 2009-2023 Free Software Foundation, Inc.
3
   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5
   This file is free software: you can redistribute it and/or modify
6
   it under the terms of the GNU Lesser General Public License as
7
   published by the Free Software Foundation; either version 2.1 of the
8
   License, or (at your option) any later version.
9
10
   This file is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU Lesser General Public License for more details.
14
15
   You should have received a copy of the GNU Lesser General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17
18
UNIT *
19
FUNC (uninorm_t nf, const UNIT *s, size_t n,
20
      UNIT *resultbuf, size_t *lengthp)
21
22.6M
{
22
22.6M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
22.6M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
22.6M
  UNIT *result;
27
22.6M
  size_t length;
28
22.6M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
22.6M
  #define SORTBUF_PREALLOCATED 64
31
22.6M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
22.6M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
22.6M
  size_t sortbuf_allocated;
34
22.6M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
22.6M
  if (resultbuf == NULL)
38
22.6M
    {
39
22.6M
      result = NULL;
40
22.6M
      allocated = 0;
41
22.6M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
22.6M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
22.6M
  sortbuf = sortbuf_preallocated;
51
22.6M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
22.6M
  sortbuf_count = 0;
53
54
22.6M
  {
55
22.6M
    const UNIT *s_end = s + n;
56
57
22.6M
    for (;;)
58
66.0M
      {
59
66.0M
        int count;
60
66.0M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
66.0M
        int decomposed_count;
62
66.0M
        int i;
63
64
66.0M
        if (s < s_end)
65
43.4M
          {
66
            /* Fetch the next character.  */
67
43.4M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
43.4M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
43.4M
            {
77
43.4M
              int curr;
78
79
101M
              for (curr = 0; curr < decomposed_count; )
80
57.6M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
57.6M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
57.6M
                  int curr_decomposed_count;
85
86
57.6M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
57.6M
                  if (curr_decomposed_count >= 0)
88
4.32M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
4.32M
                      int shift = curr_decomposed_count - 1;
93
94
4.32M
                      if (shift < 0)
95
0
                        abort ();
96
4.32M
                      if (shift > 0)
97
3.68M
                        {
98
3.68M
                          int j;
99
100
3.68M
                          decomposed_count += shift;
101
3.68M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
3.70M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
26.6k
                            decomposed[j + shift] = decomposed[j];
105
3.68M
                        }
106
18.6M
                      for (; shift >= 0; shift--)
107
14.2M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
4.32M
                    }
109
53.3M
                  else
110
53.3M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
53.3M
                      curr++;
113
53.3M
                    }
114
57.6M
                }
115
43.4M
            }
116
43.4M
          }
117
22.6M
        else
118
22.6M
          {
119
22.6M
            count = 0;
120
22.6M
            decomposed_count = 0;
121
22.6M
          }
122
123
66.0M
        i = 0;
124
66.0M
        for (;;)
125
119M
          {
126
119M
            ucs4_t uc;
127
119M
            int ccc;
128
129
119M
            if (s < s_end)
130
96.7M
              {
131
                /* Fetch the next character from the decomposition.  */
132
96.7M
                if (i == decomposed_count)
133
43.4M
                  break;
134
53.3M
                uc = decomposed[i];
135
53.3M
                ccc = uc_combining_class (uc);
136
53.3M
              }
137
22.6M
            else
138
22.6M
              {
139
                /* End of string reached.  */
140
22.6M
                uc = 0;
141
22.6M
                ccc = 0;
142
22.6M
              }
143
144
76.0M
            if (ccc == 0)
145
71.9M
              {
146
71.9M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
71.9M
                if (sortbuf_count > 1)
151
2.06M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
2.06M
                                                           sortbuf + sortbuf_count);
153
154
71.9M
                if (composer != NULL)
155
71.9M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
71.9M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
49.3M
                      {
178
53.3M
                        for (j = 1; j < sortbuf_count; )
179
4.04M
                          {
180
4.04M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
2.10M
                              {
182
2.10M
                                ucs4_t combined =
183
2.10M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
2.10M
                                if (combined)
185
2.04M
                                  {
186
2.04M
                                    size_t k;
187
188
2.04M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.65M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.61M
                                      sortbuf[k - 1] = sortbuf[k];
192
2.04M
                                    sortbuf_count--;
193
2.04M
                                    continue;
194
2.04M
                                  }
195
2.10M
                              }
196
2.00M
                            j++;
197
2.00M
                          }
198
49.3M
                        if (s < s_end && sortbuf_count == 1)
199
26.7M
                          {
200
26.7M
                            ucs4_t combined =
201
26.7M
                              composer (sortbuf[0].code, uc);
202
26.7M
                            if (combined)
203
17.5k
                              {
204
17.5k
                                uc = combined;
205
17.5k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
17.5k
                                sortbuf_count = 0;
210
17.5k
                              }
211
26.7M
                          }
212
49.3M
                      }
213
71.9M
                  }
214
215
123M
                for (j = 0; j < sortbuf_count; j++)
216
51.3M
                  {
217
51.3M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
51.3M
                    if (length < allocated)
221
28.7M
                      {
222
28.7M
                        int ret =
223
28.7M
                          U_UCTOMB (result + length, muc, allocated - length);
224
28.7M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
28.7M
                        if (ret >= 0)
230
28.7M
                          {
231
28.7M
                            length += ret;
232
28.7M
                            goto done_appending;
233
28.7M
                          }
234
28.7M
                      }
235
22.5M
                    {
236
22.5M
                      size_t old_allocated = allocated;
237
22.5M
                      size_t new_allocated = 2 * old_allocated;
238
22.5M
                      if (new_allocated < 64)
239
22.5M
                        new_allocated = 64;
240
22.5M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
22.5M
                      {
243
22.5M
                        UNIT *larger_result;
244
22.5M
                        if (result == NULL)
245
22.5M
                          {
246
22.5M
                            larger_result =
247
22.5M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
22.5M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
22.5M
                          }
254
17.6k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
17.6k
                        else
266
17.6k
                          {
267
17.6k
                            larger_result =
268
17.6k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
17.6k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
17.6k
                          }
275
22.5M
                        result = larger_result;
276
22.5M
                        allocated = new_allocated;
277
22.5M
                        {
278
22.5M
                          int ret =
279
22.5M
                            U_UCTOMB (result + length, muc, allocated - length);
280
22.5M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
22.5M
                          if (ret < 0)
286
0
                            abort ();
287
22.5M
                          length += ret;
288
22.5M
                          goto done_appending;
289
22.5M
                        }
290
22.5M
                      }
291
22.5M
                    }
292
51.3M
                   done_appending: ;
293
51.3M
                  }
294
295
                /* sortbuf is now empty.  */
296
71.9M
                sortbuf_count = 0;
297
71.9M
              }
298
299
76.0M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
22.6M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
53.3M
            if (sortbuf_count == sortbuf_allocated)
305
2.01k
              {
306
2.01k
                struct ucs4_with_ccc *new_sortbuf;
307
308
2.01k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
2.01k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
2.01k
                new_sortbuf =
312
2.01k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
2.01k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
2.01k
                memcpy (new_sortbuf, sortbuf,
319
2.01k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
2.01k
                if (sortbuf != sortbuf_preallocated)
321
687
                  free (sortbuf);
322
2.01k
                sortbuf = new_sortbuf;
323
2.01k
              }
324
53.3M
            sortbuf[sortbuf_count].code = uc;
325
53.3M
            sortbuf[sortbuf_count].ccc = ccc;
326
53.3M
            sortbuf_count++;
327
328
53.3M
            i++;
329
53.3M
          }
330
331
66.0M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
22.6M
          break;
334
335
43.4M
        s += count;
336
43.4M
      }
337
22.6M
  }
338
339
22.6M
  if (length == 0)
340
118k
    {
341
118k
      if (result == NULL)
342
118k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
118k
          result = (UNIT *) malloc (1);
345
118k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
118k
        }
351
118k
    }
352
22.5M
  else if (result != resultbuf && length < allocated)
353
22.5M
    {
354
      /* Shrink the allocated memory if possible.  */
355
22.5M
      UNIT *memory;
356
357
22.5M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
22.5M
      if (memory != NULL)
359
22.5M
        result = memory;
360
22.5M
    }
361
362
22.6M
  if (sortbuf_count > 0)
363
0
    abort ();
364
22.6M
  if (sortbuf != sortbuf_preallocated)
365
1.32k
    free (sortbuf);
366
367
22.6M
  *lengthp = length;
368
22.6M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
22.6M
}
u32_normalize
Line
Count
Source
21
15.1M
{
22
15.1M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
15.1M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
15.1M
  UNIT *result;
27
15.1M
  size_t length;
28
15.1M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
15.1M
  #define SORTBUF_PREALLOCATED 64
31
15.1M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
15.1M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
15.1M
  size_t sortbuf_allocated;
34
15.1M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
15.1M
  if (resultbuf == NULL)
38
15.1M
    {
39
15.1M
      result = NULL;
40
15.1M
      allocated = 0;
41
15.1M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
15.1M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
15.1M
  sortbuf = sortbuf_preallocated;
51
15.1M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
15.1M
  sortbuf_count = 0;
53
54
15.1M
  {
55
15.1M
    const UNIT *s_end = s + n;
56
57
15.1M
    for (;;)
58
36.0M
      {
59
36.0M
        int count;
60
36.0M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
36.0M
        int decomposed_count;
62
36.0M
        int i;
63
64
36.0M
        if (s < s_end)
65
20.9M
          {
66
            /* Fetch the next character.  */
67
20.9M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
20.9M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
20.9M
            {
77
20.9M
              int curr;
78
79
41.9M
              for (curr = 0; curr < decomposed_count; )
80
21.0M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
21.0M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
21.0M
                  int curr_decomposed_count;
85
86
21.0M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
21.0M
                  if (curr_decomposed_count >= 0)
88
32.1k
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
32.1k
                      int shift = curr_decomposed_count - 1;
93
94
32.1k
                      if (shift < 0)
95
0
                        abort ();
96
32.1k
                      if (shift > 0)
97
31.8k
                        {
98
31.8k
                          int j;
99
100
31.8k
                          decomposed_count += shift;
101
31.8k
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
41.5k
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
9.64k
                            decomposed[j + shift] = decomposed[j];
105
31.8k
                        }
106
96.1k
                      for (; shift >= 0; shift--)
107
63.9k
                        decomposed[curr + shift] = curr_decomposed[shift];
108
32.1k
                    }
109
20.9M
                  else
110
20.9M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
20.9M
                      curr++;
113
20.9M
                    }
114
21.0M
                }
115
20.9M
            }
116
20.9M
          }
117
15.1M
        else
118
15.1M
          {
119
15.1M
            count = 0;
120
15.1M
            decomposed_count = 0;
121
15.1M
          }
122
123
36.0M
        i = 0;
124
36.0M
        for (;;)
125
57.0M
          {
126
57.0M
            ucs4_t uc;
127
57.0M
            int ccc;
128
129
57.0M
            if (s < s_end)
130
41.9M
              {
131
                /* Fetch the next character from the decomposition.  */
132
41.9M
                if (i == decomposed_count)
133
20.9M
                  break;
134
20.9M
                uc = decomposed[i];
135
20.9M
                ccc = uc_combining_class (uc);
136
20.9M
              }
137
15.1M
            else
138
15.1M
              {
139
                /* End of string reached.  */
140
15.1M
                uc = 0;
141
15.1M
                ccc = 0;
142
15.1M
              }
143
144
36.1M
            if (ccc == 0)
145
36.0M
              {
146
36.0M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
36.0M
                if (sortbuf_count > 1)
151
28.1k
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
28.1k
                                                           sortbuf + sortbuf_count);
153
154
36.0M
                if (composer != NULL)
155
36.0M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
36.0M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
20.8M
                      {
178
20.9M
                        for (j = 1; j < sortbuf_count; )
179
101k
                          {
180
101k
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
38.5k
                              {
182
38.5k
                                ucs4_t combined =
183
38.5k
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
38.5k
                                if (combined)
185
19.8k
                                  {
186
19.8k
                                    size_t k;
187
188
19.8k
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
34.0k
                                    for (k = j + 1; k < sortbuf_count; k++)
191
14.1k
                                      sortbuf[k - 1] = sortbuf[k];
192
19.8k
                                    sortbuf_count--;
193
19.8k
                                    continue;
194
19.8k
                                  }
195
38.5k
                              }
196
81.6k
                            j++;
197
81.6k
                          }
198
20.8M
                        if (s < s_end && sortbuf_count == 1)
199
5.81M
                          {
200
5.81M
                            ucs4_t combined =
201
5.81M
                              composer (sortbuf[0].code, uc);
202
5.81M
                            if (combined)
203
10.2k
                              {
204
10.2k
                                uc = combined;
205
10.2k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
10.2k
                                sortbuf_count = 0;
210
10.2k
                              }
211
5.81M
                          }
212
20.8M
                      }
213
36.0M
                  }
214
215
56.9M
                for (j = 0; j < sortbuf_count; j++)
216
20.9M
                  {
217
20.9M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
20.9M
                    if (length < allocated)
221
5.90M
                      {
222
5.90M
                        int ret =
223
5.90M
                          U_UCTOMB (result + length, muc, allocated - length);
224
5.90M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
5.90M
                        if (ret >= 0)
230
5.90M
                          {
231
5.90M
                            length += ret;
232
5.90M
                            goto done_appending;
233
5.90M
                          }
234
5.90M
                      }
235
15.0M
                    {
236
15.0M
                      size_t old_allocated = allocated;
237
15.0M
                      size_t new_allocated = 2 * old_allocated;
238
15.0M
                      if (new_allocated < 64)
239
15.0M
                        new_allocated = 64;
240
15.0M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
15.0M
                      {
243
15.0M
                        UNIT *larger_result;
244
15.0M
                        if (result == NULL)
245
15.0M
                          {
246
15.0M
                            larger_result =
247
15.0M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
15.0M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
15.0M
                          }
254
3.54k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
3.54k
                        else
266
3.54k
                          {
267
3.54k
                            larger_result =
268
3.54k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
3.54k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
3.54k
                          }
275
15.0M
                        result = larger_result;
276
15.0M
                        allocated = new_allocated;
277
15.0M
                        {
278
15.0M
                          int ret =
279
15.0M
                            U_UCTOMB (result + length, muc, allocated - length);
280
15.0M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
15.0M
                          if (ret < 0)
286
0
                            abort ();
287
15.0M
                          length += ret;
288
15.0M
                          goto done_appending;
289
15.0M
                        }
290
15.0M
                      }
291
15.0M
                    }
292
20.9M
                   done_appending: ;
293
20.9M
                  }
294
295
                /* sortbuf is now empty.  */
296
36.0M
                sortbuf_count = 0;
297
36.0M
              }
298
299
36.1M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
15.1M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
20.9M
            if (sortbuf_count == sortbuf_allocated)
305
824
              {
306
824
                struct ucs4_with_ccc *new_sortbuf;
307
308
824
                sortbuf_allocated = 2 * sortbuf_allocated;
309
824
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
824
                new_sortbuf =
312
824
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
824
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
824
                memcpy (new_sortbuf, sortbuf,
319
824
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
824
                if (sortbuf != sortbuf_preallocated)
321
0
                  free (sortbuf);
322
824
                sortbuf = new_sortbuf;
323
824
              }
324
20.9M
            sortbuf[sortbuf_count].code = uc;
325
20.9M
            sortbuf[sortbuf_count].ccc = ccc;
326
20.9M
            sortbuf_count++;
327
328
20.9M
            i++;
329
20.9M
          }
330
331
36.0M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
15.1M
          break;
334
335
20.9M
        s += count;
336
20.9M
      }
337
15.1M
  }
338
339
15.1M
  if (length == 0)
340
118k
    {
341
118k
      if (result == NULL)
342
118k
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
118k
          result = (UNIT *) malloc (1);
345
118k
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
118k
        }
351
118k
    }
352
15.0M
  else if (result != resultbuf && length < allocated)
353
15.0M
    {
354
      /* Shrink the allocated memory if possible.  */
355
15.0M
      UNIT *memory;
356
357
15.0M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
15.0M
      if (memory != NULL)
359
15.0M
        result = memory;
360
15.0M
    }
361
362
15.1M
  if (sortbuf_count > 0)
363
0
    abort ();
364
15.1M
  if (sortbuf != sortbuf_preallocated)
365
824
    free (sortbuf);
366
367
15.1M
  *lengthp = length;
368
15.1M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
15.1M
}
u8_normalize
Line
Count
Source
21
7.47M
{
22
7.47M
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23
7.47M
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25
  /* The result being accumulated.  */
26
7.47M
  UNIT *result;
27
7.47M
  size_t length;
28
7.47M
  size_t allocated;
29
  /* The buffer for sorting.  */
30
7.47M
  #define SORTBUF_PREALLOCATED 64
31
7.47M
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32
7.47M
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33
7.47M
  size_t sortbuf_allocated;
34
7.47M
  size_t sortbuf_count;
35
36
  /* Initialize the accumulator.  */
37
7.47M
  if (resultbuf == NULL)
38
7.47M
    {
39
7.47M
      result = NULL;
40
7.47M
      allocated = 0;
41
7.47M
    }
42
0
  else
43
0
    {
44
0
      result = resultbuf;
45
0
      allocated = *lengthp;
46
0
    }
47
7.47M
  length = 0;
48
49
  /* Initialize the buffer for sorting.  */
50
7.47M
  sortbuf = sortbuf_preallocated;
51
7.47M
  sortbuf_allocated = SORTBUF_PREALLOCATED;
52
7.47M
  sortbuf_count = 0;
53
54
7.47M
  {
55
7.47M
    const UNIT *s_end = s + n;
56
57
7.47M
    for (;;)
58
29.9M
      {
59
29.9M
        int count;
60
29.9M
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61
29.9M
        int decomposed_count;
62
29.9M
        int i;
63
64
29.9M
        if (s < s_end)
65
22.4M
          {
66
            /* Fetch the next character.  */
67
22.4M
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68
22.4M
            decomposed_count = 1;
69
70
            /* Decompose it, recursively.
71
               It would be possible to precompute the recursive decomposition
72
               and store it in a table.  But this would significantly increase
73
               the size of the decomposition tables, because for example for
74
               U+1FC1 the recursive canonical decomposition and the recursive
75
               compatibility decomposition are different.  */
76
22.4M
            {
77
22.4M
              int curr;
78
79
59.1M
              for (curr = 0; curr < decomposed_count; )
80
36.6M
                {
81
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82
                     all elements are atomic.  */
83
36.6M
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84
36.6M
                  int curr_decomposed_count;
85
86
36.6M
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87
36.6M
                  if (curr_decomposed_count >= 0)
88
4.29M
                    {
89
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90
                         decomposed[curr], making room.  It's not worth using
91
                         memcpy() here, since the counts are so small.  */
92
4.29M
                      int shift = curr_decomposed_count - 1;
93
94
4.29M
                      if (shift < 0)
95
0
                        abort ();
96
4.29M
                      if (shift > 0)
97
3.64M
                        {
98
3.64M
                          int j;
99
100
3.64M
                          decomposed_count += shift;
101
3.64M
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102
0
                            abort ();
103
3.66M
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104
17.0k
                            decomposed[j + shift] = decomposed[j];
105
3.64M
                        }
106
18.5M
                      for (; shift >= 0; shift--)
107
14.2M
                        decomposed[curr + shift] = curr_decomposed[shift];
108
4.29M
                    }
109
32.3M
                  else
110
32.3M
                    {
111
                      /* decomposed[curr] is atomic.  */
112
32.3M
                      curr++;
113
32.3M
                    }
114
36.6M
                }
115
22.4M
            }
116
22.4M
          }
117
7.47M
        else
118
7.47M
          {
119
7.47M
            count = 0;
120
7.47M
            decomposed_count = 0;
121
7.47M
          }
122
123
29.9M
        i = 0;
124
29.9M
        for (;;)
125
62.3M
          {
126
62.3M
            ucs4_t uc;
127
62.3M
            int ccc;
128
129
62.3M
            if (s < s_end)
130
54.8M
              {
131
                /* Fetch the next character from the decomposition.  */
132
54.8M
                if (i == decomposed_count)
133
22.4M
                  break;
134
32.3M
                uc = decomposed[i];
135
32.3M
                ccc = uc_combining_class (uc);
136
32.3M
              }
137
7.47M
            else
138
7.47M
              {
139
                /* End of string reached.  */
140
7.47M
                uc = 0;
141
7.47M
                ccc = 0;
142
7.47M
              }
143
144
39.8M
            if (ccc == 0)
145
35.9M
              {
146
35.9M
                size_t j;
147
148
                /* Apply the canonical ordering algorithm to the accumulated
149
                   sequence of characters.  */
150
35.9M
                if (sortbuf_count > 1)
151
2.03M
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152
2.03M
                                                           sortbuf + sortbuf_count);
153
154
35.9M
                if (composer != NULL)
155
35.9M
                  {
156
                    /* Attempt to combine decomposed characters, as specified
157
                       in the Unicode Standard Annex #15 "Unicode Normalization
158
                       Forms".  We need to check
159
                         1. whether the first accumulated character is a
160
                            "starter" (i.e. has ccc = 0).  This is usually the
161
                            case.  But when the string starts with a
162
                            non-starter, the sortbuf also starts with a
163
                            non-starter.  Btw, this check could also be
164
                            omitted, because the composition table has only
165
                            entries (code1, code2) for which code1 is a
166
                            starter; if the first accumulated character is not
167
                            a starter, no lookup will succeed.
168
                         2. If the sortbuf has more than one character, check
169
                            for each of these characters that are not "blocked"
170
                            from the starter (i.e. have a ccc that is higher
171
                            than the ccc of the previous character) whether it
172
                            can be combined with the first character.
173
                         3. If only one character is left in sortbuf, check
174
                            whether it can be combined with the next character
175
                            (also a starter).  */
176
35.9M
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177
28.4M
                      {
178
32.3M
                        for (j = 1; j < sortbuf_count; )
179
3.94M
                          {
180
3.94M
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181
2.06M
                              {
182
2.06M
                                ucs4_t combined =
183
2.06M
                                  composer (sortbuf[0].code, sortbuf[j].code);
184
2.06M
                                if (combined)
185
2.02M
                                  {
186
2.02M
                                    size_t k;
187
188
2.02M
                                    sortbuf[0].code = combined;
189
                                    /* sortbuf[0].ccc = 0, still valid.  */
190
3.62M
                                    for (k = j + 1; k < sortbuf_count; k++)
191
1.60M
                                      sortbuf[k - 1] = sortbuf[k];
192
2.02M
                                    sortbuf_count--;
193
2.02M
                                    continue;
194
2.02M
                                  }
195
2.06M
                              }
196
1.92M
                            j++;
197
1.92M
                          }
198
28.4M
                        if (s < s_end && sortbuf_count == 1)
199
20.9M
                          {
200
20.9M
                            ucs4_t combined =
201
20.9M
                              composer (sortbuf[0].code, uc);
202
20.9M
                            if (combined)
203
7.35k
                              {
204
7.35k
                                uc = combined;
205
7.35k
                                ccc = 0;
206
                                /* uc could be further combined with subsequent
207
                                   characters.  So don't put it into sortbuf[0] in
208
                                   this round, only in the next round.  */
209
7.35k
                                sortbuf_count = 0;
210
7.35k
                              }
211
20.9M
                          }
212
28.4M
                      }
213
35.9M
                  }
214
215
66.2M
                for (j = 0; j < sortbuf_count; j++)
216
30.3M
                  {
217
30.3M
                    ucs4_t muc = sortbuf[j].code;
218
219
                    /* Append muc to the result accumulator.  */
220
30.3M
                    if (length < allocated)
221
22.8M
                      {
222
22.8M
                        int ret =
223
22.8M
                          U_UCTOMB (result + length, muc, allocated - length);
224
22.8M
                        if (ret == -1)
225
0
                          {
226
0
                            errno = EINVAL;
227
0
                            goto fail;
228
0
                          }
229
22.8M
                        if (ret >= 0)
230
22.8M
                          {
231
22.8M
                            length += ret;
232
22.8M
                            goto done_appending;
233
22.8M
                          }
234
22.8M
                      }
235
7.49M
                    {
236
7.49M
                      size_t old_allocated = allocated;
237
7.49M
                      size_t new_allocated = 2 * old_allocated;
238
7.49M
                      if (new_allocated < 64)
239
7.47M
                        new_allocated = 64;
240
7.49M
                      if (new_allocated < old_allocated) /* integer overflow? */
241
0
                        abort ();
242
7.49M
                      {
243
7.49M
                        UNIT *larger_result;
244
7.49M
                        if (result == NULL)
245
7.47M
                          {
246
7.47M
                            larger_result =
247
7.47M
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248
7.47M
                            if (larger_result == NULL)
249
0
                              {
250
0
                                errno = ENOMEM;
251
0
                                goto fail;
252
0
                              }
253
7.47M
                          }
254
14.1k
                        else if (result == resultbuf)
255
0
                          {
256
0
                            larger_result =
257
0
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258
0
                            if (larger_result == NULL)
259
0
                              {
260
0
                                errno = ENOMEM;
261
0
                                goto fail;
262
0
                              }
263
0
                            U_CPY (larger_result, resultbuf, length);
264
0
                          }
265
14.1k
                        else
266
14.1k
                          {
267
14.1k
                            larger_result =
268
14.1k
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269
14.1k
                            if (larger_result == NULL)
270
0
                              {
271
0
                                errno = ENOMEM;
272
0
                                goto fail;
273
0
                              }
274
14.1k
                          }
275
7.49M
                        result = larger_result;
276
7.49M
                        allocated = new_allocated;
277
7.49M
                        {
278
7.49M
                          int ret =
279
7.49M
                            U_UCTOMB (result + length, muc, allocated - length);
280
7.49M
                          if (ret == -1)
281
0
                            {
282
0
                              errno = EINVAL;
283
0
                              goto fail;
284
0
                            }
285
7.49M
                          if (ret < 0)
286
0
                            abort ();
287
7.49M
                          length += ret;
288
7.49M
                          goto done_appending;
289
7.49M
                        }
290
7.49M
                      }
291
7.49M
                    }
292
30.3M
                   done_appending: ;
293
30.3M
                  }
294
295
                /* sortbuf is now empty.  */
296
35.9M
                sortbuf_count = 0;
297
35.9M
              }
298
299
39.8M
            if (!(s < s_end))
300
              /* End of string reached.  */
301
7.47M
              break;
302
303
            /* Append (uc, ccc) to sortbuf.  */
304
32.3M
            if (sortbuf_count == sortbuf_allocated)
305
1.18k
              {
306
1.18k
                struct ucs4_with_ccc *new_sortbuf;
307
308
1.18k
                sortbuf_allocated = 2 * sortbuf_allocated;
309
1.18k
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310
0
                  abort ();
311
1.18k
                new_sortbuf =
312
1.18k
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313
1.18k
                if (new_sortbuf == NULL)
314
0
                  {
315
0
                    errno = ENOMEM;
316
0
                    goto fail;
317
0
                  }
318
1.18k
                memcpy (new_sortbuf, sortbuf,
319
1.18k
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
320
1.18k
                if (sortbuf != sortbuf_preallocated)
321
687
                  free (sortbuf);
322
1.18k
                sortbuf = new_sortbuf;
323
1.18k
              }
324
32.3M
            sortbuf[sortbuf_count].code = uc;
325
32.3M
            sortbuf[sortbuf_count].ccc = ccc;
326
32.3M
            sortbuf_count++;
327
328
32.3M
            i++;
329
32.3M
          }
330
331
29.9M
        if (!(s < s_end))
332
          /* End of string reached.  */
333
7.47M
          break;
334
335
22.4M
        s += count;
336
22.4M
      }
337
7.47M
  }
338
339
7.47M
  if (length == 0)
340
0
    {
341
0
      if (result == NULL)
342
0
        {
343
          /* Return a non-NULL value.  NULL means error.  */
344
0
          result = (UNIT *) malloc (1);
345
0
          if (result == NULL)
346
0
            {
347
0
              errno = ENOMEM;
348
0
              goto fail;
349
0
            }
350
0
        }
351
0
    }
352
7.47M
  else if (result != resultbuf && length < allocated)
353
7.47M
    {
354
      /* Shrink the allocated memory if possible.  */
355
7.47M
      UNIT *memory;
356
357
7.47M
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
358
7.47M
      if (memory != NULL)
359
7.47M
        result = memory;
360
7.47M
    }
361
362
7.47M
  if (sortbuf_count > 0)
363
0
    abort ();
364
7.47M
  if (sortbuf != sortbuf_preallocated)
365
501
    free (sortbuf);
366
367
7.47M
  *lengthp = length;
368
7.47M
  return result;
369
370
0
 fail:
371
0
  {
372
0
    int saved_errno = errno;
373
0
    if (sortbuf != sortbuf_preallocated)
374
0
      free (sortbuf);
375
0
    if (result != resultbuf)
376
0
      free (result);
377
0
    errno = saved_errno;
378
0
  }
379
0
  return NULL;
380
7.47M
}