Coverage Report

Created: 2025-07-11 06:23

/src/libidn/lib/nfkc.c
Line
Count
Source (jump to first uncovered line)
1
/* nfkc.c --- Unicode normalization utilities.
2
   Copyright (C) 2002-2025 Simon Josefsson
3
4
   This file is part of GNU Libidn.
5
6
   GNU Libidn is free software: you can redistribute it and/or
7
   modify it under the terms of either:
8
9
     * the GNU Lesser General Public License as published by the Free
10
       Software Foundation; either version 3 of the License, or (at
11
       your option) any later version.
12
13
   or
14
15
     * the GNU General Public License as published by the Free
16
       Software Foundation; either version 2 of the License, or (at
17
       your option) any later version.
18
19
   or both in parallel, as here.
20
21
   GNU Libidn is distributed in the hope that it will be useful,
22
   but WITHOUT ANY WARRANTY; without even the implied warranty of
23
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24
   General Public License for more details.
25
26
   You should have received copies of the GNU General Public License and
27
   the GNU Lesser General Public License along with this program.  If
28
   not, see <https://www.gnu.org/licenses/>. */
29
30
#ifdef HAVE_CONFIG_H
31
# include "config.h"
32
#endif
33
34
#include <stdlib.h>
35
#include <string.h>
36
37
#include "stringprep.h"
38
39
/* Hacks to make syncing with GLIB code easier. */
40
12.2M
/* Scalar type aliases: map the GLIB spellings used by the imported
   code onto plain C and <stdint.h> types.  */
#define gboolean  int
#define gchar     char
#define guchar    unsigned char
#define gint      int
#define guint     unsigned int
#define gushort   unsigned short
#define gint16    int16_t
#define guint16   uint16_t
#define gunichar  uint32_t

/* Size type aliases.  */
#define gsize     size_t
#define gssize    ssize_t

/* Allocator aliases: GLIB allocation calls go straight to libc.  */
#define g_malloc  malloc
#define g_free    free
53
19.2M
/* Return VAL from the enclosing function when EXPR is false.

   The body is wrapped in do { } while (0) so that the expansion plus
   its trailing semicolon is exactly one statement.  A bare { } block
   followed by `;' is two statements, which breaks when the macro is
   the body of an unbraced `if' that has an `else'.  */
#define g_return_val_if_fail(expr,val)  \
  do                                    \
    {                                   \
      if (!(expr))                      \
        return (val);                   \
    }                                   \
  while (0)
57
58
/* Code from GLIB gmacros.h starts here. */
59
60
/* GLIB - Library of useful routines for C programming
61
 * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
62
 *
63
 * This library is free software; you can redistribute it and/or
64
 * modify it under the terms of the GNU Lesser General Public
65
 * License as published by the Free Software Foundation; either
66
 * version 2 of the License, or (at your option) any later version.
67
 *
68
 * This library is distributed in the hope that it will be useful,
69
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
70
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
71
 * Lesser General Public License for more details.
72
 */
73
74
/* Boolean constants; only defined when the platform has not already
   provided them.  */
#ifndef FALSE
# define FALSE (0)
#endif

#ifndef TRUE
# define TRUE (!FALSE)
#endif

/* Element count of ARR.  ARR must be a real array, not a pointer.  */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof (*(arr)))

/* In this file the hint is a no-op: it simply yields EXPR.  */
#define G_UNLIKELY(expr) (expr)
85
86
/* Code from GLIB gunicode.h starts here. */
87
88
/* gunicode.h - Unicode manipulation functions
89
 *
90
 *  Copyright (C) 1999, 2000 Tom Tromey
91
 *  Copyright 2000, 2005 Red Hat, Inc.
92
 *
93
 * The Gnome Library is free software; you can redistribute it and/or
94
 * modify it under the terms of the GNU Lesser General Public License as
95
 * published by the Free Software Foundation; either version 2 of the
96
 * License, or (at your option) any later version.
97
 *
98
 * The Gnome Library is distributed in the hope that it will be useful,
99
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
100
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
101
 * Lesser General Public License for more details.
102
 */
103
104
/* Normalization forms.  Each GLIB name is paired with the Unicode
   name for the same form; the paired enumerators share one value.  */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,

  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,

  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,

  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
116
117
43.8M
/* Step P forward by the entry of g_utf8_skip selected by the byte at
   *P (read as unsigned char).  P itself is not modified; the macro
   yields the advanced pointer.  */
#define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
118
119
/* Code from GLIB gutf8.c starts here. */
120
121
/* gutf8.c - Operations on UTF-8 strings.
122
 *
123
 * Copyright (C) 1999 Tom Tromey
124
 * Copyright (C) 2000 Red Hat, Inc.
125
 *
126
 * This library is free software; you can redistribute it and/or
127
 * modify it under the terms of the GNU Lesser General Public
128
 * License as published by the Free Software Foundation; either
129
 * version 2 of the License, or (at your option) any later version.
130
 *
131
 * This library is distributed in the hope that it will be useful,
132
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
133
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
134
 * Lesser General Public License for more details.
135
 */
136
137
/* Classify the UTF-8 lead byte CHAR.

   On return LEN holds the sequence length (1..6) implied by the lead
   byte and MASK holds the bit mask that extracts the payload bits of
   that lead byte.  If CHAR is not a valid lead byte, LEN is set to -1
   and MASK is left untouched.

   Every argument is parenthesized so that expressions such as `a | b'
   are classified as a whole, and the body is wrapped in
   do { } while (0) so that the macro is a single statement.  CHAR is
   evaluated more than once, so it must not have side effects.  */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  do                                            \
    {                                           \
      if ((Char) < 128)                         \
        {                                       \
          (Len) = 1;                            \
          (Mask) = 0x7f;                        \
        }                                       \
      else if (((Char) & 0xe0) == 0xc0)         \
        {                                       \
          (Len) = 2;                            \
          (Mask) = 0x1f;                        \
        }                                       \
      else if (((Char) & 0xf0) == 0xe0)         \
        {                                       \
          (Len) = 3;                            \
          (Mask) = 0x0f;                        \
        }                                       \
      else if (((Char) & 0xf8) == 0xf0)         \
        {                                       \
          (Len) = 4;                            \
          (Mask) = 0x07;                        \
        }                                       \
      else if (((Char) & 0xfc) == 0xf8)         \
        {                                       \
          (Len) = 5;                            \
          (Mask) = 0x03;                        \
        }                                       \
      else if (((Char) & 0xfe) == 0xfc)         \
        {                                       \
          (Len) = 6;                            \
          (Mask) = 0x01;                        \
        }                                       \
      else                                      \
        (Len) = -1;                             \
    }                                           \
  while (0)
170
171
/* Number of bytes (1..6) needed to encode the code point CHAR in
   UTF-8.  The thresholds are tested from the largest downwards; CHAR
   is evaluated more than once, so it must not have side effects.  */
#define UTF8_LENGTH(Char)               \
  ((Char) >= 0x4000000 ? 6              \
   : (Char) >= 0x200000 ? 5             \
   : (Char) >= 0x10000 ? 4              \
   : (Char) >= 0x800 ? 3                \
   : (Char) >= 0x80 ? 2                 \
   : 1)
177
178
/* Decode one UTF-8 sequence of LEN bytes starting at CHARS into RESULT.

   MASK selects the payload bits of the lead byte; COUNT is a scratch
   loop variable.  Each following byte must look like 10xxxxxx: if one
   does not, RESULT is set to -1 and decoding stops.

   The two steps (initial assignment and loop) are wrapped in
   do { } while (0) so that the macro is one statement; without the
   wrapper, an unbraced `if' in front of the macro would guard only the
   first assignment.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                       \
  do                                                                    \
    {                                                                   \
      (Result) = (Chars)[0] & (Mask);                                   \
      for ((Count) = 1; (Count) < (Len); ++(Count))                     \
        {                                                               \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                        \
            {                                                           \
              (Result) = -1;                                            \
              break;                                                    \
            }                                                           \
          (Result) <<= 6;                                               \
          (Result) |= ((Chars)[(Count)] & 0x3f);                        \
        }                                                               \
    }                                                                   \
  while (0)
190
191
/* Sequence length indexed by lead byte, one row per high nibble.  */
static const gchar utf8_skip_data[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* Read-only alias through which g_utf8_next_char() indexes the table.  */
static const gchar *const g_utf8_skip = utf8_skip_data;
211
212
/*
213
 * g_utf8_strlen:
214
 * @p: pointer to the start of a UTF-8 encoded string
215
 * @max: the maximum number of bytes to examine. If @max
216
 *       is less than 0, then the string is assumed to be
217
 *       nul-terminated. If @max is 0, @p will not be examined and
218
 *       may be %NULL.
219
 *
220
 * Computes the length of the string in characters, not including
221
 * the terminating nul character.
222
 *
223
 * Return value: the length of the string in characters
224
 **/
225
static gsize
226
g_utf8_strlen (const gchar *p)
227
1.31M
{
228
1.31M
  gsize len = 0;
229
230
1.31M
  g_return_val_if_fail (p != NULL, 0);
231
232
5.36M
  while (*p)
233
4.05M
    {
234
4.05M
      p = g_utf8_next_char (p);
235
4.05M
      ++len;
236
4.05M
    }
237
238
1.31M
  return len;
239
1.31M
}
240
241
/*
242
 * g_utf8_get_char:
243
 * @p: a pointer to Unicode character encoded as UTF-8
244
 *
245
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
246
 * If @p does not point to a valid UTF-8 encoded character, results are
247
 * undefined. If you are not sure that the bytes are complete
248
 * valid Unicode characters, you should use g_utf8_get_char_validated()
249
 * instead.
250
 *
251
 * Return value: the resulting character
252
 **/
253
static gunichar
254
g_utf8_get_char (const gchar *p)
255
18.2M
{
256
18.2M
  int i, mask = 0, len;
257
18.2M
  gunichar result;
258
18.2M
  unsigned char c = (unsigned char) *p;
259
260
18.2M
  UTF8_COMPUTE (c, mask, len);
261
18.2M
  if (len == -1)
262
0
    return (gunichar) - 1;
263
18.2M
  UTF8_GET (result, p, i, mask, len);
264
265
18.2M
  return result;
266
18.2M
}
267
268
/*
269
 * g_unichar_to_utf8:
270
 * @c: a Unicode character code
271
 * @outbuf: output buffer, must have at least 6 bytes of space.
272
 *       If %NULL, the length will be computed and returned
273
 *       and nothing will be written to @outbuf.
274
 *
275
 * Converts a single character to UTF-8.
276
 *
277
 * Return value: number of bytes written
278
 **/
279
static int
280
g_unichar_to_utf8 (gunichar c, gchar *outbuf)
281
20.5M
{
282
  /* If this gets modified, also update the copy in g_string_insert_unichar() */
283
20.5M
  guint len = 0;
284
20.5M
  int first;
285
20.5M
  int i;
286
287
20.5M
  if (c < 0x80)
288
2.58M
    {
289
2.58M
      first = 0;
290
2.58M
      len = 1;
291
2.58M
    }
292
17.9M
  else if (c < 0x800)
293
17.3M
    {
294
17.3M
      first = 0xc0;
295
17.3M
      len = 2;
296
17.3M
    }
297
554k
  else if (c < 0x10000)
298
510k
    {
299
510k
      first = 0xe0;
300
510k
      len = 3;
301
510k
    }
302
43.8k
  else if (c < 0x200000)
303
43.8k
    {
304
43.8k
      first = 0xf0;
305
43.8k
      len = 4;
306
43.8k
    }
307
0
  else if (c < 0x4000000)
308
0
    {
309
0
      first = 0xf8;
310
0
      len = 5;
311
0
    }
312
0
  else
313
0
    {
314
0
      first = 0xfc;
315
0
      len = 6;
316
0
    }
317
318
20.5M
  if (outbuf)
319
20.5M
    {
320
39.0M
      for (i = len - 1; i > 0; --i)
321
18.5M
  {
322
18.5M
    outbuf[i] = (c & 0x3f) | 0x80;
323
18.5M
    c >>= 6;
324
18.5M
  }
325
20.5M
      outbuf[0] = c | first;
326
20.5M
    }
327
328
20.5M
  return len;
329
20.5M
}
330
331
/*
332
 * g_utf8_to_ucs4_fast:
333
 * @str: a UTF-8 encoded string
334
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
335
 *       then the string is nul-terminated.
336
 * @items_written: location to store the number of characters in the
337
 *                 result, or %NULL.
338
 *
339
 * Convert a string from UTF-8 to a 32-bit fixed width
340
 * representation as UCS-4, assuming valid UTF-8 input.
341
 * This function is roughly twice as fast as g_utf8_to_ucs4()
342
 * but does no error checking on the input. A trailing 0 character
343
 * will be added to the string after the converted text.
344
 *
345
 * Return value: a pointer to a newly allocated UCS-4 string.
346
 *               This value must be freed with g_free().
347
 **/
348
static gunichar *
349
g_utf8_to_ucs4_fast (const gchar *str, gssize len, gsize *items_written)
350
17.9M
{
351
17.9M
  gunichar *result;
352
17.9M
  gsize n_chars, i;
353
17.9M
  const gchar *p;
354
355
17.9M
  g_return_val_if_fail (str != NULL, NULL);
356
357
17.9M
  p = str;
358
17.9M
  n_chars = 0;
359
17.9M
  if (len < 0)
360
17.9M
    {
361
39.4M
      while (*p)
362
21.5M
  {
363
21.5M
    p = g_utf8_next_char (p);
364
21.5M
    ++n_chars;
365
21.5M
  }
366
17.9M
    }
367
0
  else
368
0
    {
369
0
      while (p < str + len && *p)
370
0
  {
371
0
    p = g_utf8_next_char (p);
372
0
    ++n_chars;
373
0
  }
374
0
    }
375
376
17.9M
  result = g_malloc (sizeof (gunichar) * (n_chars + 1));
377
17.9M
  if (!result)
378
0
    return NULL;
379
380
17.9M
  p = str;
381
39.4M
  for (i = 0; i < n_chars; i++)
382
21.5M
    {
383
21.5M
      gunichar wc = (guchar) * p++;
384
385
21.5M
      if (wc < 0x80)
386
3.28M
  {
387
3.28M
    result[i] = wc;
388
3.28M
  }
389
18.2M
      else
390
18.2M
  {
391
18.2M
    gunichar mask = 0x40;
392
393
18.2M
    if (G_UNLIKELY ((wc & mask) == 0))
394
0
      {
395
        /* It's an out-of-sequence 10xxxxxxx byte.
396
         * Rather than making an ugly hash of this and the next byte
397
         * and overrunning the buffer, it's more useful to treat it
398
         * with a replacement character */
399
0
        result[i] = 0xfffd;
400
0
        continue;
401
0
      }
402
403
18.2M
    do
404
19.0M
      {
405
19.0M
        wc <<= 6;
406
19.0M
        wc |= (guchar) (*p++) & 0x3f;
407
19.0M
        mask <<= 5;
408
19.0M
      }
409
19.0M
    while ((wc & mask) != 0);
410
411
18.2M
    wc &= mask - 1;
412
413
18.2M
    result[i] = wc;
414
18.2M
  }
415
21.5M
    }
416
17.9M
  result[i] = 0;
417
418
17.9M
  if (items_written)
419
12.1M
    *items_written = i;
420
421
17.9M
  return result;
422
17.9M
}
423
424
/*
425
 * g_ucs4_to_utf8:
426
 * @str: a UCS-4 encoded string
427
 * @len: the maximum length (number of characters) of @str to use.
428
 *       If @len < 0, then the string is nul-terminated.
429
 * @items_read: location to store number of characters read, or %NULL.
430
 * @items_written: location to store number of bytes written or %NULL.
431
 *                 The value here stored does not include the trailing 0
432
 *                 byte.
433
 * @error: location to store the error occurring, or %NULL to ignore
434
 *         errors. Any of the errors in #GConvertError other than
435
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
436
 *
437
 * Convert a string from a 32-bit fixed width representation as UCS-4.
438
 * to UTF-8. The result will be terminated with a 0 byte.
439
 *
440
 * Return value: a pointer to a newly allocated UTF-8 string.
441
 *               This value must be freed with g_free(). If an
442
 *               error occurs, %NULL will be returned and
443
 *               @error set. In that case, @items_read will be
444
 *               set to the position of the first invalid input
445
 *               character.
446
 **/
447
static gchar *
448
g_ucs4_to_utf8 (const gunichar *str,
449
    gsize len, gsize *items_read, gsize *items_written)
450
17.6M
{
451
17.6M
  gint result_length;
452
17.6M
  gchar *result = NULL;
453
17.6M
  gchar *p;
454
17.6M
  gsize i;
455
456
17.6M
  result_length = 0;
457
38.1M
  for (i = 0; i < len; i++)
458
20.5M
    {
459
20.5M
      if (!str[i])
460
0
  break;
461
462
20.5M
      if (str[i] >= 0x80000000)
463
0
  goto err_out;
464
465
20.5M
      result_length += UTF8_LENGTH (str[i]);
466
20.5M
    }
467
468
17.6M
  result = g_malloc (result_length + 1);
469
17.6M
  if (!result)
470
0
    return NULL;
471
17.6M
  p = result;
472
473
17.6M
  i = 0;
474
38.1M
  while (p < result + result_length)
475
20.5M
    p += g_unichar_to_utf8 (str[i++], p);
476
477
17.6M
  *p = '\0';
478
479
17.6M
  if (items_written)
480
0
    *items_written = p - result;
481
482
17.6M
err_out:
483
17.6M
  if (items_read)
484
0
    *items_read = i;
485
486
17.6M
  return result;
487
17.6M
}
488
489
/* Code from GLIB gunidecomp.c starts here. */
490
491
/* decomp.c - Character decomposition.
492
 *
493
 *  Copyright (C) 1999, 2000 Tom Tromey
494
 *  Copyright 2000 Red Hat, Inc.
495
 *
496
 * The Gnome Library is free software; you can redistribute it and/or
497
 * modify it under the terms of the GNU Lesser General Public License as
498
 * published by the Free Software Foundation; either version 2 of the
499
 * License, or (at your option) any later version.
500
 *
501
 * The Gnome Library is distributed in the hope that it will be useful,
502
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
503
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
504
 * Lesser General Public License for more details.
505
 */
506
507
#include "gunidecomp.h"
508
#include "gunicomp.h"
509
510
/* Two-level lookup shared by both halves of the combining class
   table.  A page entry at or above G_UNICODE_MAX_TABLE_INDEX stores
   the class for the whole page directly (offset by that constant);
   otherwise it selects a row of cclass_data indexed by CHAR.  */
#define CC_LOOKUP(Table, Page, Char)                    \
  ((Table[Page] >= G_UNICODE_MAX_TABLE_INDEX)           \
   ? (Table[Page] - G_UNICODE_MAX_TABLE_INDEX)          \
   : (cclass_data[Table[Page]][Char]))

#define CC_PART1(Page, Char)                            \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char)                            \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of CHAR: part 1 covers everything up to
   G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
   G_UNICODE_LAST_CHAR, and any other value has class 0.  */
#define COMBINING_CLASS(Char)                                   \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1)                        \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff)                      \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR)      \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)       \
      : 0))
526
527
/* Constants for Hangul syllable [de]composition: the first code point
   of the precomposed syllables and of each jamo range, followed by
   the number of entries in each range.  */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,
  SCount = LCount * NCount
};
537
538
/*
539
 * g_unicode_canonical_ordering:
540
 * @string: a UCS-4 encoded string.
541
 * @len: the maximum length of @string to use.
542
 *
543
 * Computes the canonical ordering of a string in-place.
544
 * This rearranges decomposed characters in the string
545
 * according to their combining classes.  See the Unicode
546
 * manual for more information.
547
 **/
548
static void
549
g_unicode_canonical_ordering (gunichar *string, gsize len)
550
13.1M
{
551
13.1M
  gsize i;
552
13.1M
  int swap = 1;
553
554
26.3M
  while (swap)
555
13.1M
    {
556
13.1M
      int last;
557
13.1M
      swap = 0;
558
13.1M
      last = COMBINING_CLASS (string[0]);
559
19.6M
      for (i = 0; i < len - 1; ++i)
560
6.47M
  {
561
6.47M
    int next = COMBINING_CLASS (string[i + 1]);
562
6.47M
    if (next != 0 && last > next)
563
4.80k
      {
564
4.80k
        gsize j;
565
        /* Percolate item leftward through string.  */
566
14.9k
        for (j = i + 1; j > 0; --j)
567
14.7k
    {
568
14.7k
      gunichar t;
569
14.7k
      if (COMBINING_CLASS (string[j - 1]) <= next)
570
4.56k
        break;
571
10.1k
      t = string[j];
572
10.1k
      string[j] = string[j - 1];
573
10.1k
      string[j - 1] = t;
574
10.1k
      swap = 1;
575
10.1k
    }
576
        /* We're re-entering the loop looking at the old
577
           character again.  */
578
4.80k
        next = last;
579
4.80k
      }
580
6.47M
    last = next;
581
6.47M
  }
582
13.1M
    }
583
13.1M
}
584
585
/* http://www.unicode.org/unicode/reports/tr15/#Hangul
586
 * r should be null or have sufficient space. Calling with r == NULL will
587
 * only calculate the result_len; however, a buffer with space for three
588
 * characters will always be big enough. */
589
static void
590
decompose_hangul (gunichar s, gunichar *r, gsize *result_len)
591
4.95k
{
592
4.95k
  gint SIndex = s - SBase;
593
4.95k
  gint TIndex = SIndex % TCount;
594
595
4.95k
  if (r)
596
2.47k
    {
597
2.47k
      r[0] = LBase + SIndex / NCount;
598
2.47k
      r[1] = VBase + (SIndex % NCount) / TCount;
599
2.47k
    }
600
601
4.95k
  if (TIndex)
602
2.56k
    {
603
2.56k
      if (r)
604
1.28k
  r[2] = TBase + TIndex;
605
2.56k
      *result_len = 3;
606
2.56k
    }
607
2.39k
  else
608
2.39k
    *result_len = 2;
609
4.95k
}
610
611
/* returns a pointer to a null-terminated UTF-8 string */
612
static const gchar *
613
find_decomposition (gunichar ch, gboolean compat)
614
14.1M
{
615
14.1M
  int start = 0;
616
14.1M
  int end = G_N_ELEMENTS (decomp_table);
617
618
14.1M
  if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
619
12.5M
    {
620
161M
      while (TRUE)
621
161M
  {
622
161M
    int half = (start + end) / 2;
623
161M
    if (ch == decomp_table[half].ch)
624
2.63M
      {
625
2.63M
        int offset;
626
627
2.63M
        if (compat)
628
2.63M
    {
629
2.63M
      offset = decomp_table[half].compat_offset;
630
2.63M
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
631
1.08M
        offset = decomp_table[half].canon_offset;
632
2.63M
    }
633
0
        else
634
0
    {
635
0
      offset = decomp_table[half].canon_offset;
636
0
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
637
0
        return NULL;
638
0
    }
639
640
2.63M
        return &(decomp_expansion_string[offset]);
641
2.63M
      }
642
158M
    else if (half == start)
643
9.94M
      break;
644
148M
    else if (ch > decomp_table[half].ch)
645
45.7M
      start = half;
646
102M
    else
647
102M
      end = half;
648
161M
  }
649
12.5M
    }
650
651
11.5M
  return NULL;
652
14.1M
}
653
654
/* L,V => LV and LV,T => LVT  */
655
static gboolean
656
combine_hangul (gunichar a, gunichar b, gunichar *result)
657
3.70M
{
658
3.70M
  if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
659
2.95k
    {
660
2.95k
      gint LIndex = a - LBase;
661
2.95k
      gint VIndex = b - VBase;
662
663
2.95k
      *result = SBase + (LIndex * VCount + VIndex) * TCount;
664
2.95k
      return TRUE;
665
2.95k
    }
666
667
3.70M
  if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
668
1.50k
    {
669
1.50k
      gint SIndex = a - SBase;
670
671
1.50k
      if ((SIndex % TCount) == 0)
672
1.28k
  {
673
1.28k
    gint TIndex = b - TBase;
674
675
1.28k
    *result = a + TIndex;
676
1.28k
    return TRUE;
677
1.28k
  }
678
1.50k
    }
679
680
3.70M
  return FALSE;
681
3.70M
}
682
683
/* Two-level lookup in the composition index table.  A page entry at
   or above G_UNICODE_MAX_TABLE_INDEX stores the index for the whole
   page directly (offset by that constant); otherwise it selects a row
   of compose_data indexed by CHAR.  */
#define CI(Page, Char)                                  \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX)   \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX)  \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of CHAR, or 0 when its page lies beyond
   COMPOSE_TABLE_LAST.  CHAR is parenthesized at every use so that an
   expression argument is shifted and masked as a whole; it is
   evaluated more than once and must not have side effects.  */
#define COMPOSE_INDEX(Char)                                             \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST))                               \
   ? 0 : CI ((Char) >> 8, (Char) & 0xff))
690
691
static gboolean
692
combine (gunichar a, gunichar b, gunichar *result)
693
3.70M
{
694
3.70M
  gushort index_a, index_b;
695
696
3.70M
  if (combine_hangul (a, b, result))
697
4.23k
    return TRUE;
698
699
3.70M
  index_a = COMPOSE_INDEX (a);
700
701
3.70M
  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
702
278k
    {
703
278k
      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
704
24.3k
  {
705
24.3k
    *result =
706
24.3k
      compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
707
24.3k
    return TRUE;
708
24.3k
  }
709
253k
      else
710
253k
  return FALSE;
711
278k
    }
712
713
3.42M
  index_b = COMPOSE_INDEX (b);
714
715
3.42M
  if (index_b >= COMPOSE_SECOND_SINGLE_START)
716
3.31k
    {
717
3.31k
      if (a ==
718
3.31k
    compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
719
2.94k
  {
720
2.94k
    *result =
721
2.94k
      compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
722
2.94k
    return TRUE;
723
2.94k
  }
724
374
      else
725
374
  return FALSE;
726
3.31k
    }
727
728
3.42M
  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
729
3.42M
      && index_b >= COMPOSE_SECOND_START
730
3.42M
      && index_b < COMPOSE_SECOND_SINGLE_START)
731
530k
    {
732
530k
      gunichar res =
733
530k
  compose_array[index_a - COMPOSE_FIRST_START][index_b -
734
530k
                 COMPOSE_SECOND_START];
735
736
530k
      if (res)
737
530k
  {
738
530k
    *result = res;
739
530k
    return TRUE;
740
530k
  }
741
530k
    }
742
743
2.89M
  return FALSE;
744
3.42M
}
745
746
static gunichar *
747
_g_utf8_normalize_wc (const gchar *str, gssize max_len, GNormalizeMode mode)
748
6.11M
{
749
6.11M
  gsize n_wc;
750
6.11M
  gunichar *wc_buffer;
751
6.11M
  const char *p;
752
6.11M
  gsize last_start;
753
6.11M
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
754
6.11M
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
755
756
6.11M
  n_wc = 0;
757
6.11M
  p = str;
758
13.2M
  while ((max_len < 0 || p < str + max_len) && *p)
759
7.09M
    {
760
7.09M
      const gchar *decomp;
761
7.09M
      gunichar wc = g_utf8_get_char (p);
762
763
7.09M
      if (wc >= SBase && wc < SBase + SCount)
764
2.47k
  {
765
2.47k
    gsize result_len;
766
2.47k
    decompose_hangul (wc, NULL, &result_len);
767
2.47k
    n_wc += result_len;
768
2.47k
  }
769
7.09M
      else
770
7.09M
  {
771
7.09M
    decomp = find_decomposition (wc, do_compat);
772
773
7.09M
    if (decomp)
774
1.31M
      n_wc += g_utf8_strlen (decomp);
775
5.77M
    else
776
5.77M
      n_wc++;
777
7.09M
  }
778
779
7.09M
      p = g_utf8_next_char (p);
780
7.09M
    }
781
782
6.11M
  wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
783
6.11M
  if (!wc_buffer)
784
0
    return NULL;
785
786
6.11M
  last_start = 0;
787
6.11M
  n_wc = 0;
788
6.11M
  p = str;
789
13.2M
  while ((max_len < 0 || p < str + max_len) && *p)
790
7.09M
    {
791
7.09M
      gunichar wc = g_utf8_get_char (p);
792
7.09M
      const gchar *decomp;
793
7.09M
      int cc;
794
7.09M
      gsize old_n_wc = n_wc;
795
796
7.09M
      if (wc >= SBase && wc < SBase + SCount)
797
2.47k
  {
798
2.47k
    gsize result_len;
799
2.47k
    decompose_hangul (wc, wc_buffer + n_wc, &result_len);
800
2.47k
    n_wc += result_len;
801
2.47k
  }
802
7.09M
      else
803
7.09M
  {
804
7.09M
    decomp = find_decomposition (wc, do_compat);
805
806
7.09M
    if (decomp)
807
1.31M
      {
808
1.31M
        const char *pd;
809
5.36M
        for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
810
4.05M
    wc_buffer[n_wc++] = g_utf8_get_char (pd);
811
1.31M
      }
812
5.77M
    else
813
5.77M
      wc_buffer[n_wc++] = wc;
814
7.09M
  }
815
816
7.09M
      if (n_wc > 0)
817
7.09M
  {
818
7.09M
    cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
819
820
7.09M
    if (cc == 0)
821
7.07M
      {
822
7.07M
        g_unicode_canonical_ordering (wc_buffer + last_start,
823
7.07M
              n_wc - last_start);
824
7.07M
        last_start = old_n_wc;
825
7.07M
      }
826
7.09M
  }
827
828
7.09M
      p = g_utf8_next_char (p);
829
7.09M
    }
830
831
6.11M
  if (n_wc > 0)
832
6.11M
    {
833
6.11M
      g_unicode_canonical_ordering (wc_buffer + last_start,
834
6.11M
            n_wc - last_start);
835
      /* dead assignment: last_start = n_wc; */
836
6.11M
    }
837
838
6.11M
  wc_buffer[n_wc] = 0;
839
840
  /* All decomposed and reordered */
841
842
6.11M
  if (do_compose && n_wc > 0)
843
6.11M
    {
844
6.11M
      gsize i, j;
845
6.11M
      int last_cc = 0;
846
6.11M
      last_start = 0;
847
848
15.9M
      for (i = 0; i < n_wc; i++)
849
9.83M
  {
850
9.83M
    int cc = COMBINING_CLASS (wc_buffer[i]);
851
852
9.83M
    if (i > 0 &&
853
9.83M
        (last_cc == 0 || last_cc != cc) &&
854
9.83M
        combine (wc_buffer[last_start], wc_buffer[i],
855
3.70M
           &wc_buffer[last_start]))
856
562k
      {
857
1.04M
        for (j = i + 1; j < n_wc; j++)
858
479k
    wc_buffer[j - 1] = wc_buffer[j];
859
562k
        n_wc--;
860
562k
        i--;
861
862
562k
        if (i == last_start)
863
559k
    last_cc = 0;
864
2.26k
        else
865
2.26k
    last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
866
867
562k
        continue;
868
562k
      }
869
870
9.27M
    if (cc == 0)
871
9.24M
      last_start = i;
872
873
9.27M
    last_cc = cc;
874
9.27M
  }
875
6.11M
    }
876
877
6.11M
  wc_buffer[n_wc] = 0;
878
879
6.11M
  return wc_buffer;
880
6.11M
}
881
882
/*
883
 * g_utf8_normalize:
884
 * @str: a UTF-8 encoded string.
885
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
886
 * @mode: the type of normalization to perform.
887
 *
888
 * Converts a string into canonical form, standardizing
889
 * such issues as whether a character with an accent
890
 * is represented as a base character and combining
891
 * accent or as a single precomposed character. The
892
 * string has to be valid UTF-8, otherwise %NULL is
893
 * returned. You should generally call g_utf8_normalize()
894
 * before comparing two Unicode strings.
895
 *
896
 * The normalization mode %G_NORMALIZE_DEFAULT only
897
 * standardizes differences that do not affect the
898
 * text content, such as the above-mentioned accent
899
 * representation. %G_NORMALIZE_ALL also standardizes
900
 * the "compatibility" characters in Unicode, such
901
 * as SUPERSCRIPT THREE to the standard forms
902
 * (in this case DIGIT THREE). Formatting information
903
 * may be lost but for most text operations such
904
 * characters should be considered the same.
905
 *
906
 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
907
 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
908
 * but returned a result with composed forms rather
909
 * than a maximally decomposed form. This is often
910
 * useful if you intend to convert the string to
911
 * a legacy encoding or pass it to a system with
912
 * less capable Unicode handling.
913
 *
914
 * Return value: a newly allocated string, that is the
915
 *   normalized form of @str, or %NULL if @str is not
916
 *   valid UTF-8.
917
 **/
918
static gchar *
919
g_utf8_normalize (const gchar *str, gssize len, GNormalizeMode mode)
920
0
{
921
0
  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
922
0
  gchar *result = NULL;
923
924
0
  if (result_wc)
925
0
    result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
926
927
0
  g_free (result_wc);
928
929
0
  return result;
930
0
}
931
932
/* Public Libidn API starts here. */
933
934
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the UTF-8 byte sequence starting at @p into a single
 * Unicode character.  If @p does not point to a valid UTF-8 encoded
 * character, results are undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  const uint32_t ch = g_utf8_get_char (p);

  return ch;
}
949
950
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, only the length is computed and nothing is
 *       written to @outbuf.
 *
 * Encodes the single character @c as UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
966
967
#include <unistr.h>
968
969
/**
 * stringprep_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is nul-terminated.
 * @items_written: location to store the number of characters in the
 *                 result, or %NULL.
 *
 * Convert a string from UTF-8 to a 32-bit fixed width representation
 * as UCS-4.  The input is first checked for UTF-8 validity (earlier
 * versions were documented as doing no error checking), and %NULL is
 * returned if that check fails.
 *
 * Return value: a pointer to a newly allocated UCS-4 string.
 *               This value must be deallocated by the caller.
 **/
uint32_t *
stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
{
  /* Number of bytes to validate: the whole string when no length
     was given, otherwise exactly LEN bytes.  */
  size_t n_bytes = (len < 0) ? strlen (str) : (size_t) len;

  if (u8_check ((const uint8_t *) str, n_bytes))
    return NULL;

  return g_utf8_to_ucs4_fast (str, len, items_written);
}
1000
1001
/**
 * stringprep_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of characters read, or %NULL.
 * @items_written: location to store number of bytes written or %NULL.
 *                 The value here stored does not include the trailing 0
 *                 byte.
 *
 * Convert a string from a 32-bit fixed width representation as UCS-4
 * to UTF-8. The result will be terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be deallocated by the caller.
 *               If an error occurs, %NULL will be returned.
 **/
char *
stringprep_ucs4_to_utf8 (const uint32_t *str, ssize_t len,
                         size_t *items_read, size_t *items_written)
{
  char *utf8 = g_ucs4_to_utf8 (str, len, items_read, items_written);

  return utf8;
}
1024
1025
/**
1026
 * stringprep_utf8_nfkc_normalize:
1027
 * @str: a UTF-8 encoded string.
1028
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1029
 *
1030
 * Converts a string into canonical form, standardizing
1031
 * such issues as whether a character with an accent
1032
 * is represented as a base character and combining
1033
 * accent or as a single precomposed character.
1034
 *
1035
 * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1036
 * differences that do not affect the text content, such as the
1037
 * above-mentioned accent representation. It standardizes the
1038
 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1039
 * the standard forms (in this case DIGIT THREE). Formatting
1040
 * information may be lost but for most text operations such
1041
 * characters should be considered the same. It returns a result with
1042
 * composed forms rather than a maximally decomposed form.
1043
 *
1044
 * Return value: a newly allocated string, that is the
1045
 *   NFKC normalized form of @str.
1046
 **/
1047
char *
1048
stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1049
0
{
1050
0
  size_t n;
1051
1052
0
  if (len < 0)
1053
0
    n = strlen (str);
1054
0
  else
1055
0
    n = len;
1056
1057
0
  if (u8_check ((const uint8_t *) str, n))
1058
0
    return NULL;
1059
1060
0
  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1061
0
}
1062
1063
#include <stdio.h>
1064
/**
1065
 * stringprep_ucs4_nfkc_normalize:
1066
 * @str: a Unicode string.
1067
 * @len: length of @str array, or -1 if @str is nul-terminated.
1068
 *
1069
 * Converts a UCS4 string into canonical form, see
1070
 * stringprep_utf8_nfkc_normalize() for more information.
1071
 *
1072
 * Return value: a newly allocated Unicode string, that is the NFKC
1073
 *   normalized form of @str.
1074
 **/
1075
uint32_t *
1076
stringprep_ucs4_nfkc_normalize (const uint32_t *str, ssize_t len)
1077
6.11M
{
1078
6.11M
  char *p;
1079
6.11M
  uint32_t *result_wc;
1080
1081
6.11M
  p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1082
6.11M
  if (!p)
1083
0
    return NULL;
1084
1085
6.11M
  result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1086
6.11M
  free (p);
1087
1088
6.11M
  return result_wc;
1089
6.11M
}