Coverage Report

Created: 2025-07-17 06:51

/src/libidn/lib/nfkc.c
Line
Count
Source (jump to first uncovered line)
1
/* nfkc.c --- Unicode normalization utilities.
2
   Copyright (C) 2002-2025 Simon Josefsson
3
4
   This file is part of GNU Libidn.
5
6
   GNU Libidn is free software: you can redistribute it and/or
7
   modify it under the terms of either:
8
9
     * the GNU Lesser General Public License as published by the Free
10
       Software Foundation; either version 3 of the License, or (at
11
       your option) any later version.
12
13
   or
14
15
     * the GNU General Public License as published by the Free
16
       Software Foundation; either version 2 of the License, or (at
17
       your option) any later version.
18
19
   or both in parallel, as here.
20
21
   GNU Libidn is distributed in the hope that it will be useful,
22
   but WITHOUT ANY WARRANTY; without even the implied warranty of
23
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24
   General Public License for more details.
25
26
   You should have received copies of the GNU General Public License and
27
   the GNU Lesser General Public License along with this program.  If
28
   not, see <https://www.gnu.org/licenses/>. */
29
30
#ifdef HAVE_CONFIG_H
31
# include "config.h"
32
#endif
33
34
#include <stdlib.h>
35
#include <string.h>
36
37
#include "stringprep.h"
38
39
/* Hacks to make syncing with GLIB code easier. */
40
303k
/* GLib-compatible aliases.  The code in this file is kept textually
   close to its GLib origin, so GLib type and allocator names are mapped
   onto their plain C equivalents here. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free

/* Return VAL from the enclosing function when EXPR is false.
   The body is wrapped in do { } while (0) so that the expansion together
   with the caller's trailing semicolon forms exactly one statement; a
   bare { } block would break when used in an unbraced if/else. */
#define g_return_val_if_fail(expr,val) \
  do \
    { \
      if (!(expr)) \
        return (val); \
    } \
  while (0)
57
58
/* Code from GLIB gmacros.h starts here. */
59
60
/* GLIB - Library of useful routines for C programming
61
 * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
62
 *
63
 * This library is free software; you can redistribute it and/or
64
 * modify it under the terms of the GNU Lesser General Public
65
 * License as published by the Free Software Foundation; either
66
 * version 2 of the License, or (at your option) any later version.
67
 *
68
 * This library is distributed in the hope that it will be useful,
69
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
70
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
71
 * Lesser General Public License for more details.
72
 */
73
74
/* Boolean constants; each is defined only if nothing earlier in the
   translation unit already provides it. */
#if !defined (FALSE)
# define  FALSE (0)
#endif

#if !defined (TRUE)
# define  TRUE  (!FALSE)
#endif

/* Element count of ARR.  Only meaningful for a real array object: the
   size of the whole array is divided by the size of its first element. */
#define G_N_ELEMENTS(arr)   (sizeof (arr) / sizeof ((arr)[0]))

/* Stand-in for the GLib macro of the same name: simply yields EXPR. */
#define G_UNLIKELY(expr) (expr)
85
86
/* Code from GLIB gunicode.h starts here. */
87
88
/* gunicode.h - Unicode manipulation functions
89
 *
90
 *  Copyright (C) 1999, 2000 Tom Tromey
91
 *  Copyright 2000, 2005 Red Hat, Inc.
92
 *
93
 * The Gnome Library is free software; you can redistribute it and/or
94
 * modify it under the terms of the GNU Lesser General Public License as
95
 * published by the Free Software Foundation; either version 2 of the
96
 * License, or (at your option) any later version.
97
 *
98
 * The Gnome Library is distributed in the hope that it will be useful,
99
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
100
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
101
 * Lesser General Public License for more details.
102
 */
103
104
/* Normalization modes.  Every GLib-style name is followed by an alias
   carrying the same numeric value, named after the Unicode form. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,

  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,

  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,

  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
116
117
133M
/* Yield a pointer just past the UTF-8 character starting at P.  The byte
   at *P, read as unsigned, indexes g_utf8_skip to get the number of bytes
   to step over; the bytes in between are not examined. */
#define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
118
119
/* Code from GLIB gutf8.c starts here. */
120
121
/* gutf8.c - Operations on UTF-8 strings.
122
 *
123
 * Copyright (C) 1999 Tom Tromey
124
 * Copyright (C) 2000 Red Hat, Inc.
125
 *
126
 * This library is free software; you can redistribute it and/or
127
 * modify it under the terms of the GNU Lesser General Public
128
 * License as published by the Free Software Foundation; either
129
 * version 2 of the License, or (at your option) any later version.
130
 *
131
 * This library is distributed in the hope that it will be useful,
132
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
133
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
134
 * Lesser General Public License for more details.
135
 */
136
137
/* Classify the UTF-8 lead byte Char.
 *
 * On return Len holds the total sequence length implied by the lead
 * byte (1..6) and Mask holds the bit mask selecting the payload bits of
 * that lead byte.  If Char cannot start a sequence (for example a
 * continuation byte 10xxxxxx, or 0xFE/0xFF), Len is set to -1 and Mask
 * is left untouched.
 *
 * Every argument is parenthesized and the body is a single
 * do { } while (0) statement, so expressions may be passed as Char and
 * the macro may be used in an unbraced if/else.  Char is evaluated
 * several times; do not pass an expression with side effects.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
170
171
/* Number of bytes (1..6) needed to encode the code point Char in UTF-8,
   chosen by comparing Char against the upper bound of each length class.
   Char may be evaluated more than once. */
#define UTF8_LENGTH(Char) \
  ((Char) < 0x80 ? 1 \
   : (Char) < 0x800 ? 2 \
   : (Char) < 0x10000 ? 3 \
   : (Char) < 0x200000 ? 4 \
   : (Char) < 0x4000000 ? 5 \
   : 6)
177
178
/* Decode one UTF-8 sequence of Len bytes starting at Chars into Result.
 *
 * Mask selects the payload bits of the lead byte (as produced by
 * UTF8_COMPUTE); Count is a caller-supplied integer lvalue used as the
 * byte index.  Each following byte must have the form 10xxxxxx; if one
 * does not, Result is set to -1 and decoding stops at that byte.
 *
 * The body is a single do { } while (0) statement.  Without the wrapper
 * the expansion is an assignment followed by a separate for-loop, so an
 * unbraced `if (cond) UTF8_GET (...);' would guard only the assignment.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
190
191
/* Byte length of a UTF-8 sequence, indexed by the value of its first
   byte.  Bytes 0x80..0xBF and 0xFE..0xFF map to 1. */
static const gchar utf8_skip_data[256] = {
  /* 0x00 - 0xBF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xC0 - 0xDF */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 - 0xEF */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 - 0xF7, 0xF8 - 0xFB, 0xFC - 0xFD, 0xFE - 0xFF */
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* Pointer form of the table, used by g_utf8_next_char. */
static const gchar *const g_utf8_skip = utf8_skip_data;
211
212
/*
213
 * g_utf8_strlen:
214
 * @p: pointer to the start of a UTF-8 encoded string
215
 * @max: the maximum number of bytes to examine. If @max
216
 *       is less than 0, then the string is assumed to be
217
 *       nul-terminated. If @max is 0, @p will not be examined and
218
 *       may be %NULL.
219
 *
220
 * Computes the length of the string in characters, not including
221
 * the terminating nul character.
222
 *
223
 * Return value: the length of the string in characters
224
 **/
225
static gsize
226
g_utf8_strlen (const gchar *p)
227
3.89M
{
228
3.89M
  gsize len = 0;
229
230
3.89M
  g_return_val_if_fail (p != NULL, 0);
231
232
62.8M
  while (*p)
233
58.9M
    {
234
58.9M
      p = g_utf8_next_char (p);
235
58.9M
      ++len;
236
58.9M
    }
237
238
3.89M
  return len;
239
3.89M
}
240
241
/*
242
 * g_utf8_get_char:
243
 * @p: a pointer to Unicode character encoded as UTF-8
244
 *
245
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
246
 * If @p does not point to a valid UTF-8 encoded character, results are
247
 * undefined. If you are not sure that the bytes are complete
248
 * valid Unicode characters, you should use g_utf8_get_char_validated()
249
 * instead.
250
 *
251
 * Return value: the resulting character
252
 **/
253
static gunichar
254
g_utf8_get_char (const gchar *p)
255
67.5M
{
256
67.5M
  int i, mask = 0, len;
257
67.5M
  gunichar result;
258
67.5M
  unsigned char c = (unsigned char) *p;
259
260
67.5M
  UTF8_COMPUTE (c, mask, len);
261
67.5M
  if (len == -1)
262
214
    return (gunichar) - 1;
263
67.5M
  UTF8_GET (result, p, i, mask, len);
264
265
67.5M
  return result;
266
67.5M
}
267
268
/*
269
 * g_unichar_to_utf8:
270
 * @c: a Unicode character code
271
 * @outbuf: output buffer, must have at least 6 bytes of space.
272
 *       If %NULL, the length will be computed and returned
273
 *       and nothing will be written to @outbuf.
274
 *
275
 * Converts a single character to UTF-8.
276
 *
277
 * Return value: number of bytes written
278
 **/
279
static int
280
g_unichar_to_utf8 (gunichar c, gchar *outbuf)
281
13.1M
{
282
  /* If this gets modified, also update the copy in g_string_insert_unichar() */
283
13.1M
  guint len = 0;
284
13.1M
  int first;
285
13.1M
  int i;
286
287
13.1M
  if (c < 0x80)
288
2.41M
    {
289
2.41M
      first = 0;
290
2.41M
      len = 1;
291
2.41M
    }
292
10.7M
  else if (c < 0x800)
293
6.19M
    {
294
6.19M
      first = 0xc0;
295
6.19M
      len = 2;
296
6.19M
    }
297
4.53M
  else if (c < 0x10000)
298
4.45M
    {
299
4.45M
      first = 0xe0;
300
4.45M
      len = 3;
301
4.45M
    }
302
76.7k
  else if (c < 0x200000)
303
47.4k
    {
304
47.4k
      first = 0xf0;
305
47.4k
      len = 4;
306
47.4k
    }
307
29.2k
  else if (c < 0x4000000)
308
1.62k
    {
309
1.62k
      first = 0xf8;
310
1.62k
      len = 5;
311
1.62k
    }
312
27.5k
  else
313
27.5k
    {
314
27.5k
      first = 0xfc;
315
27.5k
      len = 6;
316
27.5k
    }
317
318
13.1M
  if (outbuf)
319
13.1M
    {
320
28.5M
      for (i = len - 1; i > 0; --i)
321
15.3M
  {
322
15.3M
    outbuf[i] = (c & 0x3f) | 0x80;
323
15.3M
    c >>= 6;
324
15.3M
  }
325
13.1M
      outbuf[0] = c | first;
326
13.1M
    }
327
328
13.1M
  return len;
329
13.1M
}
330
331
/*
332
 * g_utf8_to_ucs4_fast:
333
 * @str: a UTF-8 encoded string
334
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
335
 *       then the string is nul-terminated.
336
 * @items_written: location to store the number of characters in the
337
 *                 result, or %NULL.
338
 *
339
 * Convert a string from UTF-8 to a 32-bit fixed width
340
 * representation as UCS-4, assuming valid UTF-8 input.
341
 * This function is roughly twice as fast as g_utf8_to_ucs4()
342
 * but does no error checking on the input. A trailing 0 character
343
 * will be added to the string after the converted text.
344
 *
345
 * Return value: a pointer to a newly allocated UCS-4 string.
346
 *               This value must be freed with g_free().
347
 **/
348
static gunichar *
349
g_utf8_to_ucs4_fast (const gchar *str, gssize len, gsize *items_written)
350
183k
{
351
183k
  gunichar *result;
352
183k
  gsize n_chars, i;
353
183k
  const gchar *p;
354
355
183k
  g_return_val_if_fail (str != NULL, NULL);
356
357
183k
  p = str;
358
183k
  n_chars = 0;
359
183k
  if (len < 0)
360
183k
    {
361
6.90M
      while (*p)
362
6.72M
  {
363
6.72M
    p = g_utf8_next_char (p);
364
6.72M
    ++n_chars;
365
6.72M
  }
366
183k
    }
367
0
  else
368
0
    {
369
0
      while (p < str + len && *p)
370
0
  {
371
0
    p = g_utf8_next_char (p);
372
0
    ++n_chars;
373
0
  }
374
0
    }
375
376
183k
  result = g_malloc (sizeof (gunichar) * (n_chars + 1));
377
183k
  if (!result)
378
0
    return NULL;
379
380
183k
  p = str;
381
6.90M
  for (i = 0; i < n_chars; i++)
382
6.72M
    {
383
6.72M
      gunichar wc = (guchar) * p++;
384
385
6.72M
      if (wc < 0x80)
386
1.10M
  {
387
1.10M
    result[i] = wc;
388
1.10M
  }
389
5.62M
      else
390
5.62M
  {
391
5.62M
    gunichar mask = 0x40;
392
393
5.62M
    if (G_UNLIKELY ((wc & mask) == 0))
394
0
      {
395
        /* It's an out-of-sequence 10xxxxxxx byte.
396
         * Rather than making an ugly hash of this and the next byte
397
         * and overrunning the buffer, it's more useful to treat it
398
         * with a replacement character */
399
0
        result[i] = 0xfffd;
400
0
        continue;
401
0
      }
402
403
5.62M
    do
404
9.63M
      {
405
9.63M
        wc <<= 6;
406
9.63M
        wc |= (guchar) (*p++) & 0x3f;
407
9.63M
        mask <<= 5;
408
9.63M
      }
409
9.63M
    while ((wc & mask) != 0);
410
411
5.62M
    wc &= mask - 1;
412
413
5.62M
    result[i] = wc;
414
5.62M
  }
415
6.72M
    }
416
183k
  result[i] = 0;
417
418
183k
  if (items_written)
419
171k
    *items_written = i;
420
421
183k
  return result;
422
183k
}
423
424
/*
425
 * g_ucs4_to_utf8:
426
 * @str: a UCS-4 encoded string
427
 * @len: the maximum length (number of characters) of @str to use.
428
 *       If @len < 0, then the string is nul-terminated.
429
 * @items_read: location to store number of characters read, or %NULL.
430
 * @items_written: location to store number of bytes written or %NULL.
431
 *                 The value here stored does not include the trailing 0
432
 *                 byte.
433
 * @error: location to store the error occurring, or %NULL to ignore
434
 *         errors. Any of the errors in #GConvertError other than
435
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
436
 *
437
 * Convert a string from a 32-bit fixed width representation as UCS-4.
438
 * to UTF-8. The result will be terminated with a 0 byte.
439
 *
440
 * Return value: a pointer to a newly allocated UTF-8 string.
441
 *               This value must be freed with g_free(). If an
442
 *               error occurs, %NULL will be returned and
443
 *               @error set. In that case, @items_read will be
444
 *               set to the position of the first invalid input
445
 *               character.
446
 **/
447
static gchar *
448
g_ucs4_to_utf8 (const gunichar *str,
449
    gsize len, gsize *items_read, gsize *items_written)
450
251k
{
451
251k
  gint result_length;
452
251k
  gchar *result = NULL;
453
251k
  gchar *p;
454
251k
  gsize i;
455
456
251k
  result_length = 0;
457
13.3M
  for (i = 0; i < len; i++)
458
13.1M
    {
459
13.1M
      if (!str[i])
460
11.6k
  break;
461
462
13.1M
      if (str[i] >= 0x80000000)
463
3.64k
  goto err_out;
464
465
13.1M
      result_length += UTF8_LENGTH (str[i]);
466
13.1M
    }
467
468
247k
  result = g_malloc (result_length + 1);
469
247k
  if (!result)
470
0
    return NULL;
471
247k
  p = result;
472
473
247k
  i = 0;
474
13.3M
  while (p < result + result_length)
475
13.1M
    p += g_unichar_to_utf8 (str[i++], p);
476
477
247k
  *p = '\0';
478
479
247k
  if (items_written)
480
0
    *items_written = p - result;
481
482
251k
err_out:
483
251k
  if (items_read)
484
0
    *items_read = i;
485
486
251k
  return result;
487
247k
}
488
489
/* Code from GLIB gunidecomp.c starts here. */
490
491
/* decomp.c - Character decomposition.
492
 *
493
 *  Copyright (C) 1999, 2000 Tom Tromey
494
 *  Copyright 2000 Red Hat, Inc.
495
 *
496
 * The Gnome Library is free software; you can redistribute it and/or
497
 * modify it under the terms of the GNU Lesser General Public License as
498
 * published by the Free Software Foundation; either version 2 of the
499
 * License, or (at your option) any later version.
500
 *
501
 * The Gnome Library is distributed in the hope that it will be useful,
502
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
503
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
504
 * Lesser General Public License for more details.
505
 */
506
507
#include "gunidecomp.h"
508
#include "gunicomp.h"
509
510
/* Look up a combining class through a page table.  A page entry below
   G_UNICODE_MAX_TABLE_INDEX selects a row of cclass_data that is then
   indexed by Char; any other entry encodes a single value for the whole
   page, recovered by subtracting G_UNICODE_MAX_TABLE_INDEX. */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part1[Page]][Char]) \
   : (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX))

#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part2[Page]][Char]) \
   : (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Combining class of Char.  Code points up to G_UNICODE_LAST_CHAR_PART1
   use the first page table, those from 0xe0000 up to G_UNICODE_LAST_CHAR
   use the second (with the page rebased to 0xe0000), and everything else
   yields 0.  Char is evaluated several times. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
526
527
/* constants for hangul syllable [de]composition */
528
200M
/* Base code points used by the Hangul helpers below: SBase for
   syllables, LBase and VBase for the first two decomposed parts, and
   TBase for the optional third part (index 0 means "no third part"). */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7

/* Size of each index range. */
#define LCount 19
#define VCount 21
#define TCount 28

/* Derived sizes: syllables per L value, and syllables in total. */
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
537
538
/*
539
 * g_unicode_canonical_ordering:
540
 * @string: a UCS-4 encoded string.
541
 * @len: the maximum length of @string to use.
542
 *
543
 * Computes the canonical ordering of a string in-place.
544
 * This rearranges decomposed characters in the string
545
 * according to their combining classes.  See the Unicode
546
 * manual for more information.
547
 **/
548
static void
549
g_unicode_canonical_ordering (gunichar *string, gsize len)
550
4.31M
{
551
4.31M
  gsize i;
552
4.31M
  int swap = 1;
553
554
8.63M
  while (swap)
555
4.32M
    {
556
4.32M
      int last;
557
4.32M
      swap = 0;
558
4.32M
      last = COMBINING_CLASS (string[0]);
559
118M
      for (i = 0; i < len - 1; ++i)
560
114M
  {
561
114M
    int next = COMBINING_CLASS (string[i + 1]);
562
114M
    if (next != 0 && last > next)
563
70.2k
      {
564
70.2k
        gsize j;
565
        /* Percolate item leftward through string.  */
566
4.27M
        for (j = i + 1; j > 0; --j)
567
4.26M
    {
568
4.26M
      gunichar t;
569
4.26M
      if (COMBINING_CLASS (string[j - 1]) <= next)
570
68.3k
        break;
571
4.20M
      t = string[j];
572
4.20M
      string[j] = string[j - 1];
573
4.20M
      string[j - 1] = t;
574
4.20M
      swap = 1;
575
4.20M
    }
576
        /* We're re-entering the loop looking at the old
577
           character again.  */
578
70.2k
        next = last;
579
70.2k
      }
580
114M
    last = next;
581
114M
  }
582
4.32M
    }
583
4.31M
}
584
585
/* http://www.unicode.org/unicode/reports/tr15/#Hangul
586
 * r should be null or have sufficient space. Calling with r == NULL will
587
 * only calculate the result_len; however, a buffer with space for three
588
 * characters will always be big enough. */
589
static void
590
decompose_hangul (gunichar s, gunichar *r, gsize *result_len)
591
16.7k
{
592
16.7k
  gint SIndex = s - SBase;
593
16.7k
  gint TIndex = SIndex % TCount;
594
595
16.7k
  if (r)
596
8.38k
    {
597
8.38k
      r[0] = LBase + SIndex / NCount;
598
8.38k
      r[1] = VBase + (SIndex % NCount) / TCount;
599
8.38k
    }
600
601
16.7k
  if (TIndex)
602
10.9k
    {
603
10.9k
      if (r)
604
5.48k
  r[2] = TBase + TIndex;
605
10.9k
      *result_len = 3;
606
10.9k
    }
607
5.81k
  else
608
5.81k
    *result_len = 2;
609
16.7k
}
610
611
/* returns a pointer to a null-terminated UTF-8 string */
612
static const gchar *
613
find_decomposition (gunichar ch, gboolean compat)
614
8.63M
{
615
8.63M
  int start = 0;
616
8.63M
  int end = G_N_ELEMENTS (decomp_table);
617
618
8.63M
  if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
619
7.96M
    {
620
95.1M
      while (TRUE)
621
95.1M
  {
622
95.1M
    int half = (start + end) / 2;
623
95.1M
    if (ch == decomp_table[half].ch)
624
7.78M
      {
625
7.78M
        int offset;
626
627
7.78M
        if (compat)
628
7.78M
    {
629
7.78M
      offset = decomp_table[half].compat_offset;
630
7.78M
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
631
315k
        offset = decomp_table[half].canon_offset;
632
7.78M
    }
633
0
        else
634
0
    {
635
0
      offset = decomp_table[half].canon_offset;
636
0
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
637
0
        return NULL;
638
0
    }
639
640
7.78M
        return &(decomp_expansion_string[offset]);
641
7.78M
      }
642
87.4M
    else if (half == start)
643
182k
      break;
644
87.2M
    else if (ch > decomp_table[half].ch)
645
45.3M
      start = half;
646
41.8M
    else
647
41.8M
      end = half;
648
95.1M
  }
649
7.96M
    }
650
651
853k
  return NULL;
652
8.63M
}
653
654
/* L,V => LV and LV,T => LVT  */
655
static gboolean
656
combine_hangul (gunichar a, gunichar b, gunichar *result)
657
58.9M
{
658
58.9M
  if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
659
9.91k
    {
660
9.91k
      gint LIndex = a - LBase;
661
9.91k
      gint VIndex = b - VBase;
662
663
9.91k
      *result = SBase + (LIndex * VCount + VIndex) * TCount;
664
9.91k
      return TRUE;
665
9.91k
    }
666
667
58.9M
  if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
668
6.68k
    {
669
6.68k
      gint SIndex = a - SBase;
670
671
6.68k
      if ((SIndex % TCount) == 0)
672
5.52k
  {
673
5.52k
    gint TIndex = b - TBase;
674
675
5.52k
    *result = a + TIndex;
676
5.52k
    return TRUE;
677
5.52k
  }
678
6.68k
    }
679
680
58.9M
  return FALSE;
681
58.9M
}
682
683
/* Composition index lookup through the compose_table page table.  A
   page entry below G_UNICODE_MAX_TABLE_INDEX selects a row of
   compose_data indexed by Char; any other entry encodes one value for
   the whole page, recovered by subtracting G_UNICODE_MAX_TABLE_INDEX. */
#define CI(Page, Char) \
  ((compose_table[(Page)] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[(Page)] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[(Page)]][(Char)]))

/* Composition index of Char, or 0 when its page lies beyond
   COMPOSE_TABLE_LAST.  Char is fully parenthesized so that an expression
   argument is shifted as a whole; it is evaluated several times. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) \
   ? 0 \
   : CI ((Char) >> 8, (Char) & 0xff))
690
691
static gboolean
692
combine (gunichar a, gunichar b, gunichar *result)
693
58.9M
{
694
58.9M
  gushort index_a, index_b;
695
696
58.9M
  if (combine_hangul (a, b, result))
697
15.4k
    return TRUE;
698
699
58.9M
  index_a = COMPOSE_INDEX (a);
700
701
58.9M
  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
702
6.92M
    {
703
6.92M
      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
704
50.2k
  {
705
50.2k
    *result =
706
50.2k
      compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
707
50.2k
    return TRUE;
708
50.2k
  }
709
6.87M
      else
710
6.87M
  return FALSE;
711
6.92M
    }
712
713
51.9M
  index_b = COMPOSE_INDEX (b);
714
715
51.9M
  if (index_b >= COMPOSE_SECOND_SINGLE_START)
716
26.1k
    {
717
26.1k
      if (a ==
718
26.1k
    compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
719
24.2k
  {
720
24.2k
    *result =
721
24.2k
      compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
722
24.2k
    return TRUE;
723
24.2k
  }
724
1.88k
      else
725
1.88k
  return FALSE;
726
26.1k
    }
727
728
51.9M
  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
729
51.9M
      && index_b >= COMPOSE_SECOND_START
730
51.9M
      && index_b < COMPOSE_SECOND_SINGLE_START)
731
40.9k
    {
732
40.9k
      gunichar res =
733
40.9k
  compose_array[index_a - COMPOSE_FIRST_START][index_b -
734
40.9k
                 COMPOSE_SECOND_START];
735
736
40.9k
      if (res)
737
37.5k
  {
738
37.5k
    *result = res;
739
37.5k
    return TRUE;
740
37.5k
  }
741
40.9k
    }
742
743
51.9M
  return FALSE;
744
51.9M
}
745
746
static gunichar *
747
_g_utf8_normalize_wc (const gchar *str, gssize max_len, GNormalizeMode mode)
748
151k
{
749
151k
  gsize n_wc;
750
151k
  gunichar *wc_buffer;
751
151k
  const char *p;
752
151k
  gsize last_start;
753
151k
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
754
151k
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
755
756
151k
  n_wc = 0;
757
151k
  p = str;
758
4.47M
  while ((max_len < 0 || p < str + max_len) && *p)
759
4.32M
    {
760
4.32M
      const gchar *decomp;
761
4.32M
      gunichar wc = g_utf8_get_char (p);
762
763
4.32M
      if (wc >= SBase && wc < SBase + SCount)
764
8.38k
  {
765
8.38k
    gsize result_len;
766
8.38k
    decompose_hangul (wc, NULL, &result_len);
767
8.38k
    n_wc += result_len;
768
8.38k
  }
769
4.31M
      else
770
4.31M
  {
771
4.31M
    decomp = find_decomposition (wc, do_compat);
772
773
4.31M
    if (decomp)
774
3.89M
      n_wc += g_utf8_strlen (decomp);
775
426k
    else
776
426k
      n_wc++;
777
4.31M
  }
778
779
4.32M
      p = g_utf8_next_char (p);
780
4.32M
    }
781
782
151k
  wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
783
151k
  if (!wc_buffer)
784
0
    return NULL;
785
786
151k
  last_start = 0;
787
151k
  n_wc = 0;
788
151k
  p = str;
789
4.47M
  while ((max_len < 0 || p < str + max_len) && *p)
790
4.32M
    {
791
4.32M
      gunichar wc = g_utf8_get_char (p);
792
4.32M
      const gchar *decomp;
793
4.32M
      int cc;
794
4.32M
      gsize old_n_wc = n_wc;
795
796
4.32M
      if (wc >= SBase && wc < SBase + SCount)
797
8.38k
  {
798
8.38k
    gsize result_len;
799
8.38k
    decompose_hangul (wc, wc_buffer + n_wc, &result_len);
800
8.38k
    n_wc += result_len;
801
8.38k
  }
802
4.31M
      else
803
4.31M
  {
804
4.31M
    decomp = find_decomposition (wc, do_compat);
805
806
4.31M
    if (decomp)
807
3.89M
      {
808
3.89M
        const char *pd;
809
62.8M
        for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
810
58.9M
    wc_buffer[n_wc++] = g_utf8_get_char (pd);
811
3.89M
      }
812
426k
    else
813
426k
      wc_buffer[n_wc++] = wc;
814
4.31M
  }
815
816
4.32M
      if (n_wc > 0)
817
4.32M
  {
818
4.32M
    cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
819
820
4.32M
    if (cc == 0)
821
4.16M
      {
822
4.16M
        g_unicode_canonical_ordering (wc_buffer + last_start,
823
4.16M
              n_wc - last_start);
824
4.16M
        last_start = old_n_wc;
825
4.16M
      }
826
4.32M
  }
827
828
4.32M
      p = g_utf8_next_char (p);
829
4.32M
    }
830
831
151k
  if (n_wc > 0)
832
149k
    {
833
149k
      g_unicode_canonical_ordering (wc_buffer + last_start,
834
149k
            n_wc - last_start);
835
      /* dead assignment: last_start = n_wc; */
836
149k
    }
837
838
151k
  wc_buffer[n_wc] = 0;
839
840
  /* All decomposed and reordered */
841
842
151k
  if (do_compose && n_wc > 0)
843
149k
    {
844
149k
      gsize i, j;
845
149k
      int last_cc = 0;
846
149k
      last_start = 0;
847
848
59.5M
      for (i = 0; i < n_wc; i++)
849
59.3M
  {
850
59.3M
    int cc = COMBINING_CLASS (wc_buffer[i]);
851
852
59.3M
    if (i > 0 &&
853
59.3M
        (last_cc == 0 || last_cc != cc) &&
854
59.3M
        combine (wc_buffer[last_start], wc_buffer[i],
855
58.9M
           &wc_buffer[last_start]))
856
127k
      {
857
192M
        for (j = i + 1; j < n_wc; j++)
858
192M
    wc_buffer[j - 1] = wc_buffer[j];
859
127k
        n_wc--;
860
127k
        i--;
861
862
127k
        if (i == last_start)
863
120k
    last_cc = 0;
864
6.79k
        else
865
6.79k
    last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
866
867
127k
        continue;
868
127k
      }
869
870
59.2M
    if (cc == 0)
871
58.8M
      last_start = i;
872
873
59.2M
    last_cc = cc;
874
59.2M
  }
875
149k
    }
876
877
151k
  wc_buffer[n_wc] = 0;
878
879
151k
  return wc_buffer;
880
151k
}
881
882
/*
883
 * g_utf8_normalize:
884
 * @str: a UTF-8 encoded string.
885
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
886
 * @mode: the type of normalization to perform.
887
 *
888
 * Converts a string into canonical form, standardizing
889
 * such issues as whether a character with an accent
890
 * is represented as a base character and combining
891
 * accent or as a single precomposed character. The
892
 * string has to be valid UTF-8, otherwise %NULL is
893
 * returned. You should generally call g_utf8_normalize()
894
 * before comparing two Unicode strings.
895
 *
896
 * The normalization mode %G_NORMALIZE_DEFAULT only
897
 * standardizes differences that do not affect the
898
 * text content, such as the above-mentioned accent
899
 * representation. %G_NORMALIZE_ALL also standardizes
900
 * the "compatibility" characters in Unicode, such
901
 * as SUPERSCRIPT THREE to the standard forms
902
 * (in this case DIGIT THREE). Formatting information
903
 * may be lost but for most text operations such
904
 * characters should be considered the same.
905
 *
906
 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
907
 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
908
 * but returned a result with composed forms rather
909
 * than a maximally decomposed form. This is often
910
 * useful if you intend to convert the string to
911
 * a legacy encoding or pass it to a system with
912
 * less capable Unicode handling.
913
 *
914
 * Return value: a newly allocated string, that is the
915
 *   normalized form of @str, or %NULL if @str is not
916
 *   valid UTF-8.
917
 **/
918
static gchar *
919
g_utf8_normalize (const gchar *str, gssize len, GNormalizeMode mode)
920
1.87k
{
921
1.87k
  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
922
1.87k
  gchar *result = NULL;
923
924
1.87k
  if (result_wc)
925
1.87k
    result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
926
927
1.87k
  g_free (result_wc);
928
929
1.87k
  return result;
930
1.87k
}
931
932
/* Public Libidn API starts here. */
933
934
/**
935
 * stringprep_utf8_to_unichar:
936
 * @p: a pointer to Unicode character encoded as UTF-8
937
 *
938
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
939
 * If @p does not point to a valid UTF-8 encoded character, results are
940
 * undefined.
941
 *
942
 * Return value: the resulting character.
943
 **/
944
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  /* Public entry point; the decoding is done by the file-local helper. */
  const uint32_t decoded = g_utf8_get_char (p);

  return decoded;
}
949
950
/**
951
 * stringprep_unichar_to_utf8:
952
 * @c: a ISO10646 character code
953
 * @outbuf: output buffer, must have at least 6 bytes of space.
954
 *       If %NULL, the length will be computed and returned
955
 *       and nothing will be written to @outbuf.
956
 *
957
 * Converts a single character to UTF-8.
958
 *
959
 * Return value: number of bytes written.
960
 **/
961
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  /* Public entry point; the encoding is done by the file-local helper,
     which writes nothing when OUTBUF is NULL. */
  const int nbytes = g_unichar_to_utf8 (c, outbuf);

  return nbytes;
}
966
967
#include <unistr.h>
968
969
/**
970
 * stringprep_utf8_to_ucs4:
971
 * @str: a UTF-8 encoded string
972
 * @len: the maximum length of @str to use. If @len < 0, then
973
 *       the string is nul-terminated.
974
 * @items_written: location to store the number of characters in the
975
 *                 result, or %NULL.
976
 *
977
 * Convert a string from UTF-8 to a 32-bit fixed width representation
978
 * as UCS-4.  The function now performs error checking to verify that
979
 * the input is valid UTF-8 (before it was documented to not do error
980
 * checking).
981
 *
982
 * Return value: a pointer to a newly allocated UCS-4 string.
983
 *               This value must be deallocated by the caller.
984
 **/
985
uint32_t *
986
stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
987
192k
{
988
192k
  size_t n;
989
990
192k
  if (len < 0)
991
192k
    n = strlen (str);
992
0
  else
993
0
    n = len;
994
995
192k
  if (u8_check ((const uint8_t *) str, n))
996
9.27k
    return NULL;
997
998
183k
  return g_utf8_to_ucs4_fast (str, len, items_written);
999
192k
}
1000
1001
/**
1002
 * stringprep_ucs4_to_utf8:
1003
 * @str: a UCS-4 encoded string
1004
 * @len: the maximum length of @str to use. If @len < 0, then
1005
 *       the string is terminated with a 0 character.
1006
 * @items_read: location to store number of characters read, or %NULL.
1007
 * @items_written: location to store number of bytes written or %NULL.
1008
 *                 The value here stored does not include the trailing 0
1009
 *                 byte.
1010
 *
1011
 * Convert a string from a 32-bit fixed width representation as UCS-4.
1012
 * to UTF-8. The result will be terminated with a 0 byte.
1013
 *
1014
 * Return value: a pointer to a newly allocated UTF-8 string.
1015
 *               This value must be deallocated by the caller.
1016
 *               If an error occurs, %NULL will be returned.
1017
 **/
1018
char *
1019
stringprep_ucs4_to_utf8 (const uint32_t *str, ssize_t len,
1020
       size_t *items_read, size_t *items_written)
1021
249k
{
1022
249k
  return g_ucs4_to_utf8 (str, len, items_read, items_written);
1023
249k
}
1024
1025
/**
1026
 * stringprep_utf8_nfkc_normalize:
1027
 * @str: a UTF-8 encoded string.
1028
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1029
 *
1030
 * Converts a string into canonical form, standardizing
1031
 * such issues as whether a character with an accent
1032
 * is represented as a base character and combining
1033
 * accent or as a single precomposed character.
1034
 *
1035
 * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1036
 * differences that do not affect the text content, such as the
1037
 * above-mentioned accent representation. It standardizes the
1038
 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1039
 * the standard forms (in this case DIGIT THREE). Formatting
1040
 * information may be lost but for most text operations such
1041
 * characters should be considered the same. It returns a result with
1042
 * composed forms rather than a maximally decomposed form.
1043
 *
1044
 * Return value: a newly allocated string, that is the
1045
 *   NFKC normalized form of @str.
1046
 **/
1047
char *
1048
stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1049
2.77k
{
1050
2.77k
  size_t n;
1051
1052
2.77k
  if (len < 0)
1053
0
    n = strlen (str);
1054
2.77k
  else
1055
2.77k
    n = len;
1056
1057
2.77k
  if (u8_check ((const uint8_t *) str, n))
1058
907
    return NULL;
1059
1060
1.87k
  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1061
2.77k
}
1062
1063
#include <stdio.h>
1064
/**
1065
 * stringprep_ucs4_nfkc_normalize:
1066
 * @str: a Unicode string.
1067
 * @len: length of @str array, or -1 if @str is nul-terminated.
1068
 *
1069
 * Converts a UCS4 string into canonical form, see
1070
 * stringprep_utf8_nfkc_normalize() for more information.
1071
 *
1072
 * Return value: a newly allocated Unicode string, that is the NFKC
1073
 *   normalized form of @str.
1074
 **/
1075
uint32_t *
1076
stringprep_ucs4_nfkc_normalize (const uint32_t *str, ssize_t len)
1077
150k
{
1078
150k
  char *p;
1079
150k
  uint32_t *result_wc;
1080
1081
150k
  p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1082
150k
  if (!p)
1083
460
    return NULL;
1084
1085
149k
  result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1086
149k
  free (p);
1087
1088
149k
  return result_wc;
1089
150k
}