Coverage Report

Created: 2025-12-14 06:13

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libidn/lib/nfkc.c
Line
Count
Source
1
/* nfkc.c --- Unicode normalization utilities.
2
   Copyright (C) 2002-2025 Simon Josefsson
3
4
   This file is part of GNU Libidn.
5
6
   GNU Libidn is free software: you can redistribute it and/or
7
   modify it under the terms of either:
8
9
     * the GNU Lesser General Public License as published by the Free
10
       Software Foundation; either version 3 of the License, or (at
11
       your option) any later version.
12
13
   or
14
15
     * the GNU General Public License as published by the Free
16
       Software Foundation; either version 2 of the License, or (at
17
       your option) any later version.
18
19
   or both in parallel, as here.
20
21
   GNU Libidn is distributed in the hope that it will be useful,
22
   but WITHOUT ANY WARRANTY; without even the implied warranty of
23
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24
   General Public License for more details.
25
26
   You should have received copies of the GNU General Public License and
27
   the GNU Lesser General Public License along with this program.  If
28
   not, see <https://www.gnu.org/licenses/>. */
29
30
#ifdef HAVE_CONFIG_H
31
# include "config.h"
32
#endif
33
34
#include <stdlib.h>
35
#include <string.h>
36
37
#include "stringprep.h"
38
39
/* Hacks to make syncing with GLIB code easier: the imported GLib code
   keeps its own type and allocator names, which are mapped here onto
   the standard C equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free

/* Return VAL from the enclosing function when EXPR is false.  The body
   is wrapped in do/while (0) so that the expansion together with the
   caller's trailing semicolon forms exactly one statement; a bare
   brace block would break an unbraced if/else around the call. */
#define g_return_val_if_fail(expr,val)		\
  do						\
    {						\
      if (!(expr))				\
	return (val);				\
    }						\
  while (0)
57
58
/* Code from GLIB gmacros.h starts here. */
59
60
/* GLIB - Library of useful routines for C programming
61
 * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
62
 *
63
 * This library is free software; you can redistribute it and/or
64
 * modify it under the terms of the GNU Lesser General Public
65
 * License as published by the Free Software Foundation; either
66
 * version 2 of the License, or (at your option) any later version.
67
 *
68
 * This library is distributed in the hope that it will be useful,
69
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
70
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
71
 * Lesser General Public License for more details.
72
 */
73
74
/* Boolean constants, defined only when no earlier header supplied them. */
#if !defined (FALSE)
# define FALSE (0)
#endif

#if !defined (TRUE)
# define TRUE (!FALSE)
#endif

/* Number of elements in ARR, computed from the array's own size. */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof (*(arr)))

/* Branch-prediction hint in GLib; here it simply yields EXPR. */
#define G_UNLIKELY(expr) (expr)
85
86
/* Code from GLIB gunicode.h starts here. */
87
88
/* gunicode.h - Unicode manipulation functions
89
 *
90
 *  Copyright (C) 1999, 2000 Tom Tromey
91
 *  Copyright 2000, 2005 Red Hat, Inc.
92
 *
93
 * The Gnome Library is free software; you can redistribute it and/or
94
 * modify it under the terms of the GNU Lesser General Public License as
95
 * published by the Free Software Foundation; either version 2 of the
96
 * License, or (at your option) any later version.
97
 *
98
 * The Gnome Library is distributed in the hope that it will be useful,
99
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
100
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
101
 * Lesser General Public License for more details.
102
 */
103
104
/* Normalization forms.  Each Unicode name (NFD, NFC, NFKD, NFKC) is an
   alias for the GLib name listed just above it. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;

/* Advance P past one character, using the lead byte at P to look up
   the sequence length in g_utf8_skip. */
#define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
118
119
/* Code from GLIB gutf8.c starts here. */
120
121
/* gutf8.c - Operations on UTF-8 strings.
122
 *
123
 * Copyright (C) 1999 Tom Tromey
124
 * Copyright (C) 2000 Red Hat, Inc.
125
 *
126
 * This library is free software; you can redistribute it and/or
127
 * modify it under the terms of the GNU Lesser General Public
128
 * License as published by the Free Software Foundation; either
129
 * version 2 of the License, or (at your option) any later version.
130
 *
131
 * This library is distributed in the hope that it will be useful,
132
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
133
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
134
 * Lesser General Public License for more details.
135
 */
136
137
/* Classify the lead byte Char of a UTF-8 sequence: store the sequence
   length (1..6) in Len and the mask that isolates the lead byte's
   payload bits in Mask.  When Char matches no lead-byte pattern, Len is
   set to -1 and Mask is left untouched.  Char is evaluated several
   times, so it must be free of side effects.  Expands to a single
   statement. */
#define UTF8_COMPUTE(Char, Mask, Len)		\
  do						\
    {						\
      if ((Char) < 128)				\
	{					\
	  (Len) = 1;				\
	  (Mask) = 0x7f;			\
	}					\
      else if (((Char) & 0xe0) == 0xc0)		\
	{					\
	  (Len) = 2;				\
	  (Mask) = 0x1f;			\
	}					\
      else if (((Char) & 0xf0) == 0xe0)		\
	{					\
	  (Len) = 3;				\
	  (Mask) = 0x0f;			\
	}					\
      else if (((Char) & 0xf8) == 0xf0)		\
	{					\
	  (Len) = 4;				\
	  (Mask) = 0x07;			\
	}					\
      else if (((Char) & 0xfc) == 0xf8)		\
	{					\
	  (Len) = 5;				\
	  (Mask) = 0x03;			\
	}					\
      else if (((Char) & 0xfe) == 0xfc)		\
	{					\
	  (Len) = 6;				\
	  (Mask) = 0x01;			\
	}					\
      else					\
	(Len) = -1;				\
    }						\
  while (0)

/* Number of bytes (1..6) needed to encode code point Char. */
#define UTF8_LENGTH(Char)			\
  ((Char) < 0x80 ? 1 :				\
   ((Char) < 0x800 ? 2 :			\
    ((Char) < 0x10000 ? 3 :			\
     ((Char) < 0x200000 ? 4 :			\
      ((Char) < 0x4000000 ? 5 : 6)))))

/* Decode the Len-byte sequence starting at Chars into Result, using the
   Mask produced by UTF8_COMPUTE; Count is a scratch loop index.  Result
   is set to -1 as soon as a trailing byte is not of the form 10xxxxxx.
   Expands to a single statement; the `break' leaves only the inner
   for loop. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)	\
  do							\
    {							\
      (Result) = (Chars)[0] & (Mask);			\
      for ((Count) = 1; (Count) < (Len); ++(Count))	\
	{						\
	  if (((Chars)[(Count)] & 0xc0) != 0x80)	\
	    {						\
	      (Result) = -1;				\
	      break;					\
	    }						\
	  (Result) <<= 6;				\
	  (Result) |= ((Chars)[(Count)] & 0x3f);	\
	}						\
    }							\
  while (0)
190
191
/* Length in bytes of the UTF-8 sequence introduced by each possible
   lead byte, indexed by the byte value.  Bytes that cannot start a
   sequence (0x80..0xBF, 0xFE, 0xFF) map to 1. */
static const gchar utf8_skip_data[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* Pointer alias through which g_utf8_next_char() reads the table. */
static const gchar *const g_utf8_skip = utf8_skip_data;
211
212
/*
213
 * g_utf8_strlen:
214
 * @p: pointer to the start of a UTF-8 encoded string
215
 * @max: the maximum number of bytes to examine. If @max
216
 *       is less than 0, then the string is assumed to be
217
 *       nul-terminated. If @max is 0, @p will not be examined and
218
 *       may be %NULL.
219
 *
220
 * Computes the length of the string in characters, not including
221
 * the terminating nul character.
222
 *
223
 * Return value: the length of the string in characters
224
 **/
225
static gsize
226
g_utf8_strlen (const gchar *p)
227
306k
{
228
306k
  gsize len = 0;
229
230
306k
  g_return_val_if_fail (p != NULL, 0);
231
232
4.22M
  while (*p)
233
3.91M
    {
234
3.91M
      p = g_utf8_next_char (p);
235
3.91M
      ++len;
236
3.91M
    }
237
238
306k
  return len;
239
306k
}
240
241
/*
242
 * g_utf8_get_char:
243
 * @p: a pointer to Unicode character encoded as UTF-8
244
 *
245
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
246
 * If @p does not point to a valid UTF-8 encoded character, results are
247
 * undefined. If you are not sure that the bytes are complete
248
 * valid Unicode characters, you should use g_utf8_get_char_validated()
249
 * instead.
250
 *
251
 * Return value: the resulting character
252
 **/
253
static gunichar
254
g_utf8_get_char (const gchar *p)
255
4.77M
{
256
4.77M
  int i, mask = 0, len;
257
4.77M
  gunichar result;
258
4.77M
  unsigned char c = (unsigned char) *p;
259
260
4.77M
  UTF8_COMPUTE (c, mask, len);
261
4.77M
  if (len == -1)
262
172
    return (gunichar) - 1;
263
4.77M
  UTF8_GET (result, p, i, mask, len);
264
265
4.77M
  return result;
266
4.77M
}
267
268
/*
269
 * g_unichar_to_utf8:
270
 * @c: a Unicode character code
271
 * @outbuf: output buffer, must have at least 6 bytes of space.
272
 *       If %NULL, the length will be computed and returned
273
 *       and nothing will be written to @outbuf.
274
 *
275
 * Converts a single character to UTF-8.
276
 *
277
 * Return value: number of bytes written
278
 **/
279
static int
280
g_unichar_to_utf8 (gunichar c, gchar *outbuf)
281
842k
{
282
  /* If this gets modified, also update the copy in g_string_insert_unichar() */
283
842k
  guint len = 0;
284
842k
  int first;
285
842k
  int i;
286
287
842k
  if (c < 0x80)
288
204k
    {
289
204k
      first = 0;
290
204k
      len = 1;
291
204k
    }
292
637k
  else if (c < 0x800)
293
389k
    {
294
389k
      first = 0xc0;
295
389k
      len = 2;
296
389k
    }
297
247k
  else if (c < 0x10000)
298
239k
    {
299
239k
      first = 0xe0;
300
239k
      len = 3;
301
239k
    }
302
7.57k
  else if (c < 0x200000)
303
3.95k
    {
304
3.95k
      first = 0xf0;
305
3.95k
      len = 4;
306
3.95k
    }
307
3.61k
  else if (c < 0x4000000)
308
557
    {
309
557
      first = 0xf8;
310
557
      len = 5;
311
557
    }
312
3.06k
  else
313
3.06k
    {
314
3.06k
      first = 0xfc;
315
3.06k
      len = 6;
316
3.06k
    }
317
318
842k
  if (outbuf)
319
842k
    {
320
1.74M
      for (i = len - 1; i > 0; --i)
321
899k
  {
322
899k
    outbuf[i] = (c & 0x3f) | 0x80;
323
899k
    c >>= 6;
324
899k
  }
325
842k
      outbuf[0] = c | first;
326
842k
    }
327
328
842k
  return len;
329
842k
}
330
331
/*
332
 * g_utf8_to_ucs4_fast:
333
 * @str: a UTF-8 encoded string
334
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
335
 *       then the string is nul-terminated.
336
 * @items_written: location to store the number of characters in the
337
 *                 result, or %NULL.
338
 *
339
 * Convert a string from UTF-8 to a 32-bit fixed width
340
 * representation as UCS-4, assuming valid UTF-8 input.
341
 * This function is roughly twice as fast as g_utf8_to_ucs4()
342
 * but does no error checking on the input. A trailing 0 character
343
 * will be added to the string after the converted text.
344
 *
345
 * Return value: a pointer to a newly allocated UCS-4 string.
346
 *               This value must be freed with g_free().
347
 **/
348
static gunichar *
349
g_utf8_to_ucs4_fast (const gchar *str, gssize len, gsize *items_written)
350
15.8k
{
351
15.8k
  gunichar *result;
352
15.8k
  gsize n_chars, i;
353
15.8k
  const gchar *p;
354
355
15.8k
  g_return_val_if_fail (str != NULL, NULL);
356
357
15.8k
  p = str;
358
15.8k
  n_chars = 0;
359
15.8k
  if (len < 0)
360
15.8k
    {
361
494k
      while (*p)
362
478k
  {
363
478k
    p = g_utf8_next_char (p);
364
478k
    ++n_chars;
365
478k
  }
366
15.8k
    }
367
0
  else
368
0
    {
369
0
      while (p < str + len && *p)
370
0
  {
371
0
    p = g_utf8_next_char (p);
372
0
    ++n_chars;
373
0
  }
374
0
    }
375
376
15.8k
  result = g_malloc (sizeof (gunichar) * (n_chars + 1));
377
15.8k
  if (!result)
378
0
    return NULL;
379
380
15.8k
  p = str;
381
494k
  for (i = 0; i < n_chars; i++)
382
478k
    {
383
478k
      gunichar wc = (guchar) * p++;
384
385
478k
      if (wc < 0x80)
386
138k
  {
387
138k
    result[i] = wc;
388
138k
  }
389
340k
      else
390
340k
  {
391
340k
    gunichar mask = 0x40;
392
393
340k
    if (G_UNLIKELY ((wc & mask) == 0))
394
0
      {
395
        /* It's an out-of-sequence 10xxxxxxx byte.
396
         * Rather than making an ugly hash of this and the next byte
397
         * and overrunning the buffer, it's more useful to treat it
398
         * with a replacement character */
399
0
        result[i] = 0xfffd;
400
0
        continue;
401
0
      }
402
403
340k
    do
404
590k
      {
405
590k
        wc <<= 6;
406
590k
        wc |= (guchar) (*p++) & 0x3f;
407
590k
        mask <<= 5;
408
590k
      }
409
590k
    while ((wc & mask) != 0);
410
411
340k
    wc &= mask - 1;
412
413
340k
    result[i] = wc;
414
340k
  }
415
478k
    }
416
15.8k
  result[i] = 0;
417
418
15.8k
  if (items_written)
419
14.0k
    *items_written = i;
420
421
15.8k
  return result;
422
15.8k
}
423
424
/*
425
 * g_ucs4_to_utf8:
426
 * @str: a UCS-4 encoded string
427
 * @len: the maximum length (number of characters) of @str to use.
428
 *       If @len < 0, then the string is nul-terminated.
429
 * @items_read: location to store number of characters read, or %NULL.
430
 * @items_written: location to store number of bytes written or %NULL.
431
 *                 The value here stored does not include the trailing 0
432
 *                 byte.
433
 * @error: location to store the error occurring, or %NULL to ignore
434
 *         errors. Any of the errors in #GConvertError other than
435
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
436
 *
437
 * Convert a string from a 32-bit fixed width representation as UCS-4.
438
 * to UTF-8. The result will be terminated with a 0 byte.
439
 *
440
 * Return value: a pointer to a newly allocated UTF-8 string.
441
 *               This value must be freed with g_free(). If an
442
 *               error occurs, %NULL will be returned and
443
 *               @error set. In that case, @items_read will be
444
 *               set to the position of the first invalid input
445
 *               character.
446
 **/
447
static gchar *
448
g_ucs4_to_utf8 (const gunichar *str,
449
    gsize len, gsize *items_read, gsize *items_written)
450
19.1k
{
451
19.1k
  gint result_length;
452
19.1k
  gchar *result = NULL;
453
19.1k
  gchar *p;
454
19.1k
  gsize i;
455
456
19.1k
  result_length = 0;
457
858k
  for (i = 0; i < len; i++)
458
841k
    {
459
841k
      if (!str[i])
460
1.65k
  break;
461
462
840k
      if (str[i] >= 0x80000000)
463
451
  goto err_out;
464
465
839k
      result_length += UTF8_LENGTH (str[i]);
466
839k
    }
467
468
18.7k
  result = g_malloc (result_length + 1);
469
18.7k
  if (!result)
470
0
    return NULL;
471
18.7k
  p = result;
472
473
18.7k
  i = 0;
474
858k
  while (p < result + result_length)
475
839k
    p += g_unichar_to_utf8 (str[i++], p);
476
477
18.7k
  *p = '\0';
478
479
18.7k
  if (items_written)
480
0
    *items_written = p - result;
481
482
19.1k
err_out:
483
19.1k
  if (items_read)
484
0
    *items_read = i;
485
486
19.1k
  return result;
487
18.7k
}
488
489
/* Code from GLIB gunidecomp.c starts here. */
490
491
/* decomp.c - Character decomposition.
492
 *
493
 *  Copyright (C) 1999, 2000 Tom Tromey
494
 *  Copyright 2000 Red Hat, Inc.
495
 *
496
 * The Gnome Library is free software; you can redistribute it and/or
497
 * modify it under the terms of the GNU Lesser General Public License as
498
 * published by the Free Software Foundation; either version 2 of the
499
 * License, or (at your option) any later version.
500
 *
501
 * The Gnome Library is distributed in the hope that it will be useful,
502
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
503
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
504
 * Lesser General Public License for more details.
505
 */
506
507
#include "gunidecomp.h"
508
#include "gunicomp.h"
509
510
/* Shared page lookup.  A page entry at or above
   G_UNICODE_MAX_TABLE_INDEX stands for a single class shared by the
   whole page (the entry minus that constant); any other entry selects
   a row of cclass_data that is then indexed by Char. */
#define CC_LOOKUP(Table, Page, Char)				\
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX)			\
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX)		\
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char)					\
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char)					\
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of Char: part 1 covers code points up to
   G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
   G_UNICODE_LAST_CHAR, and everything else yields 0.  Char is
   evaluated more than once. */
#define COMBINING_CLASS(Char)					\
  (((Char) > G_UNICODE_LAST_CHAR_PART1)				\
   ? (((Char) < 0xe0000 || (Char) > G_UNICODE_LAST_CHAR)	\
      ? 0							\
      : CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff))	\
   : CC_PART1 ((Char) >> 8, (Char) & 0xff))
526
527
/* Constants for Hangul syllable [de]composition: the first code point
   of the precomposed syllables and of the L, V and T jamo ranges,
   followed by the sizes of those ranges and the derived counts. */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = (VCount * TCount),
  SCount = (LCount * NCount)
};
537
538
/*
539
 * g_unicode_canonical_ordering:
540
 * @string: a UCS-4 encoded string.
541
 * @len: the maximum length of @string to use.
542
 *
543
 * Computes the canonical ordering of a string in-place.
544
 * This rearranges decomposed characters in the string
545
 * according to their combining classes.  See the Unicode
546
 * manual for more information.
547
 **/
548
static void
549
g_unicode_canonical_ordering (gunichar *string, gsize len)
550
360k
{
551
360k
  gsize i;
552
360k
  int swap = 1;
553
554
721k
  while (swap)
555
361k
    {
556
361k
      int last;
557
361k
      swap = 0;
558
361k
      last = COMBINING_CLASS (string[0]);
559
7.93M
      for (i = 0; i < len - 1; ++i)
560
7.57M
  {
561
7.57M
    int next = COMBINING_CLASS (string[i + 1]);
562
7.57M
    if (next != 0 && last > next)
563
7.92k
      {
564
7.92k
        gsize j;
565
        /* Percolate item leftward through string.  */
566
19.9k
        for (j = i + 1; j > 0; --j)
567
19.5k
    {
568
19.5k
      gunichar t;
569
19.5k
      if (COMBINING_CLASS (string[j - 1]) <= next)
570
7.51k
        break;
571
12.0k
      t = string[j];
572
12.0k
      string[j] = string[j - 1];
573
12.0k
      string[j - 1] = t;
574
12.0k
      swap = 1;
575
12.0k
    }
576
        /* We're re-entering the loop looking at the old
577
           character again.  */
578
7.92k
        next = last;
579
7.92k
      }
580
7.57M
    last = next;
581
7.57M
  }
582
361k
    }
583
360k
}
584
585
/* http://www.unicode.org/unicode/reports/tr15/#Hangul
586
 * r should be null or have sufficient space. Calling with r == NULL will
587
 * only calculate the result_len; however, a buffer with space for three
588
 * characters will always be big enough. */
589
static void
590
decompose_hangul (gunichar s, gunichar *r, gsize *result_len)
591
6.51k
{
592
6.51k
  gint SIndex = s - SBase;
593
6.51k
  gint TIndex = SIndex % TCount;
594
595
6.51k
  if (r)
596
3.25k
    {
597
3.25k
      r[0] = LBase + SIndex / NCount;
598
3.25k
      r[1] = VBase + (SIndex % NCount) / TCount;
599
3.25k
    }
600
601
6.51k
  if (TIndex)
602
2.53k
    {
603
2.53k
      if (r)
604
1.26k
  r[2] = TBase + TIndex;
605
2.53k
      *result_len = 3;
606
2.53k
    }
607
3.97k
  else
608
3.97k
    *result_len = 2;
609
6.51k
}
610
611
/* returns a pointer to a null-terminated UTF-8 string */
612
static const gchar *
613
find_decomposition (gunichar ch, gboolean compat)
614
848k
{
615
848k
  int start = 0;
616
848k
  int end = G_N_ELEMENTS (decomp_table);
617
618
848k
  if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
619
636k
    {
620
7.51M
      while (TRUE)
621
7.51M
  {
622
7.51M
    int half = (start + end) / 2;
623
7.51M
    if (ch == decomp_table[half].ch)
624
613k
      {
625
613k
        int offset;
626
627
613k
        if (compat)
628
613k
    {
629
613k
      offset = decomp_table[half].compat_offset;
630
613k
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
631
161k
        offset = decomp_table[half].canon_offset;
632
613k
    }
633
0
        else
634
0
    {
635
0
      offset = decomp_table[half].canon_offset;
636
0
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
637
0
        return NULL;
638
0
    }
639
640
613k
        return &(decomp_expansion_string[offset]);
641
613k
      }
642
6.89M
    else if (half == start)
643
23.2k
      break;
644
6.87M
    else if (ch > decomp_table[half].ch)
645
3.54M
      start = half;
646
3.32M
    else
647
3.32M
      end = half;
648
7.51M
  }
649
636k
    }
650
651
234k
  return NULL;
652
848k
}
653
654
/* L,V => LV and LV,T => LVT  */
655
static gboolean
656
combine_hangul (gunichar a, gunichar b, gunichar *result)
657
3.87M
{
658
3.87M
  if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
659
3.53k
    {
660
3.53k
      gint LIndex = a - LBase;
661
3.53k
      gint VIndex = b - VBase;
662
663
3.53k
      *result = SBase + (LIndex * VCount + VIndex) * TCount;
664
3.53k
      return TRUE;
665
3.53k
    }
666
667
3.87M
  if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
668
1.48k
    {
669
1.48k
      gint SIndex = a - SBase;
670
671
1.48k
      if ((SIndex % TCount) == 0)
672
1.28k
  {
673
1.28k
    gint TIndex = b - TBase;
674
675
1.28k
    *result = a + TIndex;
676
1.28k
    return TRUE;
677
1.28k
  }
678
1.48k
    }
679
680
3.87M
  return FALSE;
681
3.87M
}
682
683
/* Composition index for position Char within Page.  A compose_table
   entry at or above G_UNICODE_MAX_TABLE_INDEX stands for one index
   shared by the whole page (the entry minus that constant); any other
   entry selects a row of compose_data. */
#define CI(Page, Char)					\
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX)	\
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX)	\
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of code point Char, or 0 when its page lies beyond
   COMPOSE_TABLE_LAST.  Every use of Char is parenthesized so that an
   expression argument is shifted and masked as a whole. */
#define COMPOSE_INDEX(Char)						\
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST))				\
   ? 0 : CI ((Char) >> 8, (Char) & 0xff))
690
691
static gboolean
692
combine (gunichar a, gunichar b, gunichar *result)
693
3.87M
{
694
3.87M
  gushort index_a, index_b;
695
696
3.87M
  if (combine_hangul (a, b, result))
697
4.81k
    return TRUE;
698
699
3.87M
  index_a = COMPOSE_INDEX (a);
700
701
3.87M
  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
702
441k
    {
703
441k
      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
704
1.48k
  {
705
1.48k
    *result =
706
1.48k
      compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
707
1.48k
    return TRUE;
708
1.48k
  }
709
440k
      else
710
440k
  return FALSE;
711
441k
    }
712
713
3.42M
  index_b = COMPOSE_INDEX (b);
714
715
3.42M
  if (index_b >= COMPOSE_SECOND_SINGLE_START)
716
1.64k
    {
717
1.64k
      if (a ==
718
1.64k
    compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
719
1.03k
  {
720
1.03k
    *result =
721
1.03k
      compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
722
1.03k
    return TRUE;
723
1.03k
  }
724
608
      else
725
608
  return FALSE;
726
1.64k
    }
727
728
3.42M
  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
729
247k
      && index_b >= COMPOSE_SECOND_START
730
4.70k
      && index_b < COMPOSE_SECOND_SINGLE_START)
731
4.70k
    {
732
4.70k
      gunichar res =
733
4.70k
  compose_array[index_a - COMPOSE_FIRST_START][index_b -
734
4.70k
                 COMPOSE_SECOND_START];
735
736
4.70k
      if (res)
737
4.31k
  {
738
4.31k
    *result = res;
739
4.31k
    return TRUE;
740
4.31k
  }
741
4.70k
    }
742
743
3.42M
  return FALSE;
744
3.42M
}
745
746
static gunichar *
747
_g_utf8_normalize_wc (const gchar *str, gssize max_len, GNormalizeMode mode)
748
13.8k
{
749
13.8k
  gsize n_wc;
750
13.8k
  gunichar *wc_buffer;
751
13.8k
  const char *p;
752
13.8k
  gsize last_start;
753
13.8k
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
754
13.8k
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
755
756
13.8k
  n_wc = 0;
757
13.8k
  p = str;
758
441k
  while ((max_len < 0 || p < str + max_len) && *p)
759
427k
    {
760
427k
      const gchar *decomp;
761
427k
      gunichar wc = g_utf8_get_char (p);
762
763
427k
      if (wc >= SBase && wc < SBase + SCount)
764
3.25k
  {
765
3.25k
    gsize result_len;
766
3.25k
    decompose_hangul (wc, NULL, &result_len);
767
3.25k
    n_wc += result_len;
768
3.25k
  }
769
424k
      else
770
424k
  {
771
424k
    decomp = find_decomposition (wc, do_compat);
772
773
424k
    if (decomp)
774
306k
      n_wc += g_utf8_strlen (decomp);
775
117k
    else
776
117k
      n_wc++;
777
424k
  }
778
779
427k
      p = g_utf8_next_char (p);
780
427k
    }
781
782
13.8k
  wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
783
13.8k
  if (!wc_buffer)
784
0
    return NULL;
785
786
13.8k
  last_start = 0;
787
13.8k
  n_wc = 0;
788
13.8k
  p = str;
789
441k
  while ((max_len < 0 || p < str + max_len) && *p)
790
427k
    {
791
427k
      gunichar wc = g_utf8_get_char (p);
792
427k
      const gchar *decomp;
793
427k
      int cc;
794
427k
      gsize old_n_wc = n_wc;
795
796
427k
      if (wc >= SBase && wc < SBase + SCount)
797
3.25k
  {
798
3.25k
    gsize result_len;
799
3.25k
    decompose_hangul (wc, wc_buffer + n_wc, &result_len);
800
3.25k
    n_wc += result_len;
801
3.25k
  }
802
424k
      else
803
424k
  {
804
424k
    decomp = find_decomposition (wc, do_compat);
805
806
424k
    if (decomp)
807
306k
      {
808
306k
        const char *pd;
809
4.22M
        for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
810
3.91M
    wc_buffer[n_wc++] = g_utf8_get_char (pd);
811
306k
      }
812
117k
    else
813
117k
      wc_buffer[n_wc++] = wc;
814
424k
  }
815
816
427k
      if (n_wc > 0)
817
427k
  {
818
427k
    cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
819
820
427k
    if (cc == 0)
821
346k
      {
822
346k
        g_unicode_canonical_ordering (wc_buffer + last_start,
823
346k
              n_wc - last_start);
824
346k
        last_start = old_n_wc;
825
346k
      }
826
427k
  }
827
828
427k
      p = g_utf8_next_char (p);
829
427k
    }
830
831
13.8k
  if (n_wc > 0)
832
13.3k
    {
833
13.3k
      g_unicode_canonical_ordering (wc_buffer + last_start,
834
13.3k
            n_wc - last_start);
835
      /* dead assignment: last_start = n_wc; */
836
13.3k
    }
837
838
13.8k
  wc_buffer[n_wc] = 0;
839
840
  /* All decomposed and reordered */
841
842
13.8k
  if (do_compose && n_wc > 0)
843
13.3k
    {
844
13.3k
      gsize i, j;
845
13.3k
      int last_cc = 0;
846
13.3k
      last_start = 0;
847
848
4.05M
      for (i = 0; i < n_wc; i++)
849
4.03M
  {
850
4.03M
    int cc = COMBINING_CLASS (wc_buffer[i]);
851
852
4.03M
    if (i > 0 &&
853
4.02M
        (last_cc == 0 || last_cc != cc) &&
854
3.87M
        combine (wc_buffer[last_start], wc_buffer[i],
855
3.87M
           &wc_buffer[last_start]))
856
11.6k
      {
857
1.79M
        for (j = i + 1; j < n_wc; j++)
858
1.78M
    wc_buffer[j - 1] = wc_buffer[j];
859
11.6k
        n_wc--;
860
11.6k
        i--;
861
862
11.6k
        if (i == last_start)
863
10.6k
    last_cc = 0;
864
951
        else
865
951
    last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
866
867
11.6k
        continue;
868
11.6k
      }
869
870
4.02M
    if (cc == 0)
871
3.86M
      last_start = i;
872
873
4.02M
    last_cc = cc;
874
4.02M
  }
875
13.3k
    }
876
877
13.8k
  wc_buffer[n_wc] = 0;
878
879
13.8k
  return wc_buffer;
880
13.8k
}
881
882
/*
883
 * g_utf8_normalize:
884
 * @str: a UTF-8 encoded string.
885
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
886
 * @mode: the type of normalization to perform.
887
 *
888
 * Converts a string into canonical form, standardizing
889
 * such issues as whether a character with an accent
890
 * is represented as a base character and combining
891
 * accent or as a single precomposed character. The
892
 * string has to be valid UTF-8, otherwise %NULL is
893
 * returned. You should generally call g_utf8_normalize()
894
 * before comparing two Unicode strings.
895
 *
896
 * The normalization mode %G_NORMALIZE_DEFAULT only
897
 * standardizes differences that do not affect the
898
 * text content, such as the above-mentioned accent
899
 * representation. %G_NORMALIZE_ALL also standardizes
900
 * the "compatibility" characters in Unicode, such
901
 * as SUPERSCRIPT THREE to the standard forms
902
 * (in this case DIGIT THREE). Formatting information
903
 * may be lost but for most text operations such
904
 * characters should be considered the same.
905
 *
906
 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
907
 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
908
 * but returned a result with composed forms rather
909
 * than a maximally decomposed form. This is often
910
 * useful if you intend to convert the string to
911
 * a legacy encoding or pass it to a system with
912
 * less capable Unicode handling.
913
 *
914
 * Return value: a newly allocated string, that is the
915
 *   normalized form of @str, or %NULL if @str is not
916
 *   valid UTF-8.
917
 **/
918
static gchar *
919
g_utf8_normalize (const gchar *str, gssize len, GNormalizeMode mode)
920
1.65k
{
921
1.65k
  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
922
1.65k
  gchar *result = NULL;
923
924
1.65k
  if (result_wc)
925
1.65k
    result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
926
927
1.65k
  g_free (result_wc);
928
929
1.65k
  return result;
930
1.65k
}
931
932
/* Public Libidn API starts here. */
933
934
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * If @p does not point to a valid UTF-8 encoded character, results are
 * undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  const uint32_t wc = g_utf8_get_char (p);

  return wc;
}
949
950
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
966
967
#include <unistr.h>
968
969
/**
 * stringprep_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is nul-terminated.
 * @items_written: location to store the number of characters in the
 *                 result, or %NULL.
 *
 * Convert a string from UTF-8 to a 32-bit fixed width representation
 * as UCS-4.  The function now performs error checking to verify that
 * the input is valid UTF-8 (before it was documented to not do error
 * checking).
 *
 * Return value: a pointer to a newly allocated UCS-4 string, or
 *               %NULL if the input fails the UTF-8 check or the
 *               conversion returns %NULL.
 *               This value must be deallocated by the caller.
 **/
uint32_t *
stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
{
  /* Number of bytes to validate before converting. */
  const size_t n_bytes = (len < 0) ? strlen (str) : (size_t) len;

  if (u8_check ((const uint8_t *) str, n_bytes) != NULL)
    return NULL;

  return g_utf8_to_ucs4_fast (str, len, items_written);
}
1000
1001
/**
 * stringprep_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of characters read, or %NULL.
 * @items_written: location to store number of bytes written or %NULL.
 *                 The value here stored does not include the trailing 0
 *                 byte.
 *
 * Convert a string from a 32-bit fixed width representation as UCS-4.
 * to UTF-8. The result will be terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be deallocated by the caller.
 *               If an error occurs, %NULL will be returned.
 **/
char *
stringprep_ucs4_to_utf8 (const uint32_t *str, ssize_t len,
			 size_t *items_read, size_t *items_written)
{
  /* The converter takes an unsigned length and always stops at a 0
     character, so a negative LEN turns into a bound that is never
     reached first. */
  const size_t max_chars = (size_t) len;

  return g_ucs4_to_utf8 (str, max_chars, items_read, items_written);
}
1024
1025
/**
1026
 * stringprep_utf8_nfkc_normalize:
1027
 * @str: a UTF-8 encoded string.
1028
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1029
 *
1030
 * Converts a string into canonical form, standardizing
1031
 * such issues as whether a character with an accent
1032
 * is represented as a base character and combining
1033
 * accent or as a single precomposed character.
1034
 *
1035
 * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1036
 * differences that do not affect the text content, such as the
1037
 * above-mentioned accent representation. It standardizes the
1038
 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1039
 * the standard forms (in this case DIGIT THREE). Formatting
1040
 * information may be lost but for most text operations such
1041
 * characters should be considered the same. It returns a result with
1042
 * composed forms rather than a maximally decomposed form.
1043
 *
1044
 * Return value: a newly allocated string, that is the
1045
 *   NFKC normalized form of @str.
1046
 **/
1047
char *
1048
stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1049
2.48k
{
1050
2.48k
  size_t n;
1051
1052
2.48k
  if (len < 0)
1053
0
    n = strlen (str);
1054
2.48k
  else
1055
2.48k
    n = len;
1056
1057
2.48k
  if (u8_check ((const uint8_t *) str, n))
1058
831
    return NULL;
1059
1060
1.65k
  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1061
2.48k
}
1062
1063
#include <stdio.h>
1064
/**
1065
 * stringprep_ucs4_nfkc_normalize:
1066
 * @str: a Unicode string.
1067
 * @len: length of @str array, or -1 if @str is nul-terminated.
1068
 *
1069
 * Converts a UCS4 string into canonical form, see
1070
 * stringprep_utf8_nfkc_normalize() for more information.
1071
 *
1072
 * Return value: a newly allocated Unicode string, that is the NFKC
1073
 *   normalized form of @str.
1074
 **/
1075
uint32_t *
1076
stringprep_ucs4_nfkc_normalize (const uint32_t *str, ssize_t len)
1077
12.6k
{
1078
12.6k
  char *p;
1079
12.6k
  uint32_t *result_wc;
1080
1081
12.6k
  p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1082
12.6k
  if (!p)
1083
451
    return NULL;
1084
1085
12.1k
  result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1086
12.1k
  free (p);
1087
1088
12.1k
  return result_wc;
1089
12.6k
}