Coverage Report

Created: 2024-04-23 06:37

/src/libidn/lib/nfkc.c
Line
Count
Source (jump to first uncovered line)
1
/* nfkc.c --- Unicode normalization utilities.
2
   Copyright (C) 2002-2024 Simon Josefsson
3
4
   This file is part of GNU Libidn.
5
6
   GNU Libidn is free software: you can redistribute it and/or
7
   modify it under the terms of either:
8
9
     * the GNU Lesser General Public License as published by the Free
10
       Software Foundation; either version 3 of the License, or (at
11
       your option) any later version.
12
13
   or
14
15
     * the GNU General Public License as published by the Free
16
       Software Foundation; either version 2 of the License, or (at
17
       your option) any later version.
18
19
   or both in parallel, as here.
20
21
   GNU Libidn is distributed in the hope that it will be useful,
22
   but WITHOUT ANY WARRANTY; without even the implied warranty of
23
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24
   General Public License for more details.
25
26
   You should have received copies of the GNU General Public License and
27
   the GNU Lesser General Public License along with this program.  If
28
   not, see <https://www.gnu.org/licenses/>. */
29
30
#ifdef HAVE_CONFIG_H
31
# include "config.h"
32
#endif
33
34
#include <stdlib.h>
35
#include <string.h>
36
37
#include "stringprep.h"
38
39
/* Hacks to make syncing with GLIB code easier. */
40
29.1k
#define gboolean int
41
41.3k
#define gchar char
42
#define guchar unsigned char
43
47.7k
#define gint int
44
983k
#define guint unsigned int
45
4.33M
#define gushort unsigned short
46
#define gint16 int16_t
47
#define guint16 uint16_t
48
7.31M
#define gunichar uint32_t
49
1.35M
#define gsize size_t
50
#define gssize ssize_t
51
50.6k
#define g_malloc malloc
52
1.70k
#define g_free free
53
373k
/* Return VAL from the enclosing function when EXPR is false.  Wrapped in
   do { } while (0) so the macro expands to a single statement and can be
   used safely as the body of an if/else without braces. */
#define g_return_val_if_fail(expr,val)		\
  do						\
    {						\
      if (!(expr))				\
	return (val);				\
    }						\
  while (0)
57
58
/* Code from GLIB gmacros.h starts here. */
59
60
/* GLIB - Library of useful routines for C programming
61
 * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
62
 *
63
 * This library is free software; you can redistribute it and/or
64
 * modify it under the terms of the GNU Lesser General Public
65
 * License as published by the Free Software Foundation; either
66
 * version 2 of the License, or (at your option) any later version.
67
 *
68
 * This library is distributed in the hope that it will be useful,
69
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
70
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
71
 * Lesser General Public License for more details.
72
 *
73
 * You should have received a copy of the GNU Lesser General Public
74
 * License along with this library; if not, write to the
75
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
76
 * Boston, MA 02111-1307, USA.
77
 */
78
79
#ifndef FALSE
80
17.4M
# define  FALSE (0)
81
#endif
82
83
#ifndef TRUE
84
8.74M
# define  TRUE  (!FALSE)
85
#endif
86
87
963k
#define G_N_ELEMENTS(arr)   (sizeof (arr) / sizeof ((arr)[0]))
88
89
395k
#define G_UNLIKELY(expr) (expr)
90
91
/* Code from GLIB gunicode.h starts here. */
92
93
/* gunicode.h - Unicode manipulation functions
94
 *
95
 *  Copyright (C) 1999, 2000 Tom Tromey
96
 *  Copyright 2000, 2005 Red Hat, Inc.
97
 *
98
 * The Gnome Library is free software; you can redistribute it and/or
99
 * modify it under the terms of the GNU Lesser General Public License as
100
 * published by the Free Software Foundation; either version 2 of the
101
 * License, or (at your option) any later version.
102
 *
103
 * The Gnome Library is distributed in the hope that it will be useful,
104
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
105
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
106
 * Lesser General Public License for more details.
107
 *
108
 * You should have received a copy of the GNU Lesser General Public
109
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
110
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
111
 *   Boston, MA 02111-1307, USA.
112
 */
113
114
/* Normalization modes.  Each GLIB name has a Unicode-form alias with the
   same numeric value. */
typedef enum
{
  /* GLIB names. */
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_ALL_COMPOSE = 3,

  /* Unicode normalization form aliases. */
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
126
127
10.2M
#define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
128
129
/* Code from GLIB gutf8.c starts here. */
130
131
/* gutf8.c - Operations on UTF-8 strings.
132
 *
133
 * Copyright (C) 1999 Tom Tromey
134
 * Copyright (C) 2000 Red Hat, Inc.
135
 *
136
 * This library is free software; you can redistribute it and/or
137
 * modify it under the terms of the GNU Lesser General Public
138
 * License as published by the Free Software Foundation; either
139
 * version 2 of the License, or (at your option) any later version.
140
 *
141
 * This library is distributed in the hope that it will be useful,
142
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
143
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
144
 * Lesser General Public License for more details.
145
 *
146
 * You should have received a copy of the GNU Lesser General Public
147
 * License along with this library; if not, write to the
148
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
149
 * Boston, MA 02111-1307, USA.
150
 */
151
152
/* Classify the UTF-8 lead byte Char: store the sequence length (1..6) in
   Len and the bit mask selecting the payload bits of the lead byte in
   Mask.  For a byte that cannot start a sequence, Len is set to -1 and
   Mask is left untouched.  The body is wrapped in do { } while (0) so the
   macro behaves as one statement (no dangling-else surprises). */
#define UTF8_COMPUTE(Char, Mask, Len)		\
  do						\
    {						\
      if ((Char) < 128)				\
	{					\
	  (Len) = 1;				\
	  (Mask) = 0x7f;			\
	}					\
      else if (((Char) & 0xe0) == 0xc0)		\
	{					\
	  (Len) = 2;				\
	  (Mask) = 0x1f;			\
	}					\
      else if (((Char) & 0xf0) == 0xe0)		\
	{					\
	  (Len) = 3;				\
	  (Mask) = 0x0f;			\
	}					\
      else if (((Char) & 0xf8) == 0xf0)		\
	{					\
	  (Len) = 4;				\
	  (Mask) = 0x07;			\
	}					\
      else if (((Char) & 0xfc) == 0xf8)		\
	{					\
	  (Len) = 5;				\
	  (Mask) = 0x03;			\
	}					\
      else if (((Char) & 0xfe) == 0xfc)		\
	{					\
	  (Len) = 6;				\
	  (Mask) = 0x01;			\
	}					\
      else					\
	(Len) = -1;				\
    }						\
  while (0)
185
186
/* Number of bytes (1..6) needed to encode the code point Char in UTF-8.
   Thresholds are tested from the largest range down. */
#define UTF8_LENGTH(Char)			\
  ((Char) >= 0x4000000 ? 6			\
   : (Char) >= 0x200000 ? 5			\
   : (Char) >= 0x10000 ? 4			\
   : (Char) >= 0x800 ? 3			\
   : (Char) >= 0x80 ? 2				\
   : 1)
192
193
/* Decode the Len-byte UTF-8 sequence at Chars into Result.  Mask selects
   the payload bits of the lead byte; Count is a scratch loop index.  If a
   trailing byte is not of the form 10xxxxxx, Result is set to -1 and
   decoding stops.  Wrapped in do { } while (0) so the macro is a single
   statement; the `break' only leaves the inner for loop. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)		\
  do								\
    {								\
      (Result) = (Chars)[0] & (Mask);				\
      for ((Count) = 1; (Count) < (Len); ++(Count))		\
	{							\
	  if (((Chars)[(Count)] & 0xc0) != 0x80)		\
	    {							\
	      (Result) = -1;					\
	      break;						\
	    }							\
	  (Result) <<= 6;					\
	  (Result) |= ((Chars)[(Count)] & 0x3f);		\
	}							\
    }								\
  while (0)
205
206
/* Sequence length implied by each possible lead byte, indexed by the byte
   value.  Bytes that cannot start a sequence map to 1. */
static const gchar utf8_skip_data[256] = {
  /* 0x00 - 0x7f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 - 0xbf */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 - 0xdf */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 - 0xef */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xf0 - 0xff */
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* Pointer alias for the table above; read through g_utf8_next_char(). */
static const gchar *const g_utf8_skip = utf8_skip_data;
226
227
/*
228
 * g_utf8_strlen:
229
 * @p: pointer to the start of a UTF-8 encoded string
230
 * @max: the maximum number of bytes to examine. If @max
231
 *       is less than 0, then the string is assumed to be
232
 *       nul-terminated. If @max is 0, @p will not be examined and
233
 *       may be %NULL.
234
 *
235
 * Computes the length of the string in characters, not including
236
 * the terminating nul character.
237
 *
238
 * Return value: the length of the string in characters
239
 **/
240
static gsize
241
g_utf8_strlen (const gchar *p)
242
356k
{
243
356k
  gsize len = 0;
244
245
356k
  g_return_val_if_fail (p != NULL, 0);
246
247
4.72M
  while (*p)
248
4.37M
    {
249
4.37M
      p = g_utf8_next_char (p);
250
4.37M
      ++len;
251
4.37M
    }
252
253
356k
  return len;
254
356k
}
255
256
/*
257
 * g_utf8_get_char:
258
 * @p: a pointer to Unicode character encoded as UTF-8
259
 *
260
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
261
 * If @p does not point to a valid UTF-8 encoded character, results are
262
 * undefined. If you are not sure that the bytes are complete
263
 * valid Unicode characters, you should use g_utf8_get_char_validated()
264
 * instead.
265
 *
266
 * Return value: the resulting character
267
 **/
268
static gunichar
269
g_utf8_get_char (const gchar *p)
270
5.34M
{
271
5.34M
  int i, mask = 0, len;
272
5.34M
  gunichar result;
273
5.34M
  unsigned char c = (unsigned char) *p;
274
275
5.34M
  UTF8_COMPUTE (c, mask, len);
276
5.34M
  if (len == -1)
277
187
    return (gunichar) - 1;
278
5.34M
  UTF8_GET (result, p, i, mask, len);
279
280
5.34M
  return result;
281
5.34M
}
282
283
/*
284
 * g_unichar_to_utf8:
285
 * @c: a Unicode character code
286
 * @outbuf: output buffer, must have at least 6 bytes of space.
287
 *       If %NULL, the length will be computed and returned
288
 *       and nothing will be written to @outbuf.
289
 *
290
 * Converts a single character to UTF-8.
291
 *
292
 * Return value: number of bytes written
293
 **/
294
static int
295
g_unichar_to_utf8 (gunichar c, gchar *outbuf)
296
983k
{
297
  /* If this gets modified, also update the copy in g_string_insert_unichar() */
298
983k
  guint len = 0;
299
983k
  int first;
300
983k
  int i;
301
302
983k
  if (c < 0x80)
303
228k
    {
304
228k
      first = 0;
305
228k
      len = 1;
306
228k
    }
307
754k
  else if (c < 0x800)
308
417k
    {
309
417k
      first = 0xc0;
310
417k
      len = 2;
311
417k
    }
312
337k
  else if (c < 0x10000)
313
329k
    {
314
329k
      first = 0xe0;
315
329k
      len = 3;
316
329k
    }
317
7.61k
  else if (c < 0x200000)
318
4.70k
    {
319
4.70k
      first = 0xf0;
320
4.70k
      len = 4;
321
4.70k
    }
322
2.91k
  else if (c < 0x4000000)
323
295
    {
324
295
      first = 0xf8;
325
295
      len = 5;
326
295
    }
327
2.62k
  else
328
2.62k
    {
329
2.62k
      first = 0xfc;
330
2.62k
      len = 6;
331
2.62k
    }
332
333
983k
  if (outbuf)
334
983k
    {
335
2.08M
      for (i = len - 1; i > 0; --i)
336
1.10M
  {
337
1.10M
    outbuf[i] = (c & 0x3f) | 0x80;
338
1.10M
    c >>= 6;
339
1.10M
  }
340
983k
      outbuf[0] = c | first;
341
983k
    }
342
343
983k
  return len;
344
983k
}
345
346
/*
347
 * g_utf8_to_ucs4_fast:
348
 * @str: a UTF-8 encoded string
349
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
350
 *       then the string is nul-terminated.
351
 * @items_written: location to store the number of characters in the
352
 *                 result, or %NULL.
353
 *
354
 * Convert a string from UTF-8 to a 32-bit fixed width
355
 * representation as UCS-4, assuming valid UTF-8 input.
356
 * This function is roughly twice as fast as g_utf8_to_ucs4()
357
 * but does no error checking on the input. A trailing 0 character
358
 * will be added to the string after the converted text.
359
 *
360
 * Return value: a pointer to a newly allocated UCS-4 string.
361
 *               This value must be freed with g_free().
362
 **/
363
static gunichar *
364
g_utf8_to_ucs4_fast (const gchar *str, gssize len, gsize *items_written)
365
16.6k
{
366
16.6k
  gunichar *result;
367
16.6k
  gsize n_chars, i;
368
16.6k
  const gchar *p;
369
370
16.6k
  g_return_val_if_fail (str != NULL, NULL);
371
372
16.6k
  p = str;
373
16.6k
  n_chars = 0;
374
16.6k
  if (len < 0)
375
16.6k
    {
376
558k
      while (*p)
377
541k
  {
378
541k
    p = g_utf8_next_char (p);
379
541k
    ++n_chars;
380
541k
  }
381
16.6k
    }
382
0
  else
383
0
    {
384
0
      while (p < str + len && *p)
385
0
  {
386
0
    p = g_utf8_next_char (p);
387
0
    ++n_chars;
388
0
  }
389
0
    }
390
391
16.6k
  result = g_malloc (sizeof (gunichar) * (n_chars + 1));
392
16.6k
  if (!result)
393
0
    return NULL;
394
395
16.6k
  p = str;
396
558k
  for (i = 0; i < n_chars; i++)
397
541k
    {
398
541k
      gunichar wc = (guchar) * p++;
399
400
541k
      if (wc < 0x80)
401
146k
  {
402
146k
    result[i] = wc;
403
146k
  }
404
395k
      else
405
395k
  {
406
395k
    gunichar mask = 0x40;
407
408
395k
    if (G_UNLIKELY ((wc & mask) == 0))
409
0
      {
410
        /* It's an out-of-sequence 10xxxxxxx byte.
411
         * Rather than making an ugly hash of this and the next byte
412
         * and overrunning the buffer, it's more useful to treat it
413
         * with a replacement character */
414
0
        result[i] = 0xfffd;
415
0
        continue;
416
0
      }
417
418
395k
    do
419
698k
      {
420
698k
        wc <<= 6;
421
698k
        wc |= (guchar) (*p++) & 0x3f;
422
698k
        mask <<= 5;
423
698k
      }
424
698k
    while ((wc & mask) != 0);
425
426
395k
    wc &= mask - 1;
427
428
395k
    result[i] = wc;
429
395k
  }
430
541k
    }
431
16.6k
  result[i] = 0;
432
433
16.6k
  if (items_written)
434
14.8k
    *items_written = i;
435
436
16.6k
  return result;
437
16.6k
}
438
439
/*
440
 * g_ucs4_to_utf8:
441
 * @str: a UCS-4 encoded string
442
 * @len: the maximum length (number of characters) of @str to use.
443
 *       If @len < 0, then the string is nul-terminated.
444
 * @items_read: location to store number of characters read, or %NULL.
445
 * @items_written: location to store number of bytes written or %NULL.
446
 *                 The value here stored does not include the trailing 0
447
 *                 byte.
448
 * @error: location to store the error occurring, or %NULL to ignore
449
 *         errors. Any of the errors in #GConvertError other than
450
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
451
 *
452
 * Convert a string from a 32-bit fixed width representation as UCS-4.
453
 * to UTF-8. The result will be terminated with a 0 byte.
454
 *
455
 * Return value: a pointer to a newly allocated UTF-8 string.
456
 *               This value must be freed with g_free(). If an
457
 *               error occurs, %NULL will be returned and
458
 *               @error set. In that case, @items_read will be
459
 *               set to the position of the first invalid input
460
 *               character.
461
 **/
462
static gchar *
463
g_ucs4_to_utf8 (const gunichar *str,
464
    gsize len, gsize *items_read, gsize *items_written)
465
19.8k
{
466
19.8k
  gint result_length;
467
19.8k
  gchar *result = NULL;
468
19.8k
  gchar *p;
469
19.8k
  gsize i;
470
471
19.8k
  result_length = 0;
472
1.00M
  for (i = 0; i < len; i++)
473
982k
    {
474
982k
      if (!str[i])
475
1.70k
  break;
476
477
981k
      if (str[i] >= 0x80000000)
478
405
  goto err_out;
479
480
980k
      result_length += UTF8_LENGTH (str[i]);
481
980k
    }
482
483
19.4k
  result = g_malloc (result_length + 1);
484
19.4k
  if (!result)
485
0
    return NULL;
486
19.4k
  p = result;
487
488
19.4k
  i = 0;
489
1.00M
  while (p < result + result_length)
490
980k
    p += g_unichar_to_utf8 (str[i++], p);
491
492
19.4k
  *p = '\0';
493
494
19.4k
  if (items_written)
495
0
    *items_written = p - result;
496
497
19.8k
err_out:
498
19.8k
  if (items_read)
499
0
    *items_read = i;
500
501
19.8k
  return result;
502
19.4k
}
503
504
/* Code from GLIB gunidecomp.c starts here. */
505
506
/* decomp.c - Character decomposition.
507
 *
508
 *  Copyright (C) 1999, 2000 Tom Tromey
509
 *  Copyright 2000 Red Hat, Inc.
510
 *
511
 * The Gnome Library is free software; you can redistribute it and/or
512
 * modify it under the terms of the GNU Lesser General Public License as
513
 * published by the Free Software Foundation; either version 2 of the
514
 * License, or (at your option) any later version.
515
 *
516
 * The Gnome Library is distributed in the hope that it will be useful,
517
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
518
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
519
 * Lesser General Public License for more details.
520
 *
521
 * You should have received a copy of the GNU Lesser General Public
522
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
523
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
524
 *   Boston, MA 02111-1307, USA.
525
 */
526
527
#include "gunidecomp.h"
528
#include "gunicomp.h"
529
530
#define CC_PART1(Page, Char)            \
531
13.8M
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
532
13.8M
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX)  \
533
13.8M
   : (cclass_data[combining_class_table_part1[Page]][Char]))
534
535
#define CC_PART2(Page, Char)            \
536
5.37k
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
537
5.37k
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
538
5.37k
   : (cclass_data[combining_class_table_part2[Page]][Char]))
539
540
#define COMBINING_CLASS(Char)         \
541
13.9M
  (((Char) <= G_UNICODE_LAST_CHAR_PART1)     \
542
13.9M
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff)     \
543
13.9M
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
544
20.1k
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
545
20.1k
      : 0))
546
547
/* constants for hangul syllable [de]composition */
548
15.4M
#define SBase 0xAC00
549
13.3M
#define LBase 0x1100
550
4.35M
#define VBase 0x1161
551
4.34M
#define TBase 0x11A7
552
811k
#define LCount 19
553
488k
#define VCount 21
554
499k
#define TCount 28
555
479k
#define NCount (VCount * TCount)
556
471k
#define SCount (LCount * NCount)
557
558
/*
559
 * g_unicode_canonical_ordering:
560
 * @string: a UCS-4 encoded string.
561
 * @len: the maximum length of @string to use.
562
 *
563
 * Computes the canonical ordering of a string in-place.
564
 * This rearranges decomposed characters in the string
565
 * according to their combining classes.  See the Unicode
566
 * manual for more information.
567
 **/
568
static void
569
g_unicode_canonical_ordering (gunichar *string, gsize len)
570
418k
{
571
418k
  gsize i;
572
418k
  int swap = 1;
573
574
838k
  while (swap)
575
420k
    {
576
420k
      int last;
577
420k
      swap = 0;
578
420k
      last = COMBINING_CLASS (string[0]);
579
8.88M
      for (i = 0; i < len - 1; ++i)
580
8.46M
  {
581
8.46M
    int next = COMBINING_CLASS (string[i + 1]);
582
8.46M
    if (next != 0 && last > next)
583
8.01k
      {
584
8.01k
        gsize j;
585
        /* Percolate item leftward through string.  */
586
23.5k
        for (j = i + 1; j > 0; --j)
587
23.1k
    {
588
23.1k
      gunichar t;
589
23.1k
      if (COMBINING_CLASS (string[j - 1]) <= next)
590
7.59k
        break;
591
15.5k
      t = string[j];
592
15.5k
      string[j] = string[j - 1];
593
15.5k
      string[j - 1] = t;
594
15.5k
      swap = 1;
595
15.5k
    }
596
        /* We're re-entering the loop looking at the old
597
           character again.  */
598
8.01k
        next = last;
599
8.01k
      }
600
8.46M
    last = next;
601
8.46M
  }
602
420k
    }
603
418k
}
604
605
/* http://www.unicode.org/unicode/reports/tr15/#Hangul
606
 * r should be null or have sufficient space. Calling with r == NULL will
607
 * only calculate the result_len; however, a buffer with space for three
608
 * characters will always be big enough. */
609
static void
610
decompose_hangul (gunichar s, gunichar *r, gsize *result_len)
611
8.29k
{
612
8.29k
  gint SIndex = s - SBase;
613
8.29k
  gint TIndex = SIndex % TCount;
614
615
8.29k
  if (r)
616
4.14k
    {
617
4.14k
      r[0] = LBase + SIndex / NCount;
618
4.14k
      r[1] = VBase + (SIndex % NCount) / TCount;
619
4.14k
    }
620
621
8.29k
  if (TIndex)
622
2.26k
    {
623
2.26k
      if (r)
624
1.13k
  r[2] = TBase + TIndex;
625
2.26k
      *result_len = 3;
626
2.26k
    }
627
6.02k
  else
628
6.02k
    *result_len = 2;
629
8.29k
}
630
631
/* returns a pointer to a null-terminated UTF-8 string */
632
static const gchar *
633
find_decomposition (gunichar ch, gboolean compat)
634
963k
{
635
963k
  int start = 0;
636
963k
  int end = G_N_ELEMENTS (decomp_table);
637
638
963k
  if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
639
738k
    {
640
8.73M
      while (TRUE)
641
8.73M
  {
642
8.73M
    int half = (start + end) / 2;
643
8.73M
    if (ch == decomp_table[half].ch)
644
713k
      {
645
713k
        int offset;
646
647
713k
        if (compat)
648
713k
    {
649
713k
      offset = decomp_table[half].compat_offset;
650
713k
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
651
161k
        offset = decomp_table[half].canon_offset;
652
713k
    }
653
0
        else
654
0
    {
655
0
      offset = decomp_table[half].canon_offset;
656
0
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
657
0
        return NULL;
658
0
    }
659
660
713k
        return &(decomp_expansion_string[offset]);
661
713k
      }
662
8.01M
    else if (half == start)
663
24.6k
      break;
664
7.99M
    else if (ch > decomp_table[half].ch)
665
4.01M
      start = half;
666
3.97M
    else
667
3.97M
      end = half;
668
8.73M
  }
669
738k
    }
670
671
249k
  return NULL;
672
963k
}
673
674
/* L,V => LV and LV,T => LVT  */
675
static gboolean
676
combine_hangul (gunichar a, gunichar b, gunichar *result)
677
4.33M
{
678
4.33M
  if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
679
4.38k
    {
680
4.38k
      gint LIndex = a - LBase;
681
4.38k
      gint VIndex = b - VBase;
682
683
4.38k
      *result = SBase + (LIndex * VCount + VIndex) * TCount;
684
4.38k
      return TRUE;
685
4.38k
    }
686
687
4.33M
  if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
688
1.38k
    {
689
1.38k
      gint SIndex = a - SBase;
690
691
1.38k
      if ((SIndex % TCount) == 0)
692
1.18k
  {
693
1.18k
    gint TIndex = b - TBase;
694
695
1.18k
    *result = a + TIndex;
696
1.18k
    return TRUE;
697
1.18k
  }
698
1.38k
    }
699
700
4.33M
  return FALSE;
701
4.33M
}
702
703
#define CI(Page, Char)          \
704
8.10M
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
705
8.10M
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX)  \
706
8.10M
   : (compose_data[compose_table[Page]][Char]))
707
708
/* Composition index of Char: 0 when the page of Char (Char >> 8) lies
   beyond COMPOSE_TABLE_LAST, otherwise the value looked up through CI.
   Every use of the argument is parenthesized so that expression arguments
   expand correctly. */
#define COMPOSE_INDEX(Char)						\
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
710
711
static gboolean
712
combine (gunichar a, gunichar b, gunichar *result)
713
4.33M
{
714
4.33M
  gushort index_a, index_b;
715
716
4.33M
  if (combine_hangul (a, b, result))
717
5.57k
    return TRUE;
718
719
4.33M
  index_a = COMPOSE_INDEX (a);
720
721
4.33M
  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
722
538k
    {
723
538k
      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
724
1.72k
  {
725
1.72k
    *result =
726
1.72k
      compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
727
1.72k
    return TRUE;
728
1.72k
  }
729
537k
      else
730
537k
  return FALSE;
731
538k
    }
732
733
3.79M
  index_b = COMPOSE_INDEX (b);
734
735
3.79M
  if (index_b >= COMPOSE_SECOND_SINGLE_START)
736
689
    {
737
689
      if (a ==
738
689
    compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
739
301
  {
740
301
    *result =
741
301
      compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
742
301
    return TRUE;
743
301
  }
744
388
      else
745
388
  return FALSE;
746
689
    }
747
748
3.79M
  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
749
3.79M
      && index_b >= COMPOSE_SECOND_START
750
3.79M
      && index_b < COMPOSE_SECOND_SINGLE_START)
751
5.33k
    {
752
5.33k
      gunichar res =
753
5.33k
  compose_array[index_a - COMPOSE_FIRST_START][index_b -
754
5.33k
                 COMPOSE_SECOND_START];
755
756
5.33k
      if (res)
757
4.94k
  {
758
4.94k
    *result = res;
759
4.94k
    return TRUE;
760
4.94k
  }
761
5.33k
    }
762
763
3.78M
  return FALSE;
764
3.79M
}
765
766
static gunichar *
767
_g_utf8_normalize_wc (const gchar *str, gssize max_len, GNormalizeMode mode)
768
14.5k
{
769
14.5k
  gsize n_wc;
770
14.5k
  gunichar *wc_buffer;
771
14.5k
  const char *p;
772
14.5k
  gsize last_start;
773
14.5k
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
774
14.5k
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
775
776
14.5k
  n_wc = 0;
777
14.5k
  p = str;
778
500k
  while ((max_len < 0 || p < str + max_len) && *p)
779
485k
    {
780
485k
      const gchar *decomp;
781
485k
      gunichar wc = g_utf8_get_char (p);
782
783
485k
      if (wc >= SBase && wc < SBase + SCount)
784
4.14k
  {
785
4.14k
    gsize result_len;
786
4.14k
    decompose_hangul (wc, NULL, &result_len);
787
4.14k
    n_wc += result_len;
788
4.14k
  }
789
481k
      else
790
481k
  {
791
481k
    decomp = find_decomposition (wc, do_compat);
792
793
481k
    if (decomp)
794
356k
      n_wc += g_utf8_strlen (decomp);
795
124k
    else
796
124k
      n_wc++;
797
481k
  }
798
799
485k
      p = g_utf8_next_char (p);
800
485k
    }
801
802
14.5k
  wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
803
14.5k
  if (!wc_buffer)
804
0
    return NULL;
805
806
14.5k
  last_start = 0;
807
14.5k
  n_wc = 0;
808
14.5k
  p = str;
809
500k
  while ((max_len < 0 || p < str + max_len) && *p)
810
485k
    {
811
485k
      gunichar wc = g_utf8_get_char (p);
812
485k
      const gchar *decomp;
813
485k
      int cc;
814
485k
      gsize old_n_wc = n_wc;
815
816
485k
      if (wc >= SBase && wc < SBase + SCount)
817
4.14k
  {
818
4.14k
    gsize result_len;
819
4.14k
    decompose_hangul (wc, wc_buffer + n_wc, &result_len);
820
4.14k
    n_wc += result_len;
821
4.14k
  }
822
481k
      else
823
481k
  {
824
481k
    decomp = find_decomposition (wc, do_compat);
825
826
481k
    if (decomp)
827
356k
      {
828
356k
        const char *pd;
829
4.72M
        for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
830
4.37M
    wc_buffer[n_wc++] = g_utf8_get_char (pd);
831
356k
      }
832
124k
    else
833
124k
      wc_buffer[n_wc++] = wc;
834
481k
  }
835
836
485k
      if (n_wc > 0)
837
485k
  {
838
485k
    cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
839
840
485k
    if (cc == 0)
841
403k
      {
842
403k
        g_unicode_canonical_ordering (wc_buffer + last_start,
843
403k
              n_wc - last_start);
844
403k
        last_start = old_n_wc;
845
403k
      }
846
485k
  }
847
848
485k
      p = g_utf8_next_char (p);
849
485k
    }
850
851
14.5k
  if (n_wc > 0)
852
14.1k
    {
853
14.1k
      g_unicode_canonical_ordering (wc_buffer + last_start,
854
14.1k
            n_wc - last_start);
855
      /* dead assignment: last_start = n_wc; */
856
14.1k
    }
857
858
14.5k
  wc_buffer[n_wc] = 0;
859
860
  /* All decomposed and reordered */
861
862
14.5k
  if (do_compose && n_wc > 0)
863
14.1k
    {
864
14.1k
      gsize i, j;
865
14.1k
      int last_cc = 0;
866
14.1k
      last_start = 0;
867
868
4.51M
      for (i = 0; i < n_wc; i++)
869
4.50M
  {
870
4.50M
    int cc = COMBINING_CLASS (wc_buffer[i]);
871
872
4.50M
    if (i > 0 &&
873
4.50M
        (last_cc == 0 || last_cc != cc) &&
874
4.50M
        combine (wc_buffer[last_start], wc_buffer[i],
875
4.33M
           &wc_buffer[last_start]))
876
12.5k
      {
877
1.10M
        for (j = i + 1; j < n_wc; j++)
878
1.08M
    wc_buffer[j - 1] = wc_buffer[j];
879
12.5k
        n_wc--;
880
12.5k
        i--;
881
882
12.5k
        if (i == last_start)
883
11.2k
    last_cc = 0;
884
1.24k
        else
885
1.24k
    last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
886
887
12.5k
        continue;
888
12.5k
      }
889
890
4.49M
    if (cc == 0)
891
4.33M
      last_start = i;
892
893
4.49M
    last_cc = cc;
894
4.49M
  }
895
14.1k
    }
896
897
14.5k
  wc_buffer[n_wc] = 0;
898
899
14.5k
  return wc_buffer;
900
14.5k
}
901
902
/*
903
 * g_utf8_normalize:
904
 * @str: a UTF-8 encoded string.
905
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
906
 * @mode: the type of normalization to perform.
907
 *
908
 * Converts a string into canonical form, standardizing
909
 * such issues as whether a character with an accent
910
 * is represented as a base character and combining
911
 * accent or as a single precomposed character. The
912
 * string has to be valid UTF-8, otherwise %NULL is
913
 * returned. You should generally call g_utf8_normalize()
914
 * before comparing two Unicode strings.
915
 *
916
 * The normalization mode %G_NORMALIZE_DEFAULT only
917
 * standardizes differences that do not affect the
918
 * text content, such as the above-mentioned accent
919
 * representation. %G_NORMALIZE_ALL also standardizes
920
 * the "compatibility" characters in Unicode, such
921
 * as SUPERSCRIPT THREE to the standard forms
922
 * (in this case DIGIT THREE). Formatting information
923
 * may be lost but for most text operations such
924
 * characters should be considered the same.
925
 *
926
 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
927
 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
928
 * but returned a result with composed forms rather
929
 * than a maximally decomposed form. This is often
930
 * useful if you intend to convert the string to
931
 * a legacy encoding or pass it to a system with
932
 * less capable Unicode handling.
933
 *
934
 * Return value: a newly allocated string, that is the
935
 *   normalized form of @str, or %NULL if @str is not
936
 *   valid UTF-8.
937
 **/
938
static gchar *
939
g_utf8_normalize (const gchar *str, gssize len, GNormalizeMode mode)
940
1.70k
{
941
1.70k
  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
942
1.70k
  gchar *result = NULL;
943
944
1.70k
  if (result_wc)
945
1.70k
    result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
946
947
1.70k
  g_free (result_wc);
948
949
1.70k
  return result;
950
1.70k
}
951
952
/* Public Libidn API starts here. */
953
954
/**
955
 * stringprep_utf8_to_unichar:
956
 * @p: a pointer to Unicode character encoded as UTF-8
957
 *
958
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
959
 * If @p does not point to a valid UTF-8 encoded character, results are
960
 * undefined.
961
 *
962
 * Return value: the resulting character.
963
 **/
964
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  /* Public entry point; the decoding is done by g_utf8_get_char().  */
  const uint32_t decoded = g_utf8_get_char (p);

  return decoded;
}
969
970
/**
971
 * stringprep_unichar_to_utf8:
972
 * @c: an ISO10646 character code
973
 * @outbuf: output buffer, must have at least 6 bytes of space.
974
 *       If %NULL, the length will be computed and returned
975
 *       and nothing will be written to @outbuf.
976
 *
977
 * Converts a single character to UTF-8.
978
 *
979
 * Return value: number of bytes written.
980
 **/
981
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  /* Public entry point; the encoding is done by g_unichar_to_utf8().  */
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
986
987
#include <unistr.h>
988
989
/**
 * stringprep_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is nul-terminated.
 * @items_written: location to store the number of characters in the
 *                 result, or %NULL.
 *
 * Convert a string from UTF-8 to a 32-bit fixed width representation
 * as UCS-4.  The input is first checked for UTF-8 validity (earlier
 * versions were documented to not do error checking), and the
 * conversion is only attempted when that check succeeds.
 *
 * Return value: a pointer to a newly allocated UCS-4 string, or %NULL
 *               if @str did not pass the UTF-8 check.
 *               This value must be deallocated by the caller.
 **/
uint32_t *
stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
{
  /* Number of bytes to validate: the whole nul-terminated string when
     @len is negative, otherwise exactly @len bytes. */
  size_t nbytes = (len < 0) ? strlen (str) : (size_t) len;

  /* u8_check() yields a non-NULL pointer when the check fails. */
  if (u8_check ((const uint8_t *) str, nbytes) != NULL)
    return NULL;

  /* The converter receives the caller's original @len, not nbytes. */
  return g_utf8_to_ucs4_fast (str, len, items_written);
}
1020
1021
/**
 * stringprep_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length of @str to use. If @len < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of characters read, or %NULL.
 * @items_written: location to store number of bytes written or %NULL.
 *                 The value here stored does not include the trailing 0
 *                 byte.
 *
 * Convert a string from a 32-bit fixed width representation as UCS-4
 * to UTF-8. The result will be terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be deallocated by the caller.
 *               If an error occurs, %NULL will be returned.
 **/
char *
stringprep_ucs4_to_utf8 (const uint32_t *str, ssize_t len,
                         size_t *items_read, size_t *items_written)
{
  /* Thin public wrapper around the GLIB-derived converter. */
  char *utf8 = g_ucs4_to_utf8 (str, len, items_read, items_written);

  return utf8;
}
1044
1045
/**
1046
 * stringprep_utf8_nfkc_normalize:
1047
 * @str: a UTF-8 encoded string.
1048
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1049
 *
1050
 * Converts a string into canonical form, standardizing
1051
 * such issues as whether a character with an accent
1052
 * is represented as a base character and combining
1053
 * accent or as a single precomposed character.
1054
 *
1055
 * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1056
 * differences that do not affect the text content, such as the
1057
 * above-mentioned accent representation. It standardizes the
1058
 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1059
 * the standard forms (in this case DIGIT THREE). Formatting
1060
 * information may be lost but for most text operations such
1061
 * characters should be considered the same. It returns a result with
1062
 * composed forms rather than a maximally decomposed form.
1063
 *
1064
 * Return value: a newly allocated string, that is the
1065
 *   NFKC normalized form of @str.
1066
 **/
1067
char *
1068
stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1069
2.51k
{
1070
2.51k
  size_t n;
1071
1072
2.51k
  if (len < 0)
1073
0
    n = strlen (str);
1074
2.51k
  else
1075
2.51k
    n = len;
1076
1077
2.51k
  if (u8_check ((const uint8_t *) str, n))
1078
812
    return NULL;
1079
1080
1.70k
  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1081
2.51k
}
1082
1083
#include <stdio.h>
1084
/**
1085
 * stringprep_ucs4_nfkc_normalize:
1086
 * @str: a Unicode string.
1087
 * @len: length of @str array, or -1 if @str is nul-terminated.
1088
 *
1089
 * Converts a UCS4 string into canonical form, see
1090
 * stringprep_utf8_nfkc_normalize() for more information.
1091
 *
1092
 * Return value: a newly allocated Unicode string, that is the NFKC
1093
 *   normalized form of @str.
1094
 **/
1095
uint32_t *
1096
stringprep_ucs4_nfkc_normalize (const uint32_t *str, ssize_t len)
1097
13.2k
{
1098
13.2k
  char *p;
1099
13.2k
  uint32_t *result_wc;
1100
1101
13.2k
  p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1102
13.2k
  if (!p)
1103
405
    return NULL;
1104
1105
12.8k
  result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1106
12.8k
  free (p);
1107
1108
12.8k
  return result_wc;
1109
13.2k
}