Coverage Report

Created: 2023-06-07 07:17

/src/libidn/lib/nfkc.c
Line
Count
Source (jump to first uncovered line)
1
/* nfkc.c --- Unicode normalization utilities.
2
   Copyright (C) 2002-2023 Simon Josefsson
3
4
   This file is part of GNU Libidn.
5
6
   GNU Libidn is free software: you can redistribute it and/or
7
   modify it under the terms of either:
8
9
     * the GNU Lesser General Public License as published by the Free
10
       Software Foundation; either version 3 of the License, or (at
11
       your option) any later version.
12
13
   or
14
15
     * the GNU General Public License as published by the Free
16
       Software Foundation; either version 2 of the License, or (at
17
       your option) any later version.
18
19
   or both in parallel, as here.
20
21
   GNU Libidn is distributed in the hope that it will be useful,
22
   but WITHOUT ANY WARRANTY; without even the implied warranty of
23
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24
   General Public License for more details.
25
26
   You should have received copies of the GNU General Public License and
27
   the GNU Lesser General Public License along with this program.  If
28
   not, see <https://www.gnu.org/licenses/>. */
29
30
#ifdef HAVE_CONFIG_H
31
# include "config.h"
32
#endif
33
34
#include <stdlib.h>
35
#include <string.h>
36
37
#include "stringprep.h"
38
39
/* Hacks to make syncing with GLIB code easier. */
40
12.5M
#define gboolean int
41
37.2M
#define gchar char
42
#define guchar unsigned char
43
841k
#define glong long
44
37.2M
#define gint int
45
19.7M
#define guint unsigned int
46
2.10M
#define gushort unsigned short
47
#define gint16 int16_t
48
#define guint16 uint16_t
49
93.7M
#define gunichar uint32_t
50
57.1M
#define gsize size_t
51
#define gssize ssize_t
52
43.6M
#define g_malloc malloc
53
0
#define g_free free
54
19.5M
/* Return VAL from the enclosing function when EXPR evaluates to false.
   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and can be followed by a semicolon anywhere a
   statement is allowed, including as the unbraced body of an if that
   has an else branch. */
#define g_return_val_if_fail(expr,val)          \
  do                                            \
    {                                           \
      if (!(expr))                              \
        return (val);                           \
    }                                           \
  while (0)
58
59
/* Code from GLIB gmacros.h starts here. */
60
61
/* GLIB - Library of useful routines for C programming
62
 * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
63
 *
64
 * This library is free software; you can redistribute it and/or
65
 * modify it under the terms of the GNU Lesser General Public
66
 * License as published by the Free Software Foundation; either
67
 * version 2 of the License, or (at your option) any later version.
68
 *
69
 * This library is distributed in the hope that it will be useful,
70
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
71
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
72
 * Lesser General Public License for more details.
73
 *
74
 * You should have received a copy of the GNU Lesser General Public
75
 * License along with this library; if not, write to the
76
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
77
 * Boston, MA 02111-1307, USA.
78
 */
79
80
#ifndef FALSE
81
173M
# define  FALSE (0)
82
#endif
83
84
#ifndef TRUE
85
169M
# define  TRUE  (!FALSE)
86
#endif
87
88
13.3M
#define G_N_ELEMENTS(arr)   (sizeof (arr) / sizeof ((arr)[0]))
89
90
19.1M
#define G_UNLIKELY(expr) (expr)
91
92
/* Code from GLIB gunicode.h starts here. */
93
94
/* gunicode.h - Unicode manipulation functions
95
 *
96
 *  Copyright (C) 1999, 2000 Tom Tromey
97
 *  Copyright 2000, 2005 Red Hat, Inc.
98
 *
99
 * The Gnome Library is free software; you can redistribute it and/or
100
 * modify it under the terms of the GNU Lesser General Public License as
101
 * published by the Free Software Foundation; either version 2 of the
102
 * License, or (at your option) any later version.
103
 *
104
 * The Gnome Library is distributed in the hope that it will be useful,
105
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
106
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
107
 * Lesser General Public License for more details.
108
 *
109
 * You should have received a copy of the GNU Lesser General Public
110
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
111
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
112
 *   Boston, MA 02111-1307, USA.
113
 */
114
115
typedef enum
116
{
117
  G_NORMALIZE_DEFAULT,
118
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
119
  G_NORMALIZE_DEFAULT_COMPOSE,
120
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
121
  G_NORMALIZE_ALL,
122
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
123
  G_NORMALIZE_ALL_COMPOSE,
124
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
125
}
126
GNormalizeMode;
127
128
38.2M
#define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
129
130
/* Code from GLIB gutf8.c starts here. */
131
132
/* gutf8.c - Operations on UTF-8 strings.
133
 *
134
 * Copyright (C) 1999 Tom Tromey
135
 * Copyright (C) 2000 Red Hat, Inc.
136
 *
137
 * This library is free software; you can redistribute it and/or
138
 * modify it under the terms of the GNU Lesser General Public
139
 * License as published by the Free Software Foundation; either
140
 * version 2 of the License, or (at your option) any later version.
141
 *
142
 * This library is distributed in the hope that it will be useful,
143
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
144
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
145
 * Lesser General Public License for more details.
146
 *
147
 * You should have received a copy of the GNU Lesser General Public
148
 * License along with this library; if not, write to the
149
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
150
 * Boston, MA 02111-1307, USA.
151
 */
152
153
/* Classify the UTF-8 lead byte Char.
   On return Len holds the length in bytes of the sequence that the
   lead byte announces (1..6), and Mask holds the bit mask that selects
   the payload bits of the lead byte.  If Char cannot start a sequence,
   Len is set to -1 and Mask is left untouched.
   Char is evaluated several times, so it must have no side effects.
   Every argument is parenthesized and the whole body is a single
   do { } while (0) statement. */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  do                                            \
    {                                           \
      if ((Char) < 128)                         \
        {                                       \
          (Len) = 1;                            \
          (Mask) = 0x7f;                        \
        }                                       \
      else if (((Char) & 0xe0) == 0xc0)         \
        {                                       \
          (Len) = 2;                            \
          (Mask) = 0x1f;                        \
        }                                       \
      else if (((Char) & 0xf0) == 0xe0)         \
        {                                       \
          (Len) = 3;                            \
          (Mask) = 0x0f;                        \
        }                                       \
      else if (((Char) & 0xf8) == 0xf0)         \
        {                                       \
          (Len) = 4;                            \
          (Mask) = 0x07;                        \
        }                                       \
      else if (((Char) & 0xfc) == 0xf8)         \
        {                                       \
          (Len) = 5;                            \
          (Mask) = 0x03;                        \
        }                                       \
      else if (((Char) & 0xfe) == 0xfc)         \
        {                                       \
          (Len) = 6;                            \
          (Mask) = 0x01;                        \
        }                                       \
      else                                      \
        (Len) = -1;                             \
    }                                           \
  while (0)
186
187
#define UTF8_LENGTH(Char)     \
188
19.7M
  ((Char) < 0x80 ? 1 :        \
189
19.7M
   ((Char) < 0x800 ? 2 :      \
190
19.1M
    ((Char) < 0x10000 ? 3 :      \
191
300k
     ((Char) < 0x200000 ? 4 :      \
192
12.6k
      ((Char) < 0x4000000 ? 5 : 6)))))
193
194
/* Decode one UTF-8 sequence of Len bytes starting at Chars into Result.
   Mask selects the payload bits of the lead byte; Count is a scratch
   loop variable.  Each following byte must have the form 10xxxxxx; on
   the first byte that does not, Result is set to -1 and decoding stops.
   The body is one do { } while (0) statement, so the initial assignment
   and the loop stay together when the macro is used as the unbraced
   body of an if.  The `break' below leaves the inner for loop only. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                       \
  do                                                                    \
    {                                                                   \
      (Result) = (Chars)[0] & (Mask);                                   \
      for ((Count) = 1; (Count) < (Len); ++(Count))                     \
        {                                                               \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                        \
            {                                                           \
              (Result) = -1;                                            \
              break;                                                    \
            }                                                           \
          (Result) <<= 6;                                               \
          (Result) |= ((Chars)[(Count)] & 0x3f);                        \
        }                                                               \
    }                                                                   \
  while (0)
206
207
static const gchar utf8_skip_data[256] = {
208
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
209
  1, 1, 1, 1, 1, 1, 1,
210
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
211
  1, 1, 1, 1, 1, 1, 1,
212
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213
  1, 1, 1, 1, 1, 1, 1,
214
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
215
  1, 1, 1, 1, 1, 1, 1,
216
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217
  1, 1, 1, 1, 1, 1, 1,
218
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
219
  1, 1, 1, 1, 1, 1, 1,
220
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
221
  2, 2, 2, 2, 2, 2, 2,
222
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
223
  5, 5, 5, 6, 6, 1, 1
224
};
225
226
static const gchar *const g_utf8_skip = utf8_skip_data;
227
228
/*
229
 * g_utf8_strlen:
230
 * @p: pointer to the start of a UTF-8 encoded string
231
 * @max: the maximum number of bytes to examine. If @max
232
 *       is less than 0, then the string is assumed to be
233
 *       nul-terminated. If @max is 0, @p will not be examined and
234
 *       may be %NULL.
235
 *
236
 * Computes the length of the string in characters, not including
237
 * the terminating nul character.
238
 *
239
 * Return value: the length of the string in characters
240
 **/
241
static glong
242
g_utf8_strlen (const gchar * p)
243
841k
{
244
841k
  glong len = 0;
245
246
841k
  g_return_val_if_fail (p != NULL, 0);
247
248
3.39M
  while (*p)
249
2.55M
    {
250
2.55M
      p = g_utf8_next_char (p);
251
2.55M
      ++len;
252
2.55M
    }
253
254
841k
  return len;
255
841k
}
256
257
/*
258
 * g_utf8_get_char:
259
 * @p: a pointer to Unicode character encoded as UTF-8
260
 *
261
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
262
 * If @p does not point to a valid UTF-8 encoded character, results are
263
 * undefined. If you are not sure that the bytes are complete
264
 * valid Unicode characters, you should use g_utf8_get_char_validated()
265
 * instead.
266
 *
267
 * Return value: the resulting character
268
 **/
269
static gunichar
270
g_utf8_get_char (const gchar * p)
271
15.9M
{
272
15.9M
  int i, mask = 0, len;
273
15.9M
  gunichar result;
274
15.9M
  unsigned char c = (unsigned char) *p;
275
276
15.9M
  UTF8_COMPUTE (c, mask, len);
277
15.9M
  if (len == -1)
278
0
    return (gunichar) - 1;
279
15.9M
  UTF8_GET (result, p, i, mask, len);
280
281
15.9M
  return result;
282
15.9M
}
283
284
/*
285
 * g_unichar_to_utf8:
286
 * @c: a Unicode character code
287
 * @outbuf: output buffer, must have at least 6 bytes of space.
288
 *       If %NULL, the length will be computed and returned
289
 *       and nothing will be written to @outbuf.
290
 *
291
 * Converts a single character to UTF-8.
292
 *
293
 * Return value: number of bytes written
294
 **/
295
static int
296
g_unichar_to_utf8 (gunichar c, gchar * outbuf)
297
19.7M
{
298
  /* If this gets modified, also update the copy in g_string_insert_unichar() */
299
19.7M
  guint len = 0;
300
19.7M
  int first;
301
19.7M
  int i;
302
303
19.7M
  if (c < 0x80)
304
638k
    {
305
638k
      first = 0;
306
638k
      len = 1;
307
638k
    }
308
19.1M
  else if (c < 0x800)
309
18.8M
    {
310
18.8M
      first = 0xc0;
311
18.8M
      len = 2;
312
18.8M
    }
313
300k
  else if (c < 0x10000)
314
287k
    {
315
287k
      first = 0xe0;
316
287k
      len = 3;
317
287k
    }
318
12.6k
  else if (c < 0x200000)
319
12.6k
    {
320
12.6k
      first = 0xf0;
321
12.6k
      len = 4;
322
12.6k
    }
323
0
  else if (c < 0x4000000)
324
0
    {
325
0
      first = 0xf8;
326
0
      len = 5;
327
0
    }
328
0
  else
329
0
    {
330
0
      first = 0xfc;
331
0
      len = 6;
332
0
    }
333
334
19.7M
  if (outbuf)
335
19.7M
    {
336
39.2M
      for (i = len - 1; i > 0; --i)
337
19.4M
  {
338
19.4M
    outbuf[i] = (c & 0x3f) | 0x80;
339
19.4M
    c >>= 6;
340
19.4M
  }
341
19.7M
      outbuf[0] = c | first;
342
19.7M
    }
343
344
19.7M
  return len;
345
19.7M
}
346
347
/*
348
 * g_utf8_to_ucs4_fast:
349
 * @str: a UTF-8 encoded string
350
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
351
 *       then the string is nul-terminated.
352
 * @items_written: location to store the number of characters in the
353
 *                 result, or %NULL.
354
 *
355
 * Convert a string from UTF-8 to a 32-bit fixed width
356
 * representation as UCS-4, assuming valid UTF-8 input.
357
 * This function is roughly twice as fast as g_utf8_to_ucs4()
358
 * but does no error checking on the input. A trailing 0 character
359
 * will be added to the string after the converted text.
360
 *
361
 * Return value: a pointer to a newly allocated UCS-4 string.
362
 *               This value must be freed with g_free().
363
 **/
364
static gunichar *
365
g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
366
18.6M
{
367
18.6M
  gunichar *result;
368
18.6M
  gsize n_chars, i;
369
18.6M
  const gchar *p;
370
371
18.6M
  g_return_val_if_fail (str != NULL, NULL);
372
373
18.6M
  p = str;
374
18.6M
  n_chars = 0;
375
18.6M
  if (len < 0)
376
18.6M
    {
377
38.4M
      while (*p)
378
19.8M
  {
379
19.8M
    p = g_utf8_next_char (p);
380
19.8M
    ++n_chars;
381
19.8M
  }
382
18.6M
    }
383
0
  else
384
0
    {
385
0
      while (p < str + len && *p)
386
0
  {
387
0
    p = g_utf8_next_char (p);
388
0
    ++n_chars;
389
0
  }
390
0
    }
391
392
18.6M
  result = g_malloc (sizeof (gunichar) * (n_chars + 1));
393
18.6M
  if (!result)
394
0
    return NULL;
395
396
18.6M
  p = str;
397
38.4M
  for (i = 0; i < n_chars; i++)
398
19.8M
    {
399
19.8M
      gunichar wc = (guchar) * p++;
400
401
19.8M
      if (wc < 0x80)
402
683k
  {
403
683k
    result[i] = wc;
404
683k
  }
405
19.1M
      else
406
19.1M
  {
407
19.1M
    gunichar mask = 0x40;
408
409
19.1M
    if (G_UNLIKELY ((wc & mask) == 0))
410
0
      {
411
        /* It's an out-of-sequence 10xxxxxxx byte.
412
         * Rather than making an ugly hash of this and the next byte
413
         * and overrunning the buffer, it's more useful to treat it
414
         * with a replacement character */
415
0
        result[i] = 0xfffd;
416
0
        continue;
417
0
      }
418
419
19.1M
    do
420
19.4M
      {
421
19.4M
        wc <<= 6;
422
19.4M
        wc |= (guchar) (*p++) & 0x3f;
423
19.4M
        mask <<= 5;
424
19.4M
      }
425
19.4M
    while ((wc & mask) != 0);
426
427
19.1M
    wc &= mask - 1;
428
429
19.1M
    result[i] = wc;
430
19.1M
  }
431
19.8M
    }
432
18.6M
  result[i] = 0;
433
434
18.6M
  if (items_written)
435
12.5M
    *items_written = i;
436
437
18.6M
  return result;
438
18.6M
}
439
440
/*
441
 * g_ucs4_to_utf8:
442
 * @str: a UCS-4 encoded string
443
 * @len: the maximum length (number of characters) of @str to use.
444
 *       If @len < 0, then the string is nul-terminated.
445
 * @items_read: location to store number of characters read, or %NULL.
446
 * @items_written: location to store number of bytes written or %NULL.
447
 *                 The value here stored does not include the trailing 0
448
 *                 byte.
449
 * @error: location to store the error occurring, or %NULL to ignore
450
 *         errors. Any of the errors in #GConvertError other than
451
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
452
 *
453
 * Convert a string from a 32-bit fixed width representation as UCS-4.
454
 * to UTF-8. The result will be terminated with a 0 byte.
455
 *
456
 * Return value: a pointer to a newly allocated UTF-8 string.
457
 *               This value must be freed with g_free(). If an
458
 *               error occurs, %NULL will be returned and
459
 *               @error set. In that case, @items_read will be
460
 *               set to the position of the first invalid input
461
 *               character.
462
 **/
463
static gchar *
464
g_ucs4_to_utf8 (const gunichar * str,
465
    glong len, glong * items_read, glong * items_written)
466
18.6M
{
467
18.6M
  gint result_length;
468
18.6M
  gchar *result = NULL;
469
18.6M
  gchar *p;
470
18.6M
  gint i;
471
472
18.6M
  result_length = 0;
473
38.4M
  for (i = 0; len < 0 || i < len; i++)
474
19.7M
    {
475
19.7M
      if (!str[i])
476
0
  break;
477
478
19.7M
      if (str[i] >= 0x80000000)
479
0
  goto err_out;
480
481
19.7M
      result_length += UTF8_LENGTH (str[i]);
482
19.7M
    }
483
484
18.6M
  result = g_malloc (result_length + 1);
485
18.6M
  if (!result)
486
0
    return NULL;
487
18.6M
  p = result;
488
489
18.6M
  i = 0;
490
38.4M
  while (p < result + result_length)
491
19.7M
    p += g_unichar_to_utf8 (str[i++], p);
492
493
18.6M
  *p = '\0';
494
495
18.6M
  if (items_written)
496
0
    *items_written = p - result;
497
498
18.6M
err_out:
499
18.6M
  if (items_read)
500
0
    *items_read = i;
501
502
18.6M
  return result;
503
18.6M
}
504
505
/* Code from GLIB gunidecomp.c starts here. */
506
507
/* decomp.c - Character decomposition.
508
 *
509
 *  Copyright (C) 1999, 2000 Tom Tromey
510
 *  Copyright 2000 Red Hat, Inc.
511
 *
512
 * The Gnome Library is free software; you can redistribute it and/or
513
 * modify it under the terms of the GNU Lesser General Public License as
514
 * published by the Free Software Foundation; either version 2 of the
515
 * License, or (at your option) any later version.
516
 *
517
 * The Gnome Library is distributed in the hope that it will be useful,
518
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
519
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
520
 * Lesser General Public License for more details.
521
 *
522
 * You should have received a copy of the GNU Lesser General Public
523
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
524
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
525
 *   Boston, MA 02111-1307, USA.
526
 */
527
528
#include "gunidecomp.h"
529
#include "gunicomp.h"
530
531
#define CC_PART1(Page, Char)            \
532
31.8M
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
533
31.8M
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX)  \
534
31.8M
   : (cclass_data[combining_class_table_part1[Page]][Char]))
535
536
#define CC_PART2(Page, Char)            \
537
8.22k
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
538
8.22k
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
539
8.22k
   : (cclass_data[combining_class_table_part2[Page]][Char]))
540
541
#define COMBINING_CLASS(Char)         \
542
31.8M
  (((Char) <= G_UNICODE_LAST_CHAR_PART1)     \
543
31.8M
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff)     \
544
31.8M
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
545
30.0k
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
546
30.0k
      : 0))
547
548
/* constants for hangul syllable [de]composition */
549
33.1M
#define SBase 0xAC00
550
6.90M
#define LBase 0x1100
551
2.11M
#define VBase 0x1161
552
2.10M
#define TBase 0x11A7
553
670k
#define LCount 19
554
91.6k
#define VCount 21
555
95.9k
#define TCount 28
556
88.3k
#define NCount (VCount * TCount)
557
85.9k
#define SCount (LCount * NCount)
558
559
/*
560
 * g_unicode_canonical_ordering:
561
 * @string: a UCS-4 encoded string.
562
 * @len: the maximum length of @string to use.
563
 *
564
 * Computes the canonical ordering of a string in-place.
565
 * This rearranges decomposed characters in the string
566
 * according to their combining classes.  See the Unicode
567
 * manual for more information.
568
 **/
569
static void
570
g_unicode_canonical_ordering (gunichar * string, gsize len)
571
12.9M
{
572
12.9M
  gsize i;
573
12.9M
  int swap = 1;
574
575
25.9M
  while (swap)
576
12.9M
    {
577
12.9M
      int last;
578
12.9M
      swap = 0;
579
12.9M
      last = COMBINING_CLASS (string[0]);
580
16.7M
      for (i = 0; i < len - 1; ++i)
581
3.83M
  {
582
3.83M
    int next = COMBINING_CLASS (string[i + 1]);
583
3.83M
    if (next != 0 && last > next)
584
2.63k
      {
585
2.63k
        gsize j;
586
        /* Percolate item leftward through string.  */
587
11.0k
        for (j = i + 1; j > 0; --j)
588
10.8k
    {
589
10.8k
      gunichar t;
590
10.8k
      if (COMBINING_CLASS (string[j - 1]) <= next)
591
2.42k
        break;
592
8.42k
      t = string[j];
593
8.42k
      string[j] = string[j - 1];
594
8.42k
      string[j - 1] = t;
595
8.42k
      swap = 1;
596
8.42k
    }
597
        /* We're re-entering the loop looking at the old
598
           character again.  */
599
2.63k
        next = last;
600
2.63k
      }
601
3.83M
    last = next;
602
3.83M
  }
603
12.9M
    }
604
12.9M
}
605
606
/* http://www.unicode.org/unicode/reports/tr15/#Hangul
607
 * r should be null or have sufficient space. Calling with r == NULL will
608
 * only calculate the result_len; however, a buffer with space for three
609
 * characters will always be big enough. */
610
static void
611
decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
612
2.31k
{
613
2.31k
  gint SIndex = s - SBase;
614
2.31k
  gint TIndex = SIndex % TCount;
615
616
2.31k
  if (r)
617
1.15k
    {
618
1.15k
      r[0] = LBase + SIndex / NCount;
619
1.15k
      r[1] = VBase + (SIndex % NCount) / TCount;
620
1.15k
    }
621
622
2.31k
  if (TIndex)
623
1.80k
    {
624
1.80k
      if (r)
625
901
  r[2] = TBase + TIndex;
626
1.80k
      *result_len = 3;
627
1.80k
    }
628
516
  else
629
516
    *result_len = 2;
630
2.31k
}
631
632
/* returns a pointer to a null-terminated UTF-8 string */
633
static const gchar *
634
find_decomposition (gunichar ch, gboolean compat)
635
13.3M
{
636
13.3M
  int start = 0;
637
13.3M
  int end = G_N_ELEMENTS (decomp_table);
638
639
13.3M
  if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
640
12.9M
    {
641
168M
      while (TRUE)
642
168M
  {
643
168M
    int half = (start + end) / 2;
644
168M
    if (ch == decomp_table[half].ch)
645
1.68M
      {
646
1.68M
        int offset;
647
648
1.68M
        if (compat)
649
1.68M
    {
650
1.68M
      offset = decomp_table[half].compat_offset;
651
1.68M
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
652
1.14M
        offset = decomp_table[half].canon_offset;
653
1.68M
    }
654
0
        else
655
0
    {
656
0
      offset = decomp_table[half].canon_offset;
657
0
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
658
0
        return NULL;
659
0
    }
660
661
1.68M
        return &(decomp_expansion_string[offset]);
662
1.68M
      }
663
167M
    else if (half == start)
664
11.3M
      break;
665
155M
    else if (ch > decomp_table[half].ch)
666
49.6M
      start = half;
667
106M
    else
668
106M
      end = half;
669
168M
  }
670
12.9M
    }
671
672
11.6M
  return NULL;
673
13.3M
}
674
675
/* L,V => LV and LV,T => LVT  */
676
static gboolean
677
combine_hangul (gunichar a, gunichar b, gunichar * result)
678
2.10M
{
679
2.10M
  if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
680
1.54k
    {
681
1.54k
      gint LIndex = a - LBase;
682
1.54k
      gint VIndex = b - VBase;
683
684
1.54k
      *result = SBase + (LIndex * VCount + VIndex) * TCount;
685
1.54k
      return TRUE;
686
1.54k
    }
687
688
2.10M
  if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
689
1.09k
    {
690
1.09k
      gint SIndex = a - SBase;
691
692
1.09k
      if ((SIndex % TCount) == 0)
693
901
  {
694
901
    gint TIndex = b - TBase;
695
696
901
    *result = a + TIndex;
697
901
    return TRUE;
698
901
  }
699
1.09k
    }
700
701
2.10M
  return FALSE;
702
2.10M
}
703
704
#define CI(Page, Char)          \
705
3.75M
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
706
3.75M
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX)  \
707
3.75M
   : (compose_data[compose_table[Page]][Char]))
708
709
/* Look up the compose index of Char: 0 when the page of Char
   (Char >> 8) lies beyond COMPOSE_TABLE_LAST, otherwise the CI()
   lookup for that page and the low byte.  Every use of Char is
   parenthesized so that an argument containing an operator of lower
   precedence than `>>' (for example a | b) is shifted as a whole.
   Char is evaluated more than once. */
#define COMPOSE_INDEX(Char)                                             \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
711
712
static gboolean
713
combine (gunichar a, gunichar b, gunichar * result)
714
2.10M
{
715
2.10M
  gushort index_a, index_b;
716
717
2.10M
  if (combine_hangul (a, b, result))
718
2.45k
    return TRUE;
719
720
2.10M
  index_a = COMPOSE_INDEX (a);
721
722
2.10M
  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
723
252k
    {
724
252k
      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
725
17.8k
  {
726
17.8k
    *result =
727
17.8k
      compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
728
17.8k
    return TRUE;
729
17.8k
  }
730
234k
      else
731
234k
  return FALSE;
732
252k
    }
733
734
1.84M
  index_b = COMPOSE_INDEX (b);
735
736
1.84M
  if (index_b >= COMPOSE_SECOND_SINGLE_START)
737
1.18k
    {
738
1.18k
      if (a ==
739
1.18k
    compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
740
741
  {
741
741
    *result =
742
741
      compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
743
741
    return TRUE;
744
741
  }
745
443
      else
746
443
  return FALSE;
747
1.18k
    }
748
749
1.84M
  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
750
1.84M
      && index_b >= COMPOSE_SECOND_START
751
1.84M
      && index_b < COMPOSE_SECOND_SINGLE_START)
752
561k
    {
753
561k
      gunichar res =
754
561k
  compose_array[index_a - COMPOSE_FIRST_START][index_b -
755
561k
                 COMPOSE_SECOND_START];
756
757
561k
      if (res)
758
560k
  {
759
560k
    *result = res;
760
560k
    return TRUE;
761
560k
  }
762
561k
    }
763
764
1.28M
  return FALSE;
765
1.84M
}
766
767
static gunichar *
768
_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
769
6.28M
{
770
6.28M
  gsize n_wc;
771
6.28M
  gunichar *wc_buffer;
772
6.28M
  const char *p;
773
6.28M
  gsize last_start;
774
6.28M
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
775
6.28M
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
776
777
6.28M
  n_wc = 0;
778
6.28M
  p = str;
779
12.9M
  while ((max_len < 0 || p < str + max_len) && *p)
780
6.68M
    {
781
6.68M
      const gchar *decomp;
782
6.68M
      gunichar wc = g_utf8_get_char (p);
783
784
6.68M
      if (wc >= SBase && wc < SBase + SCount)
785
1.15k
  {
786
1.15k
    gsize result_len;
787
1.15k
    decompose_hangul (wc, NULL, &result_len);
788
1.15k
    n_wc += result_len;
789
1.15k
  }
790
6.68M
      else
791
6.68M
  {
792
6.68M
    decomp = find_decomposition (wc, do_compat);
793
794
6.68M
    if (decomp)
795
841k
      n_wc += g_utf8_strlen (decomp);
796
5.84M
    else
797
5.84M
      n_wc++;
798
6.68M
  }
799
800
6.68M
      p = g_utf8_next_char (p);
801
6.68M
    }
802
803
6.28M
  wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
804
6.28M
  if (!wc_buffer)
805
0
    return NULL;
806
807
6.28M
  last_start = 0;
808
6.28M
  n_wc = 0;
809
6.28M
  p = str;
810
12.9M
  while ((max_len < 0 || p < str + max_len) && *p)
811
6.68M
    {
812
6.68M
      gunichar wc = g_utf8_get_char (p);
813
6.68M
      const gchar *decomp;
814
6.68M
      int cc;
815
6.68M
      gsize old_n_wc = n_wc;
816
817
6.68M
      if (wc >= SBase && wc < SBase + SCount)
818
1.15k
  {
819
1.15k
    gsize result_len;
820
1.15k
    decompose_hangul (wc, wc_buffer + n_wc, &result_len);
821
1.15k
    n_wc += result_len;
822
1.15k
  }
823
6.68M
      else
824
6.68M
  {
825
6.68M
    decomp = find_decomposition (wc, do_compat);
826
827
6.68M
    if (decomp)
828
841k
      {
829
841k
        const char *pd;
830
3.39M
        for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
831
2.55M
    wc_buffer[n_wc++] = g_utf8_get_char (pd);
832
841k
      }
833
5.84M
    else
834
5.84M
      wc_buffer[n_wc++] = wc;
835
6.68M
  }
836
837
6.68M
      if (n_wc > 0)
838
6.68M
  {
839
6.68M
    cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
840
841
6.68M
    if (cc == 0)
842
6.67M
      {
843
6.67M
        g_unicode_canonical_ordering (wc_buffer + last_start,
844
6.67M
              n_wc - last_start);
845
6.67M
        last_start = old_n_wc;
846
6.67M
      }
847
6.68M
  }
848
849
6.68M
      p = g_utf8_next_char (p);
850
6.68M
    }
851
852
6.28M
  if (n_wc > 0)
853
6.28M
    {
854
6.28M
      g_unicode_canonical_ordering (wc_buffer + last_start,
855
6.28M
            n_wc - last_start);
856
      /* dead assignment: last_start = n_wc; */
857
6.28M
    }
858
859
6.28M
  wc_buffer[n_wc] = 0;
860
861
  /* All decomposed and reordered */
862
863
6.28M
  if (do_compose && n_wc > 0)
864
6.28M
    {
865
6.28M
      gsize i, j;
866
6.28M
      int last_cc = 0;
867
6.28M
      last_start = 0;
868
869
14.6M
      for (i = 0; i < n_wc; i++)
870
8.39M
  {
871
8.39M
    int cc = COMBINING_CLASS (wc_buffer[i]);
872
873
8.39M
    if (i > 0 &&
874
8.39M
        (last_cc == 0 || last_cc != cc) &&
875
8.39M
        combine (wc_buffer[last_start], wc_buffer[i],
876
2.10M
           &wc_buffer[last_start]))
877
581k
      {
878
1.03M
        for (j = i + 1; j < n_wc; j++)
879
448k
    wc_buffer[j - 1] = wc_buffer[j];
880
581k
        n_wc--;
881
581k
        i--;
882
883
581k
        if (i == last_start)
884
581k
    last_cc = 0;
885
622
        else
886
622
    last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
887
888
581k
        continue;
889
581k
      }
890
891
7.81M
    if (cc == 0)
892
7.79M
      last_start = i;
893
894
7.81M
    last_cc = cc;
895
7.81M
  }
896
6.28M
    }
897
898
6.28M
  wc_buffer[n_wc] = 0;
899
900
6.28M
  return wc_buffer;
901
6.28M
}
902
903
/*
904
 * g_utf8_normalize:
905
 * @str: a UTF-8 encoded string.
906
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
907
 * @mode: the type of normalization to perform.
908
 *
909
 * Converts a string into canonical form, standardizing
910
 * such issues as whether a character with an accent
911
 * is represented as a base character and combining
912
 * accent or as a single precomposed character. The
913
 * string has to be valid UTF-8, otherwise %NULL is
914
 * returned. You should generally call g_utf8_normalize()
915
 * before comparing two Unicode strings.
916
 *
917
 * The normalization mode %G_NORMALIZE_DEFAULT only
918
 * standardizes differences that do not affect the
919
 * text content, such as the above-mentioned accent
920
 * representation. %G_NORMALIZE_ALL also standardizes
921
 * the "compatibility" characters in Unicode, such
922
 * as SUPERSCRIPT THREE to the standard forms
923
 * (in this case DIGIT THREE). Formatting information
924
 * may be lost but for most text operations such
925
 * characters should be considered the same.
926
 *
927
 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
928
 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
929
 * but returned a result with composed forms rather
930
 * than a maximally decomposed form. This is often
931
 * useful if you intend to convert the string to
932
 * a legacy encoding or pass it to a system with
933
 * less capable Unicode handling.
934
 *
935
 * Return value: a newly allocated string, that is the
936
 *   normalized form of @str, or %NULL if @str is not
937
 *   valid UTF-8.
938
 **/
939
static gchar *
940
g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
941
0
{
942
0
  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
943
0
  gchar *result = NULL;
944
945
0
  if (result_wc)
946
0
    result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
947
948
0
  g_free (result_wc);
949
950
0
  return result;
951
0
}
952
953
/* Public Libidn API starts here. */
954
955
/**
956
 * stringprep_utf8_to_unichar:
957
 * @p: a pointer to Unicode character encoded as UTF-8
958
 *
959
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
960
 * If @p does not point to a valid UTF-8 encoded character, results are
961
 * undefined.
962
 *
963
 * Return value: the resulting character.
964
 **/
965
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  /* Public entry point; the decoding is done by g_utf8_get_char(). */
  const uint32_t wc = g_utf8_get_char (p);

  return wc;
}
970
971
/**
972
 * stringprep_unichar_to_utf8:
973
 * @c: a ISO10646 character code
974
 * @outbuf: output buffer, must have at least 6 bytes of space.
975
 *       If %NULL, the length will be computed and returned
976
 *       and nothing will be written to @outbuf.
977
 *
978
 * Converts a single character to UTF-8.
979
 *
980
 * Return value: number of bytes written.
981
 **/
982
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  /* Public entry point; the encoding is done by g_unichar_to_utf8(). */
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
987
988
#include <unistr.h>
989
990
/**
991
 * stringprep_utf8_to_ucs4:
992
 * @str: a UTF-8 encoded string
993
 * @len: the maximum length of @str to use. If @len < 0, then
994
 *       the string is nul-terminated.
995
 * @items_written: location to store the number of characters in the
996
 *                 result, or %NULL.
997
 *
998
 * Convert a string from UTF-8 to a 32-bit fixed width representation
999
 * as UCS-4.  The function now performs error checking to verify that
1000
 * the input is valid UTF-8 (before it was documented to not do error
1001
 * checking).
1002
 *
1003
 * Return value: a pointer to a newly allocated UCS-4 string.
1004
 *               This value must be deallocated by the caller.
1005
 **/
1006
uint32_t *
1007
stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
1008
18.6M
{
1009
18.6M
  size_t n;
1010
1011
18.6M
  if (len < 0)
1012
18.6M
    n = strlen (str);
1013
0
  else
1014
0
    n = len;
1015
1016
18.6M
  if (u8_check ((const uint8_t *) str, n))
1017
2.02k
    return NULL;
1018
1019
18.6M
  return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
1020
18.6M
}
1021
1022
/**
1023
 * stringprep_ucs4_to_utf8:
1024
 * @str: a UCS-4 encoded string
1025
 * @len: the maximum length of @str to use. If @len < 0, then
1026
 *       the string is terminated with a 0 character.
1027
 * @items_read: location to store number of characters read read, or %NULL.
1028
 * @items_written: location to store number of bytes written or %NULL.
1029
 *                 The value here stored does not include the trailing 0
1030
 *                 byte.
1031
 *
1032
 * Convert a string from a 32-bit fixed width representation as UCS-4.
1033
 * to UTF-8. The result will be terminated with a 0 byte.
1034
 *
1035
 * Return value: a pointer to a newly allocated UTF-8 string.
1036
 *               This value must be deallocated by the caller.
1037
 *               If an error occurs, %NULL will be returned.
1038
 **/
1039
char *
1040
stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1041
       size_t *items_read, size_t *items_written)
1042
18.6M
{
1043
18.6M
  return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1044
18.6M
       (glong *) items_written);
1045
18.6M
}
1046
1047
/**
1048
 * stringprep_utf8_nfkc_normalize:
1049
 * @str: a UTF-8 encoded string.
1050
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1051
 *
1052
 * Converts a string into canonical form, standardizing
1053
 * such issues as whether a character with an accent
1054
 * is represented as a base character and combining
1055
 * accent or as a single precomposed character.
1056
 *
1057
 * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1058
 * differences that do not affect the text content, such as the
1059
 * above-mentioned accent representation. It standardizes the
1060
 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1061
 * the standard forms (in this case DIGIT THREE). Formatting
1062
 * information may be lost but for most text operations such
1063
 * characters should be considered the same. It returns a result with
1064
 * composed forms rather than a maximally decomposed form.
1065
 *
1066
 * Return value: a newly allocated string, that is the
1067
 *   NFKC normalized form of @str.
1068
 **/
1069
char *
1070
stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1071
0
{
1072
0
  size_t n;
1073
1074
0
  if (len < 0)
1075
0
    n = strlen (str);
1076
0
  else
1077
0
    n = len;
1078
1079
0
  if (u8_check ((const uint8_t *) str, n))
1080
0
    return NULL;
1081
1082
0
  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1083
0
}
1084
1085
#include <stdio.h>
1086
/**
1087
 * stringprep_ucs4_nfkc_normalize:
1088
 * @str: a Unicode string.
1089
 * @len: length of @str array, or -1 if @str is nul-terminated.
1090
 *
1091
 * Converts a UCS4 string into canonical form, see
1092
 * stringprep_utf8_nfkc_normalize() for more information.
1093
 *
1094
 * Return value: a newly allocated Unicode string, that is the NFKC
1095
 *   normalized form of @str.
1096
 **/
1097
uint32_t *
1098
stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
1099
6.28M
{
1100
6.28M
  char *p;
1101
6.28M
  uint32_t *result_wc;
1102
1103
6.28M
  p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1104
6.28M
  if (!p)
1105
0
    return NULL;
1106
1107
6.28M
  result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1108
6.28M
  free (p);
1109
1110
6.28M
  return result_wc;
1111
6.28M
}