Coverage Report

Created: 2023-06-07 07:17

/src/libidn/lib/nfkc.c
Line
Count
Source (jump to first uncovered line)
1
/* nfkc.c --- Unicode normalization utilities.
2
   Copyright (C) 2002-2023 Simon Josefsson
3
4
   This file is part of GNU Libidn.
5
6
   GNU Libidn is free software: you can redistribute it and/or
7
   modify it under the terms of either:
8
9
     * the GNU Lesser General Public License as published by the Free
10
       Software Foundation; either version 3 of the License, or (at
11
       your option) any later version.
12
13
   or
14
15
     * the GNU General Public License as published by the Free
16
       Software Foundation; either version 2 of the License, or (at
17
       your option) any later version.
18
19
   or both in parallel, as here.
20
21
   GNU Libidn is distributed in the hope that it will be useful,
22
   but WITHOUT ANY WARRANTY; without even the implied warranty of
23
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24
   General Public License for more details.
25
26
   You should have received copies of the GNU General Public License and
27
   the GNU Lesser General Public License along with this program.  If
28
   not, see <https://www.gnu.org/licenses/>. */
29
30
#ifdef HAVE_CONFIG_H
31
# include "config.h"
32
#endif
33
34
#include <stdlib.h>
35
#include <string.h>
36
37
#include "stringprep.h"
38
39
/* Hacks to make syncing with GLIB code easier: the GLIB type and
   allocator names used by the imported code below are mapped onto
   plain C equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free

/* Return VAL from the enclosing function unless EXPR holds.  The body
   is wrapped in do/while (0) so that the expansion, together with the
   caller's trailing semicolon, is exactly one statement and therefore
   remains valid inside an unbraced if/else. */
#define g_return_val_if_fail(expr,val)          \
  do                                            \
    {                                           \
      if (!(expr))                              \
        return (val);                           \
    }                                           \
  while (0)
58
59
/* Code from GLIB gmacros.h starts here. */
60
61
/* GLIB - Library of useful routines for C programming
62
 * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
63
 *
64
 * This library is free software; you can redistribute it and/or
65
 * modify it under the terms of the GNU Lesser General Public
66
 * License as published by the Free Software Foundation; either
67
 * version 2 of the License, or (at your option) any later version.
68
 *
69
 * This library is distributed in the hope that it will be useful,
70
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
71
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
72
 * Lesser General Public License for more details.
73
 *
74
 * You should have received a copy of the GNU Lesser General Public
75
 * License along with this library; if not, write to the
76
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
77
 * Boston, MA 02111-1307, USA.
78
 */
79
80
/* Boolean constants, defined only when nothing earlier supplied them. */
#if !defined (FALSE)
# define FALSE (0)
#endif

#if !defined (TRUE)
# define TRUE (!FALSE)
#endif

/* Element count of ARR; ARR must be a real array, not a pointer. */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof *(arr))

/* Evaluates to EXPR unchanged. */
#define G_UNLIKELY(expr) (expr)
91
92
/* Code from GLIB gunicode.h starts here. */
93
94
/* gunicode.h - Unicode manipulation functions
95
 *
96
 *  Copyright (C) 1999, 2000 Tom Tromey
97
 *  Copyright 2000, 2005 Red Hat, Inc.
98
 *
99
 * The Gnome Library is free software; you can redistribute it and/or
100
 * modify it under the terms of the GNU Lesser General Public License as
101
 * published by the Free Software Foundation; either version 2 of the
102
 * License, or (at your option) any later version.
103
 *
104
 * The Gnome Library is distributed in the hope that it will be useful,
105
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
106
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
107
 * Lesser General Public License for more details.
108
 *
109
 * You should have received a copy of the GNU Lesser General Public
110
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
111
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
112
 *   Boston, MA 02111-1307, USA.
113
 */
114
115
/* Normalization forms.  Every GLIB-style name is followed by the
   Unicode-style alias that has the same value. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
127
128
0
/* Pointer to the character after the one at P: P advanced by the
   g_utf8_skip entry selected by the byte P points at. */
#define g_utf8_next_char(p) (&(p)[g_utf8_skip[*(const guchar *) (p)]])
129
130
/* Code from GLIB gutf8.c starts here. */
131
132
/* gutf8.c - Operations on UTF-8 strings.
133
 *
134
 * Copyright (C) 1999 Tom Tromey
135
 * Copyright (C) 2000 Red Hat, Inc.
136
 *
137
 * This library is free software; you can redistribute it and/or
138
 * modify it under the terms of the GNU Lesser General Public
139
 * License as published by the Free Software Foundation; either
140
 * version 2 of the License, or (at your option) any later version.
141
 *
142
 * This library is distributed in the hope that it will be useful,
143
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
144
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
145
 * Lesser General Public License for more details.
146
 *
147
 * You should have received a copy of the GNU Lesser General Public
148
 * License along with this library; if not, write to the
149
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
150
 * Boston, MA 02111-1307, USA.
151
 */
152
153
/* Classify the UTF-8 lead byte CHAR.  LEN receives the total sequence
   length (1..6), or -1 when CHAR cannot start a sequence; for a valid
   lead byte MASK receives the mask that selects its payload bits.
   MASK is left untouched when LEN is set to -1.

   Every argument is parenthesized and the whole expansion is a single
   do/while (0) statement, so expression arguments and unbraced
   if/else callers behave as expected.  CHAR is evaluated several
   times; do not pass an expression with side effects. */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  do                                            \
    {                                           \
      if ((Char) < 128)                         \
        {                                       \
          (Len) = 1;                            \
          (Mask) = 0x7f;                        \
        }                                       \
      else if (((Char) & 0xe0) == 0xc0)         \
        {                                       \
          (Len) = 2;                            \
          (Mask) = 0x1f;                        \
        }                                       \
      else if (((Char) & 0xf0) == 0xe0)         \
        {                                       \
          (Len) = 3;                            \
          (Mask) = 0x0f;                        \
        }                                       \
      else if (((Char) & 0xf8) == 0xf0)         \
        {                                       \
          (Len) = 4;                            \
          (Mask) = 0x07;                        \
        }                                       \
      else if (((Char) & 0xfc) == 0xf8)         \
        {                                       \
          (Len) = 5;                            \
          (Mask) = 0x03;                        \
        }                                       \
      else if (((Char) & 0xfe) == 0xfc)         \
        {                                       \
          (Len) = 6;                            \
          (Mask) = 0x01;                        \
        }                                       \
      else                                      \
        (Len) = -1;                             \
    }                                           \
  while (0)
186
187
/* Number of bytes (1..6) that the code point CHAR occupies when it is
   written as UTF-8.  CHAR is evaluated more than once. */
#define UTF8_LENGTH(Char)                       \
  (((Char) >= 0x4000000) ? 6                    \
   : (((Char) >= 0x200000) ? 5                  \
      : (((Char) >= 0x10000) ? 4                \
         : (((Char) >= 0x800) ? 3               \
            : (((Char) >= 0x80) ? 2 : 1)))))
193
194
/* Decode the LEN-byte UTF-8 sequence at CHARS into RESULT.  MASK
   selects the payload bits of the lead byte and COUNT is the loop
   counter (an lvalue owned by the caller).  If a byte that should be
   a continuation byte is not of the form 10xxxxxx, RESULT is set to
   -1 and decoding stops with COUNT at the offending index.

   The expansion is one do/while (0) statement: previously it was an
   assignment followed by a separate for loop, so an unbraced `if'
   guarded only the assignment. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                       \
  do                                                                    \
    {                                                                   \
      (Result) = (Chars)[0] & (Mask);                                   \
      for ((Count) = 1; (Count) < (Len); ++(Count))                     \
        {                                                               \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                        \
            {                                                           \
              (Result) = -1;                                            \
              break;                                                    \
            }                                                           \
          (Result) <<= 6;                                               \
          (Result) |= ((Chars)[(Count)] & 0x3f);                        \
        }                                                               \
    }                                                                   \
  while (0)
206
207
/* Indexed by the value of a byte: how far g_utf8_next_char advances
   when that byte is the first byte of a character.  Sixteen entries
   per row; the row comment gives the byte range. */
static const gchar utf8_skip_data[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* The name the imported GLIB code uses for the table above. */
static const gchar *const g_utf8_skip = utf8_skip_data;
227
228
/*
229
 * g_utf8_strlen:
230
 * @p: pointer to the start of a UTF-8 encoded string
231
 * @max: the maximum number of bytes to examine. If @max
232
 *       is less than 0, then the string is assumed to be
233
 *       nul-terminated. If @max is 0, @p will not be examined and
234
 *       may be %NULL.
235
 *
236
 * Computes the length of the string in characters, not including
237
 * the terminating nul character.
238
 *
239
 * Return value: the length of the string in characters
240
 **/
241
static glong
242
g_utf8_strlen (const gchar * p)
243
0
{
244
0
  glong len = 0;
245
246
0
  g_return_val_if_fail (p != NULL, 0);
247
248
0
  while (*p)
249
0
    {
250
0
      p = g_utf8_next_char (p);
251
0
      ++len;
252
0
    }
253
254
0
  return len;
255
0
}
256
257
/*
258
 * g_utf8_get_char:
259
 * @p: a pointer to Unicode character encoded as UTF-8
260
 *
261
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
262
 * If @p does not point to a valid UTF-8 encoded character, results are
263
 * undefined. If you are not sure that the bytes are complete
264
 * valid Unicode characters, you should use g_utf8_get_char_validated()
265
 * instead.
266
 *
267
 * Return value: the resulting character
268
 **/
269
static gunichar
270
g_utf8_get_char (const gchar * p)
271
0
{
272
0
  int i, mask = 0, len;
273
0
  gunichar result;
274
0
  unsigned char c = (unsigned char) *p;
275
276
0
  UTF8_COMPUTE (c, mask, len);
277
0
  if (len == -1)
278
0
    return (gunichar) - 1;
279
0
  UTF8_GET (result, p, i, mask, len);
280
281
0
  return result;
282
0
}
283
284
/*
285
 * g_unichar_to_utf8:
286
 * @c: a Unicode character code
287
 * @outbuf: output buffer, must have at least 6 bytes of space.
288
 *       If %NULL, the length will be computed and returned
289
 *       and nothing will be written to @outbuf.
290
 *
291
 * Converts a single character to UTF-8.
292
 *
293
 * Return value: number of bytes written
294
 **/
295
static int
296
g_unichar_to_utf8 (gunichar c, gchar * outbuf)
297
0
{
298
  /* If this gets modified, also update the copy in g_string_insert_unichar() */
299
0
  guint len = 0;
300
0
  int first;
301
0
  int i;
302
303
0
  if (c < 0x80)
304
0
    {
305
0
      first = 0;
306
0
      len = 1;
307
0
    }
308
0
  else if (c < 0x800)
309
0
    {
310
0
      first = 0xc0;
311
0
      len = 2;
312
0
    }
313
0
  else if (c < 0x10000)
314
0
    {
315
0
      first = 0xe0;
316
0
      len = 3;
317
0
    }
318
0
  else if (c < 0x200000)
319
0
    {
320
0
      first = 0xf0;
321
0
      len = 4;
322
0
    }
323
0
  else if (c < 0x4000000)
324
0
    {
325
0
      first = 0xf8;
326
0
      len = 5;
327
0
    }
328
0
  else
329
0
    {
330
0
      first = 0xfc;
331
0
      len = 6;
332
0
    }
333
334
0
  if (outbuf)
335
0
    {
336
0
      for (i = len - 1; i > 0; --i)
337
0
  {
338
0
    outbuf[i] = (c & 0x3f) | 0x80;
339
0
    c >>= 6;
340
0
  }
341
0
      outbuf[0] = c | first;
342
0
    }
343
344
0
  return len;
345
0
}
346
347
/*
348
 * g_utf8_to_ucs4_fast:
349
 * @str: a UTF-8 encoded string
350
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
351
 *       then the string is nul-terminated.
352
 * @items_written: location to store the number of characters in the
353
 *                 result, or %NULL.
354
 *
355
 * Convert a string from UTF-8 to a 32-bit fixed width
356
 * representation as UCS-4, assuming valid UTF-8 input.
357
 * This function is roughly twice as fast as g_utf8_to_ucs4()
358
 * but does no error checking on the input. A trailing 0 character
359
 * will be added to the string after the converted text.
360
 *
361
 * Return value: a pointer to a newly allocated UCS-4 string.
362
 *               This value must be freed with g_free().
363
 **/
364
static gunichar *
365
g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
366
0
{
367
0
  gunichar *result;
368
0
  gsize n_chars, i;
369
0
  const gchar *p;
370
371
0
  g_return_val_if_fail (str != NULL, NULL);
372
373
0
  p = str;
374
0
  n_chars = 0;
375
0
  if (len < 0)
376
0
    {
377
0
      while (*p)
378
0
  {
379
0
    p = g_utf8_next_char (p);
380
0
    ++n_chars;
381
0
  }
382
0
    }
383
0
  else
384
0
    {
385
0
      while (p < str + len && *p)
386
0
  {
387
0
    p = g_utf8_next_char (p);
388
0
    ++n_chars;
389
0
  }
390
0
    }
391
392
0
  result = g_malloc (sizeof (gunichar) * (n_chars + 1));
393
0
  if (!result)
394
0
    return NULL;
395
396
0
  p = str;
397
0
  for (i = 0; i < n_chars; i++)
398
0
    {
399
0
      gunichar wc = (guchar) * p++;
400
401
0
      if (wc < 0x80)
402
0
  {
403
0
    result[i] = wc;
404
0
  }
405
0
      else
406
0
  {
407
0
    gunichar mask = 0x40;
408
409
0
    if (G_UNLIKELY ((wc & mask) == 0))
410
0
      {
411
        /* It's an out-of-sequence 10xxxxxxx byte.
412
         * Rather than making an ugly hash of this and the next byte
413
         * and overrunning the buffer, it's more useful to treat it
414
         * with a replacement character */
415
0
        result[i] = 0xfffd;
416
0
        continue;
417
0
      }
418
419
0
    do
420
0
      {
421
0
        wc <<= 6;
422
0
        wc |= (guchar) (*p++) & 0x3f;
423
0
        mask <<= 5;
424
0
      }
425
0
    while ((wc & mask) != 0);
426
427
0
    wc &= mask - 1;
428
429
0
    result[i] = wc;
430
0
  }
431
0
    }
432
0
  result[i] = 0;
433
434
0
  if (items_written)
435
0
    *items_written = i;
436
437
0
  return result;
438
0
}
439
440
/*
441
 * g_ucs4_to_utf8:
442
 * @str: a UCS-4 encoded string
443
 * @len: the maximum length (number of characters) of @str to use.
444
 *       If @len < 0, then the string is nul-terminated.
445
 * @items_read: location to store number of characters read, or %NULL.
446
 * @items_written: location to store number of bytes written or %NULL.
447
 *                 The value here stored does not include the trailing 0
448
 *                 byte.
449
 * @error: location to store the error occurring, or %NULL to ignore
450
 *         errors. Any of the errors in #GConvertError other than
451
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
452
 *
453
 * Convert a string from a 32-bit fixed width representation as UCS-4.
454
 * to UTF-8. The result will be terminated with a 0 byte.
455
 *
456
 * Return value: a pointer to a newly allocated UTF-8 string.
457
 *               This value must be freed with g_free(). If an
458
 *               error occurs, %NULL will be returned and
459
 *               @error set. In that case, @items_read will be
460
 *               set to the position of the first invalid input
461
 *               character.
462
 **/
463
static gchar *
464
g_ucs4_to_utf8 (const gunichar * str,
465
    glong len, glong * items_read, glong * items_written)
466
0
{
467
0
  gint result_length;
468
0
  gchar *result = NULL;
469
0
  gchar *p;
470
0
  gint i;
471
472
0
  result_length = 0;
473
0
  for (i = 0; len < 0 || i < len; i++)
474
0
    {
475
0
      if (!str[i])
476
0
  break;
477
478
0
      if (str[i] >= 0x80000000)
479
0
  goto err_out;
480
481
0
      result_length += UTF8_LENGTH (str[i]);
482
0
    }
483
484
0
  result = g_malloc (result_length + 1);
485
0
  if (!result)
486
0
    return NULL;
487
0
  p = result;
488
489
0
  i = 0;
490
0
  while (p < result + result_length)
491
0
    p += g_unichar_to_utf8 (str[i++], p);
492
493
0
  *p = '\0';
494
495
0
  if (items_written)
496
0
    *items_written = p - result;
497
498
0
err_out:
499
0
  if (items_read)
500
0
    *items_read = i;
501
502
0
  return result;
503
0
}
504
505
/* Code from GLIB gunidecomp.c starts here. */
506
507
/* decomp.c - Character decomposition.
508
 *
509
 *  Copyright (C) 1999, 2000 Tom Tromey
510
 *  Copyright 2000 Red Hat, Inc.
511
 *
512
 * The Gnome Library is free software; you can redistribute it and/or
513
 * modify it under the terms of the GNU Lesser General Public License as
514
 * published by the Free Software Foundation; either version 2 of the
515
 * License, or (at your option) any later version.
516
 *
517
 * The Gnome Library is distributed in the hope that it will be useful,
518
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
519
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
520
 * Lesser General Public License for more details.
521
 *
522
 * You should have received a copy of the GNU Lesser General Public
523
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
524
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
525
 *   Boston, MA 02111-1307, USA.
526
 */
527
528
#include "gunidecomp.h"
529
#include "gunicomp.h"
530
531
/* Look up the combining class of the character at offset CHAR within
   256-character page PAGE.  A page entry that is at least
   G_UNICODE_MAX_TABLE_INDEX encodes one class shared by the whole
   page; any other entry selects a row of cclass_data. */
#define CC_PART1(Page, Char)                                            \
  ((combining_class_table_part1[(Page)] < G_UNICODE_MAX_TABLE_INDEX)    \
   ? (cclass_data[combining_class_table_part1[(Page)]][(Char)])         \
   : (combining_class_table_part1[(Page)] - G_UNICODE_MAX_TABLE_INDEX))

/* Same lookup for the second table, whose pages start at 0xe0000. */
#define CC_PART2(Page, Char)                                            \
  ((combining_class_table_part2[(Page)] < G_UNICODE_MAX_TABLE_INDEX)    \
   ? (cclass_data[combining_class_table_part2[(Page)]][(Char)])         \
   : (combining_class_table_part2[(Page)] - G_UNICODE_MAX_TABLE_INDEX))

/* Combining class of CHAR: taken from part 1 up to
   G_UNICODE_LAST_CHAR_PART1, from part 2 for 0xe0000 up to
   G_UNICODE_LAST_CHAR, and 0 for everything else. */
#define COMBINING_CLASS(Char)                                           \
  (((Char) > G_UNICODE_LAST_CHAR_PART1)                                 \
   ? ((((Char) < 0xe0000) || ((Char) > G_UNICODE_LAST_CHAR))            \
      ? 0                                                               \
      : CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff))              \
   : CC_PART1 ((Char) >> 8, (Char) & 0xff))
547
548
/* constants for hangul syllable [de]composition */
enum
{
  SBase = 0xAC00,               /* first precomposed syllable */
  LBase = 0x1100,               /* first leading consonant */
  VBase = 0x1161,               /* first vowel */
  TBase = 0x11A7,               /* one below the first trailing consonant */
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,     /* syllables per leading consonant */
  SCount = LCount * NCount      /* precomposed syllables in total */
};
558
559
/*
560
 * g_unicode_canonical_ordering:
561
 * @string: a UCS-4 encoded string.
562
 * @len: the maximum length of @string to use.
563
 *
564
 * Computes the canonical ordering of a string in-place.
565
 * This rearranges decomposed characters in the string
566
 * according to their combining classes.  See the Unicode
567
 * manual for more information.
568
 **/
569
static void
570
g_unicode_canonical_ordering (gunichar * string, gsize len)
571
0
{
572
0
  gsize i;
573
0
  int swap = 1;
574
575
0
  while (swap)
576
0
    {
577
0
      int last;
578
0
      swap = 0;
579
0
      last = COMBINING_CLASS (string[0]);
580
0
      for (i = 0; i < len - 1; ++i)
581
0
  {
582
0
    int next = COMBINING_CLASS (string[i + 1]);
583
0
    if (next != 0 && last > next)
584
0
      {
585
0
        gsize j;
586
        /* Percolate item leftward through string.  */
587
0
        for (j = i + 1; j > 0; --j)
588
0
    {
589
0
      gunichar t;
590
0
      if (COMBINING_CLASS (string[j - 1]) <= next)
591
0
        break;
592
0
      t = string[j];
593
0
      string[j] = string[j - 1];
594
0
      string[j - 1] = t;
595
0
      swap = 1;
596
0
    }
597
        /* We're re-entering the loop looking at the old
598
           character again.  */
599
0
        next = last;
600
0
      }
601
0
    last = next;
602
0
  }
603
0
    }
604
0
}
605
606
/* http://www.unicode.org/unicode/reports/tr15/#Hangul
607
 * r should be null or have sufficient space. Calling with r == NULL will
608
 * only calculate the result_len; however, a buffer with space for three
609
 * characters will always be big enough. */
610
static void
611
decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
612
0
{
613
0
  gint SIndex = s - SBase;
614
0
  gint TIndex = SIndex % TCount;
615
616
0
  if (r)
617
0
    {
618
0
      r[0] = LBase + SIndex / NCount;
619
0
      r[1] = VBase + (SIndex % NCount) / TCount;
620
0
    }
621
622
0
  if (TIndex)
623
0
    {
624
0
      if (r)
625
0
  r[2] = TBase + TIndex;
626
0
      *result_len = 3;
627
0
    }
628
0
  else
629
0
    *result_len = 2;
630
0
}
631
632
/* returns a pointer to a null-terminated UTF-8 string */
633
static const gchar *
634
find_decomposition (gunichar ch, gboolean compat)
635
0
{
636
0
  int start = 0;
637
0
  int end = G_N_ELEMENTS (decomp_table);
638
639
0
  if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
640
0
    {
641
0
      while (TRUE)
642
0
  {
643
0
    int half = (start + end) / 2;
644
0
    if (ch == decomp_table[half].ch)
645
0
      {
646
0
        int offset;
647
648
0
        if (compat)
649
0
    {
650
0
      offset = decomp_table[half].compat_offset;
651
0
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
652
0
        offset = decomp_table[half].canon_offset;
653
0
    }
654
0
        else
655
0
    {
656
0
      offset = decomp_table[half].canon_offset;
657
0
      if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
658
0
        return NULL;
659
0
    }
660
661
0
        return &(decomp_expansion_string[offset]);
662
0
      }
663
0
    else if (half == start)
664
0
      break;
665
0
    else if (ch > decomp_table[half].ch)
666
0
      start = half;
667
0
    else
668
0
      end = half;
669
0
  }
670
0
    }
671
672
0
  return NULL;
673
0
}
674
675
/* L,V => LV and LV,T => LVT  */
676
static gboolean
677
combine_hangul (gunichar a, gunichar b, gunichar * result)
678
0
{
679
0
  if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
680
0
    {
681
0
      gint LIndex = a - LBase;
682
0
      gint VIndex = b - VBase;
683
684
0
      *result = SBase + (LIndex * VCount + VIndex) * TCount;
685
0
      return TRUE;
686
0
    }
687
688
0
  if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
689
0
    {
690
0
      gint SIndex = a - SBase;
691
692
0
      if ((SIndex % TCount) == 0)
693
0
  {
694
0
    gint TIndex = b - TBase;
695
696
0
    *result = a + TIndex;
697
0
    return TRUE;
698
0
  }
699
0
    }
700
701
0
  return FALSE;
702
0
}
703
704
/* Composition index of the character at offset CHAR within
   256-character page PAGE.  A compose_table entry that is at least
   G_UNICODE_MAX_TABLE_INDEX encodes one index shared by the whole
   page; any other entry selects a row of compose_data. */
#define CI(Page, Char)                                          \
  ((compose_table[(Page)] >= G_UNICODE_MAX_TABLE_INDEX)         \
   ? (compose_table[(Page)] - G_UNICODE_MAX_TABLE_INDEX)        \
   : (compose_data[compose_table[(Page)]][(Char)]))

/* Composition index of CHAR, or 0 when CHAR lies beyond the last
   page of compose_table.  Every use of the argument is parenthesized
   so that an expression argument such as `a | b' is shifted as a
   whole.  CHAR is evaluated more than once. */
#define COMPOSE_INDEX(Char)                                     \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST))                       \
   ? 0                                                          \
   : CI ((Char) >> 8, (Char) & 0xff))
711
712
static gboolean
713
combine (gunichar a, gunichar b, gunichar * result)
714
0
{
715
0
  gushort index_a, index_b;
716
717
0
  if (combine_hangul (a, b, result))
718
0
    return TRUE;
719
720
0
  index_a = COMPOSE_INDEX (a);
721
722
0
  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
723
0
    {
724
0
      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
725
0
  {
726
0
    *result =
727
0
      compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
728
0
    return TRUE;
729
0
  }
730
0
      else
731
0
  return FALSE;
732
0
    }
733
734
0
  index_b = COMPOSE_INDEX (b);
735
736
0
  if (index_b >= COMPOSE_SECOND_SINGLE_START)
737
0
    {
738
0
      if (a ==
739
0
    compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
740
0
  {
741
0
    *result =
742
0
      compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
743
0
    return TRUE;
744
0
  }
745
0
      else
746
0
  return FALSE;
747
0
    }
748
749
0
  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
750
0
      && index_b >= COMPOSE_SECOND_START
751
0
      && index_b < COMPOSE_SECOND_SINGLE_START)
752
0
    {
753
0
      gunichar res =
754
0
  compose_array[index_a - COMPOSE_FIRST_START][index_b -
755
0
                 COMPOSE_SECOND_START];
756
757
0
      if (res)
758
0
  {
759
0
    *result = res;
760
0
    return TRUE;
761
0
  }
762
0
    }
763
764
0
  return FALSE;
765
0
}
766
767
static gunichar *
768
_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
769
0
{
770
0
  gsize n_wc;
771
0
  gunichar *wc_buffer;
772
0
  const char *p;
773
0
  gsize last_start;
774
0
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
775
0
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
776
777
0
  n_wc = 0;
778
0
  p = str;
779
0
  while ((max_len < 0 || p < str + max_len) && *p)
780
0
    {
781
0
      const gchar *decomp;
782
0
      gunichar wc = g_utf8_get_char (p);
783
784
0
      if (wc >= SBase && wc < SBase + SCount)
785
0
  {
786
0
    gsize result_len;
787
0
    decompose_hangul (wc, NULL, &result_len);
788
0
    n_wc += result_len;
789
0
  }
790
0
      else
791
0
  {
792
0
    decomp = find_decomposition (wc, do_compat);
793
794
0
    if (decomp)
795
0
      n_wc += g_utf8_strlen (decomp);
796
0
    else
797
0
      n_wc++;
798
0
  }
799
800
0
      p = g_utf8_next_char (p);
801
0
    }
802
803
0
  wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
804
0
  if (!wc_buffer)
805
0
    return NULL;
806
807
0
  last_start = 0;
808
0
  n_wc = 0;
809
0
  p = str;
810
0
  while ((max_len < 0 || p < str + max_len) && *p)
811
0
    {
812
0
      gunichar wc = g_utf8_get_char (p);
813
0
      const gchar *decomp;
814
0
      int cc;
815
0
      gsize old_n_wc = n_wc;
816
817
0
      if (wc >= SBase && wc < SBase + SCount)
818
0
  {
819
0
    gsize result_len;
820
0
    decompose_hangul (wc, wc_buffer + n_wc, &result_len);
821
0
    n_wc += result_len;
822
0
  }
823
0
      else
824
0
  {
825
0
    decomp = find_decomposition (wc, do_compat);
826
827
0
    if (decomp)
828
0
      {
829
0
        const char *pd;
830
0
        for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
831
0
    wc_buffer[n_wc++] = g_utf8_get_char (pd);
832
0
      }
833
0
    else
834
0
      wc_buffer[n_wc++] = wc;
835
0
  }
836
837
0
      if (n_wc > 0)
838
0
  {
839
0
    cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
840
841
0
    if (cc == 0)
842
0
      {
843
0
        g_unicode_canonical_ordering (wc_buffer + last_start,
844
0
              n_wc - last_start);
845
0
        last_start = old_n_wc;
846
0
      }
847
0
  }
848
849
0
      p = g_utf8_next_char (p);
850
0
    }
851
852
0
  if (n_wc > 0)
853
0
    {
854
0
      g_unicode_canonical_ordering (wc_buffer + last_start,
855
0
            n_wc - last_start);
856
      /* dead assignment: last_start = n_wc; */
857
0
    }
858
859
0
  wc_buffer[n_wc] = 0;
860
861
  /* All decomposed and reordered */
862
863
0
  if (do_compose && n_wc > 0)
864
0
    {
865
0
      gsize i, j;
866
0
      int last_cc = 0;
867
0
      last_start = 0;
868
869
0
      for (i = 0; i < n_wc; i++)
870
0
  {
871
0
    int cc = COMBINING_CLASS (wc_buffer[i]);
872
873
0
    if (i > 0 &&
874
0
        (last_cc == 0 || last_cc != cc) &&
875
0
        combine (wc_buffer[last_start], wc_buffer[i],
876
0
           &wc_buffer[last_start]))
877
0
      {
878
0
        for (j = i + 1; j < n_wc; j++)
879
0
    wc_buffer[j - 1] = wc_buffer[j];
880
0
        n_wc--;
881
0
        i--;
882
883
0
        if (i == last_start)
884
0
    last_cc = 0;
885
0
        else
886
0
    last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
887
888
0
        continue;
889
0
      }
890
891
0
    if (cc == 0)
892
0
      last_start = i;
893
894
0
    last_cc = cc;
895
0
  }
896
0
    }
897
898
0
  wc_buffer[n_wc] = 0;
899
900
0
  return wc_buffer;
901
0
}
902
903
/*
904
 * g_utf8_normalize:
905
 * @str: a UTF-8 encoded string.
906
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
907
 * @mode: the type of normalization to perform.
908
 *
909
 * Converts a string into canonical form, standardizing
910
 * such issues as whether a character with an accent
911
 * is represented as a base character and combining
912
 * accent or as a single precomposed character. The
913
 * string has to be valid UTF-8, otherwise %NULL is
914
 * returned. You should generally call g_utf8_normalize()
915
 * before comparing two Unicode strings.
916
 *
917
 * The normalization mode %G_NORMALIZE_DEFAULT only
918
 * standardizes differences that do not affect the
919
 * text content, such as the above-mentioned accent
920
 * representation. %G_NORMALIZE_ALL also standardizes
921
 * the "compatibility" characters in Unicode, such
922
 * as SUPERSCRIPT THREE to the standard forms
923
 * (in this case DIGIT THREE). Formatting information
924
 * may be lost but for most text operations such
925
 * characters should be considered the same.
926
 *
927
 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
928
 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
929
 * but returned a result with composed forms rather
930
 * than a maximally decomposed form. This is often
931
 * useful if you intend to convert the string to
932
 * a legacy encoding or pass it to a system with
933
 * less capable Unicode handling.
934
 *
935
 * Return value: a newly allocated string, that is the
936
 *   normalized form of @str, or %NULL if @str is not
937
 *   valid UTF-8.
938
 **/
939
static gchar *
940
g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
941
0
{
942
0
  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
943
0
  gchar *result = NULL;
944
945
0
  if (result_wc)
946
0
    result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
947
948
0
  g_free (result_wc);
949
950
0
  return result;
951
0
}
952
953
/* Public Libidn API starts here. */
954
955
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * If @p does not point to a valid UTF-8 encoded character, results are
 * undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  /* Public name for the internal single-character decoder. */
  const uint32_t wc = g_utf8_get_char (p);

  return wc;
}
970
971
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  /* Public name for the internal single-character encoder. */
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
987
988
#include <unistr.h>
989
990
/**
991
 * stringprep_utf8_to_ucs4:
992
 * @str: a UTF-8 encoded string
993
 * @len: the maximum length of @str to use. If @len < 0, then
994
 *       the string is nul-terminated.
995
 * @items_written: location to store the number of characters in the
996
 *                 result, or %NULL.
997
 *
998
 * Convert a string from UTF-8 to a 32-bit fixed width representation
999
 * as UCS-4.  The function now performs error checking to verify that
1000
 * the input is valid UTF-8 (before it was documented to not do error
1001
 * checking).
1002
 *
1003
 * Return value: a pointer to a newly allocated UCS-4 string.
1004
 *               This value must be deallocated by the caller.
1005
 **/
1006
uint32_t *
1007
stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
1008
0
{
1009
0
  size_t n;
1010
1011
0
  if (len < 0)
1012
0
    n = strlen (str);
1013
0
  else
1014
0
    n = len;
1015
1016
0
  if (u8_check ((const uint8_t *) str, n))
1017
0
    return NULL;
1018
1019
0
  return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
1020
0
}
1021
1022
/**
1023
 * stringprep_ucs4_to_utf8:
1024
 * @str: a UCS-4 encoded string
1025
 * @len: the maximum length of @str to use. If @len < 0, then
1026
 *       the string is terminated with a 0 character.
1027
 * @items_read: location to store number of characters read read, or %NULL.
1028
 * @items_written: location to store number of bytes written or %NULL.
1029
 *                 The value here stored does not include the trailing 0
1030
 *                 byte.
1031
 *
1032
 * Convert a string from a 32-bit fixed width representation as UCS-4.
1033
 * to UTF-8. The result will be terminated with a 0 byte.
1034
 *
1035
 * Return value: a pointer to a newly allocated UTF-8 string.
1036
 *               This value must be deallocated by the caller.
1037
 *               If an error occurs, %NULL will be returned.
1038
 **/
1039
char *
1040
stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1041
       size_t *items_read, size_t *items_written)
1042
0
{
1043
0
  return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1044
0
       (glong *) items_written);
1045
0
}
1046
1047
/**
1048
 * stringprep_utf8_nfkc_normalize:
1049
 * @str: a UTF-8 encoded string.
1050
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1051
 *
1052
 * Converts a string into canonical form, standardizing
1053
 * such issues as whether a character with an accent
1054
 * is represented as a base character and combining
1055
 * accent or as a single precomposed character.
1056
 *
1057
 * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1058
 * differences that do not affect the text content, such as the
1059
 * above-mentioned accent representation. It standardizes the
1060
 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1061
 * the standard forms (in this case DIGIT THREE). Formatting
1062
 * information may be lost but for most text operations such
1063
 * characters should be considered the same. It returns a result with
1064
 * composed forms rather than a maximally decomposed form.
1065
 *
1066
 * Return value: a newly allocated string, that is the
1067
 *   NFKC normalized form of @str.
1068
 **/
1069
char *
1070
stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1071
0
{
1072
0
  size_t n;
1073
1074
0
  if (len < 0)
1075
0
    n = strlen (str);
1076
0
  else
1077
0
    n = len;
1078
1079
0
  if (u8_check ((const uint8_t *) str, n))
1080
0
    return NULL;
1081
1082
0
  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1083
0
}
1084
1085
#include <stdio.h>
1086
/**
1087
 * stringprep_ucs4_nfkc_normalize:
1088
 * @str: a Unicode string.
1089
 * @len: length of @str array, or -1 if @str is nul-terminated.
1090
 *
1091
 * Converts a UCS4 string into canonical form, see
1092
 * stringprep_utf8_nfkc_normalize() for more information.
1093
 *
1094
 * Return value: a newly allocated Unicode string, that is the NFKC
1095
 *   normalized form of @str.
1096
 **/
1097
uint32_t *
1098
stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
1099
0
{
1100
0
  char *p;
1101
0
  uint32_t *result_wc;
1102
1103
0
  p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1104
0
  if (!p)
1105
0
    return NULL;
1106
1107
0
  result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1108
0
  free (p);
1109
1110
0
  return result_wc;
1111
0
}