| Line | Count | Source (jump to first uncovered line) | 
| 1 |  | /* nfkc.c --- Unicode normalization utilities. | 
| 2 |  |    Copyright (C) 2002-2024 Simon Josefsson | 
| 3 |  |  | 
| 4 |  |    This file is part of GNU Libidn. | 
| 5 |  |  | 
| 6 |  |    GNU Libidn is free software: you can redistribute it and/or | 
| 7 |  |    modify it under the terms of either: | 
| 8 |  |  | 
| 9 |  |      * the GNU Lesser General Public License as published by the Free | 
| 10 |  |        Software Foundation; either version 3 of the License, or (at | 
| 11 |  |        your option) any later version. | 
| 12 |  |  | 
| 13 |  |    or | 
| 14 |  |  | 
| 15 |  |      * the GNU General Public License as published by the Free | 
| 16 |  |        Software Foundation; either version 2 of the License, or (at | 
| 17 |  |        your option) any later version. | 
| 18 |  |  | 
| 19 |  |    or both in parallel, as here. | 
| 20 |  |  | 
| 21 |  |    GNU Libidn is distributed in the hope that it will be useful, | 
| 22 |  |    but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 23 |  |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
| 24 |  |    General Public License for more details. | 
| 25 |  |  | 
| 26 |  |    You should have received copies of the GNU General Public License and | 
| 27 |  |    the GNU Lesser General Public License along with this program.  If | 
| 28 |  |    not, see <https://www.gnu.org/licenses/>. */ | 
| 29 |  |  | 
| 30 |  | #ifdef HAVE_CONFIG_H | 
| 31 |  | # include "config.h" | 
| 32 |  | #endif | 
| 33 |  |  | 
| 34 |  | #include <stdlib.h> | 
| 35 |  | #include <string.h> | 
| 36 |  |  | 
| 37 |  | #include "stringprep.h" | 
| 38 |  |  | 
/* Compatibility shims.  The code below was copied from GLIB; mapping the
   GLIB spellings onto plain C names lets it stay textually close to its
   origin, which makes re-syncing easier.  */

/* Character and boolean types.  */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define gunichar uint32_t

/* Integer types.  */
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t

/* Size types.  */
#define gsize size_t
#define gssize ssize_t

/* Allocator entry points.  */
#define g_malloc malloc
#define g_free free
/* Return VAL from the enclosing function unless EXPR holds.

   The body is wrapped in do { ... } while (0) so that an invocation
   followed by ';' forms exactly one statement.  A bare { ... } block
   followed by ';' is two statements, which breaks an unbraced
   "if (x) g_return_val_if_fail (...); else ...".  */
#define g_return_val_if_fail(expr,val)          \
  do                                            \
    {                                           \
      if (!(expr))                              \
        return (val);                           \
    }                                           \
  while (0)
| 57 |  |  | 
| 58 |  | /* Code from GLIB gmacros.h starts here. */ | 
| 59 |  |  | 
| 60 |  | /* GLIB - Library of useful routines for C programming | 
| 61 |  |  * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald | 
| 62 |  |  * | 
| 63 |  |  * This library is free software; you can redistribute it and/or | 
| 64 |  |  * modify it under the terms of the GNU Lesser General Public | 
| 65 |  |  * License as published by the Free Software Foundation; either | 
| 66 |  |  * version 2 of the License, or (at your option) any later version. | 
| 67 |  |  * | 
| 68 |  |  * This library is distributed in the hope that it will be useful, | 
| 69 |  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 70 |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
| 71 |  |  * Lesser General Public License for more details. | 
| 72 |  |  * | 
| 73 |  |  * You should have received a copy of the GNU Lesser General Public | 
| 74 |  |  * License along with this library; if not, write to the | 
| 75 |  |  * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 
| 76 |  |  * Boston, MA 02111-1307, USA. | 
| 77 |  |  */ | 
| 78 |  |  | 
/* Boolean constants, defined only when the environment has not already
   provided them.  */
#if !defined (FALSE)
# define FALSE (0)
#endif

#if !defined (TRUE)
# define TRUE (!FALSE)
#endif

/* Number of elements in the array ARR.  ARR must be a real array, not a
   pointer.  */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof (*(arr)))

/* Branch hint placeholder: here it simply evaluates EXPR.  */
#define G_UNLIKELY(expr) (expr)
| 90 |  |  | 
| 91 |  | /* Code from GLIB gunicode.h starts here. */ | 
| 92 |  |  | 
| 93 |  | /* gunicode.h - Unicode manipulation functions | 
| 94 |  |  * | 
| 95 |  |  *  Copyright (C) 1999, 2000 Tom Tromey | 
| 96 |  |  *  Copyright 2000, 2005 Red Hat, Inc. | 
| 97 |  |  * | 
| 98 |  |  * The Gnome Library is free software; you can redistribute it and/or | 
| 99 |  |  * modify it under the terms of the GNU Lesser General Public License as | 
| 100 |  |  * published by the Free Software Foundation; either version 2 of the | 
| 101 |  |  * License, or (at your option) any later version. | 
| 102 |  |  * | 
| 103 |  |  * The Gnome Library is distributed in the hope that it will be useful, | 
| 104 |  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 105 |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
| 106 |  |  * Lesser General Public License for more details. | 
| 107 |  |  * | 
| 108 |  |  * You should have received a copy of the GNU Lesser General Public | 
| 109 |  |  * License along with the Gnome Library; see the file COPYING.LIB.  If not, | 
| 110 |  |  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 
| 111 |  |  *   Boston, MA 02111-1307, USA. | 
| 112 |  |  */ | 
| 113 |  |  | 
/* Normalization forms.  Each form has two spellings (a descriptive GLIB
   name and the Unicode form name) that share one value.  */
typedef enum
{
  /* Unicode form names.  */
  G_NORMALIZE_NFD = 0,
  G_NORMALIZE_NFC = 1,
  G_NORMALIZE_NFKD = 2,
  G_NORMALIZE_NFKC = 3,

  /* Descriptive aliases.  */
  G_NORMALIZE_DEFAULT = G_NORMALIZE_NFD,
  G_NORMALIZE_DEFAULT_COMPOSE = G_NORMALIZE_NFC,
  G_NORMALIZE_ALL = G_NORMALIZE_NFKD,
  G_NORMALIZE_ALL_COMPOSE = G_NORMALIZE_NFKC
}
GNormalizeMode;
| 126 |  |  | 
/* Address of the character following the one at P.  The distance to
   advance is looked up in g_utf8_skip, indexed by the byte P points at
   (read as unsigned).  No bounds or validity checking is performed.  */
#define g_utf8_next_char(p) (&(p)[g_utf8_skip[*(const guchar *) (p)]])
| 128 |  |  | 
| 129 |  | /* Code from GLIB gutf8.c starts here. */ | 
| 130 |  |  | 
| 131 |  | /* gutf8.c - Operations on UTF-8 strings. | 
| 132 |  |  * | 
| 133 |  |  * Copyright (C) 1999 Tom Tromey | 
| 134 |  |  * Copyright (C) 2000 Red Hat, Inc. | 
| 135 |  |  * | 
| 136 |  |  * This library is free software; you can redistribute it and/or | 
| 137 |  |  * modify it under the terms of the GNU Lesser General Public | 
| 138 |  |  * License as published by the Free Software Foundation; either | 
| 139 |  |  * version 2 of the License, or (at your option) any later version. | 
| 140 |  |  * | 
| 141 |  |  * This library is distributed in the hope that it will be useful, | 
| 142 |  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 143 |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
| 144 |  |  * Lesser General Public License for more details. | 
| 145 |  |  * | 
| 146 |  |  * You should have received a copy of the GNU Lesser General Public | 
| 147 |  |  * License along with this library; if not, write to the | 
| 148 |  |  * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 
| 149 |  |  * Boston, MA 02111-1307, USA. | 
| 150 |  |  */ | 
| 151 |  |  | 
/* Classify the UTF-8 lead byte Char.

   On return Len is the sequence length (1..6) implied by Char and Mask
   selects the payload bits of the lead byte.  If Char cannot start a
   sequence, Len is set to -1 and Mask is left untouched.

   Every argument is parenthesized so that an expression such as
   "b | 0x80" is classified as a whole, and the body is wrapped in
   do { ... } while (0) so the macro behaves as a single statement.  */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  do                                            \
    {                                           \
      if ((Char) < 128)                         \
        {                                       \
          (Len) = 1;                            \
          (Mask) = 0x7f;                        \
        }                                       \
      else if (((Char) & 0xe0) == 0xc0)         \
        {                                       \
          (Len) = 2;                            \
          (Mask) = 0x1f;                        \
        }                                       \
      else if (((Char) & 0xf0) == 0xe0)         \
        {                                       \
          (Len) = 3;                            \
          (Mask) = 0x0f;                        \
        }                                       \
      else if (((Char) & 0xf8) == 0xf0)         \
        {                                       \
          (Len) = 4;                            \
          (Mask) = 0x07;                        \
        }                                       \
      else if (((Char) & 0xfc) == 0xf8)         \
        {                                       \
          (Len) = 5;                            \
          (Mask) = 0x03;                        \
        }                                       \
      else if (((Char) & 0xfe) == 0xfc)         \
        {                                       \
          (Len) = 6;                            \
          (Mask) = 0x01;                        \
        }                                       \
      else                                      \
        (Len) = -1;                             \
    }                                           \
  while (0)
| 185 |  |  | 
/* Number of bytes (1..6) needed to encode the code point Char as UTF-8.
   Char is evaluated more than once.  */
#define UTF8_LENGTH(Char)                       \
  ((Char) >= 0x4000000 ? 6 :                    \
   ((Char) >= 0x200000 ? 5 :                    \
    ((Char) >= 0x10000 ? 4 :                    \
     ((Char) >= 0x800 ? 3 :                     \
      ((Char) >= 0x80 ? 2 : 1)))))
| 192 |  |  | 
/* Decode the Len-byte UTF-8 sequence starting at Chars into Result.

   Mask selects the payload bits of the lead byte (as produced by
   UTF8_COMPUTE); Count is a scratch integer lvalue used as the byte
   index.  If a continuation byte does not have the form 10xxxxxx,
   Result is set to -1 and decoding stops at that byte.

   The body is wrapped in do { ... } while (0): the expansion is
   otherwise two statements, so under an unbraced "if" only the first
   assignment would be conditional and the loop would always run.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len)               \
  do                                                            \
    {                                                           \
      (Result) = (Chars)[0] & (Mask);                           \
      for ((Count) = 1; (Count) < (Len); ++(Count))             \
        {                                                       \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                \
            {                                                   \
              (Result) = -1;                                    \
              break;                                            \
            }                                                   \
          (Result) <<= 6;                                       \
          (Result) |= ((Chars)[(Count)] & 0x3f);                \
        }                                                       \
    }                                                           \
  while (0)
| 205 |  |  | 
| 206 |  | static const gchar utf8_skip_data[256] = { | 
| 207 |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 
| 208 |  |   1, 1, 1, 1, 1, 1, 1, | 
| 209 |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 
| 210 |  |   1, 1, 1, 1, 1, 1, 1, | 
| 211 |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 
| 212 |  |   1, 1, 1, 1, 1, 1, 1, | 
| 213 |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 
| 214 |  |   1, 1, 1, 1, 1, 1, 1, | 
| 215 |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 
| 216 |  |   1, 1, 1, 1, 1, 1, 1, | 
| 217 |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 
| 218 |  |   1, 1, 1, 1, 1, 1, 1, | 
| 219 |  |   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 
| 220 |  |   2, 2, 2, 2, 2, 2, 2, | 
| 221 |  |   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, | 
| 222 |  |   5, 5, 5, 6, 6, 1, 1 | 
| 223 |  | }; | 
| 224 |  |  | 
| 225 |  | static const gchar *const g_utf8_skip = utf8_skip_data; | 
| 226 |  |  | 
| 227 |  | /* | 
| 228 |  |  * g_utf8_strlen: | 
| 229 |  |  * @p: pointer to the start of a UTF-8 encoded string | 
| 230 |  |  * @max: the maximum number of bytes to examine. If @max | 
| 231 |  |  *       is less than 0, then the string is assumed to be | 
| 232 |  |  *       nul-terminated. If @max is 0, @p will not be examined and | 
| 233 |  |  *       may be %NULL. | 
| 234 |  |  * | 
| 235 |  |  * Computes the length of the string in characters, not including | 
| 236 |  |  * the terminating nul character. | 
| 237 |  |  * | 
| 238 |  |  * Return value: the length of the string in characters | 
| 239 |  |  **/ | 
| 240 |  | static gsize | 
| 241 |  | g_utf8_strlen (const gchar *p) | 
| 242 | 356k | { | 
| 243 | 356k |   gsize len = 0; | 
| 244 |  |  | 
| 245 | 356k |   g_return_val_if_fail (p != NULL, 0); | 
| 246 |  |  | 
| 247 | 4.72M |   while (*p) | 
| 248 | 4.37M |     { | 
| 249 | 4.37M |       p = g_utf8_next_char (p); | 
| 250 | 4.37M |       ++len; | 
| 251 | 4.37M |     } | 
| 252 |  |  | 
| 253 | 356k |   return len; | 
| 254 | 356k | } | 
| 255 |  |  | 
| 256 |  | /* | 
| 257 |  |  * g_utf8_get_char: | 
| 258 |  |  * @p: a pointer to Unicode character encoded as UTF-8 | 
| 259 |  |  * | 
| 260 |  |  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. | 
| 261 |  |  * If @p does not point to a valid UTF-8 encoded character, results are | 
| 262 |  |  * undefined. If you are not sure that the bytes are complete | 
| 263 |  |  * valid Unicode characters, you should use g_utf8_get_char_validated() | 
| 264 |  |  * instead. | 
| 265 |  |  * | 
| 266 |  |  * Return value: the resulting character | 
| 267 |  |  **/ | 
| 268 |  | static gunichar | 
| 269 |  | g_utf8_get_char (const gchar *p) | 
| 270 | 5.34M | { | 
| 271 | 5.34M |   int i, mask = 0, len; | 
| 272 | 5.34M |   gunichar result; | 
| 273 | 5.34M |   unsigned char c = (unsigned char) *p; | 
| 274 |  |  | 
| 275 | 5.34M |   UTF8_COMPUTE (c, mask, len); | 
| 276 | 5.34M |   if (len == -1) | 
| 277 | 187 |     return (gunichar) - 1; | 
| 278 | 5.34M |   UTF8_GET (result, p, i, mask, len); | 
| 279 |  |  | 
| 280 | 5.34M |   return result; | 
| 281 | 5.34M | } | 
| 282 |  |  | 
| 283 |  | /* | 
| 284 |  |  * g_unichar_to_utf8: | 
| 285 |  |  * @c: a Unicode character code | 
| 286 |  |  * @outbuf: output buffer, must have at least 6 bytes of space. | 
| 287 |  |  *       If %NULL, the length will be computed and returned | 
| 288 |  |  *       and nothing will be written to @outbuf. | 
| 289 |  |  * | 
| 290 |  |  * Converts a single character to UTF-8. | 
| 291 |  |  * | 
| 292 |  |  * Return value: number of bytes written | 
| 293 |  |  **/ | 
| 294 |  | static int | 
| 295 |  | g_unichar_to_utf8 (gunichar c, gchar *outbuf) | 
| 296 | 983k | { | 
| 297 |  |   /* If this gets modified, also update the copy in g_string_insert_unichar() */ | 
| 298 | 983k |   guint len = 0; | 
| 299 | 983k |   int first; | 
| 300 | 983k |   int i; | 
| 301 |  |  | 
| 302 | 983k |   if (c < 0x80) | 
| 303 | 228k |     { | 
| 304 | 228k |       first = 0; | 
| 305 | 228k |       len = 1; | 
| 306 | 228k |     } | 
| 307 | 754k |   else if (c < 0x800) | 
| 308 | 417k |     { | 
| 309 | 417k |       first = 0xc0; | 
| 310 | 417k |       len = 2; | 
| 311 | 417k |     } | 
| 312 | 337k |   else if (c < 0x10000) | 
| 313 | 329k |     { | 
| 314 | 329k |       first = 0xe0; | 
| 315 | 329k |       len = 3; | 
| 316 | 329k |     } | 
| 317 | 7.61k |   else if (c < 0x200000) | 
| 318 | 4.70k |     { | 
| 319 | 4.70k |       first = 0xf0; | 
| 320 | 4.70k |       len = 4; | 
| 321 | 4.70k |     } | 
| 322 | 2.91k |   else if (c < 0x4000000) | 
| 323 | 295 |     { | 
| 324 | 295 |       first = 0xf8; | 
| 325 | 295 |       len = 5; | 
| 326 | 295 |     } | 
| 327 | 2.62k |   else | 
| 328 | 2.62k |     { | 
| 329 | 2.62k |       first = 0xfc; | 
| 330 | 2.62k |       len = 6; | 
| 331 | 2.62k |     } | 
| 332 |  |  | 
| 333 | 983k |   if (outbuf) | 
| 334 | 983k |     { | 
| 335 | 2.08M |       for (i = len - 1; i > 0; --i) | 
| 336 | 1.10M |   { | 
| 337 | 1.10M |     outbuf[i] = (c & 0x3f) | 0x80; | 
| 338 | 1.10M |     c >>= 6; | 
| 339 | 1.10M |   } | 
| 340 | 983k |       outbuf[0] = c | first; | 
| 341 | 983k |     } | 
| 342 |  |  | 
| 343 | 983k |   return len; | 
| 344 | 983k | } | 
| 345 |  |  | 
| 346 |  | /* | 
| 347 |  |  * g_utf8_to_ucs4_fast: | 
| 348 |  |  * @str: a UTF-8 encoded string | 
| 349 |  |  * @len: the maximum length of @str to use, in bytes. If @len < 0, | 
| 350 |  |  *       then the string is nul-terminated. | 
| 351 |  |  * @items_written: location to store the number of characters in the | 
| 352 |  |  *                 result, or %NULL. | 
| 353 |  |  * | 
| 354 |  |  * Convert a string from UTF-8 to a 32-bit fixed width | 
| 355 |  |  * representation as UCS-4, assuming valid UTF-8 input. | 
| 356 |  |  * This function is roughly twice as fast as g_utf8_to_ucs4() | 
| 357 |  |  * but does no error checking on the input. A trailing 0 character | 
| 358 |  |  * will be added to the string after the converted text. | 
| 359 |  |  * | 
| 360 |  |  * Return value: a pointer to a newly allocated UCS-4 string. | 
| 361 |  |  *               This value must be freed with g_free(). | 
| 362 |  |  **/ | 
| 363 |  | static gunichar * | 
| 364 |  | g_utf8_to_ucs4_fast (const gchar *str, gssize len, gsize *items_written) | 
| 365 | 16.6k | { | 
| 366 | 16.6k |   gunichar *result; | 
| 367 | 16.6k |   gsize n_chars, i; | 
| 368 | 16.6k |   const gchar *p; | 
| 369 |  |  | 
| 370 | 16.6k |   g_return_val_if_fail (str != NULL, NULL); | 
| 371 |  |  | 
| 372 | 16.6k |   p = str; | 
| 373 | 16.6k |   n_chars = 0; | 
| 374 | 16.6k |   if (len < 0) | 
| 375 | 16.6k |     { | 
| 376 | 558k |       while (*p) | 
| 377 | 541k |   { | 
| 378 | 541k |     p = g_utf8_next_char (p); | 
| 379 | 541k |     ++n_chars; | 
| 380 | 541k |   } | 
| 381 | 16.6k |     } | 
| 382 | 0 |   else | 
| 383 | 0 |     { | 
| 384 | 0 |       while (p < str + len && *p) | 
| 385 | 0 |   { | 
| 386 | 0 |     p = g_utf8_next_char (p); | 
| 387 | 0 |     ++n_chars; | 
| 388 | 0 |   } | 
| 389 | 0 |     } | 
| 390 |  |  | 
| 391 | 16.6k |   result = g_malloc (sizeof (gunichar) * (n_chars + 1)); | 
| 392 | 16.6k |   if (!result) | 
| 393 | 0 |     return NULL; | 
| 394 |  |  | 
| 395 | 16.6k |   p = str; | 
| 396 | 558k |   for (i = 0; i < n_chars; i++) | 
| 397 | 541k |     { | 
| 398 | 541k |       gunichar wc = (guchar) * p++; | 
| 399 |  |  | 
| 400 | 541k |       if (wc < 0x80) | 
| 401 | 146k |   { | 
| 402 | 146k |     result[i] = wc; | 
| 403 | 146k |   } | 
| 404 | 395k |       else | 
| 405 | 395k |   { | 
| 406 | 395k |     gunichar mask = 0x40; | 
| 407 |  |  | 
| 408 | 395k |     if (G_UNLIKELY ((wc & mask) == 0)) | 
| 409 | 0 |       { | 
| 410 |  |         /* It's an out-of-sequence 10xxxxxxx byte. | 
| 411 |  |          * Rather than making an ugly hash of this and the next byte | 
| 412 |  |          * and overrunning the buffer, it's more useful to treat it | 
| 413 |  |          * with a replacement character */ | 
| 414 | 0 |         result[i] = 0xfffd; | 
| 415 | 0 |         continue; | 
| 416 | 0 |       } | 
| 417 |  |  | 
| 418 | 395k |     do | 
| 419 | 698k |       { | 
| 420 | 698k |         wc <<= 6; | 
| 421 | 698k |         wc |= (guchar) (*p++) & 0x3f; | 
| 422 | 698k |         mask <<= 5; | 
| 423 | 698k |       } | 
| 424 | 698k |     while ((wc & mask) != 0); | 
| 425 |  |  | 
| 426 | 395k |     wc &= mask - 1; | 
| 427 |  |  | 
| 428 | 395k |     result[i] = wc; | 
| 429 | 395k |   } | 
| 430 | 541k |     } | 
| 431 | 16.6k |   result[i] = 0; | 
| 432 |  |  | 
| 433 | 16.6k |   if (items_written) | 
| 434 | 14.8k |     *items_written = i; | 
| 435 |  |  | 
| 436 | 16.6k |   return result; | 
| 437 | 16.6k | } | 
| 438 |  |  | 
| 439 |  | /* | 
| 440 |  |  * g_ucs4_to_utf8: | 
| 441 |  |  * @str: a UCS-4 encoded string | 
| 442 |  |  * @len: the maximum length (number of characters) of @str to use. | 
| 443 |  |  *       If @len < 0, then the string is nul-terminated. | 
| 444 |  |  * @items_read: location to store number of characters read, or %NULL. | 
| 445 |  |  * @items_written: location to store number of bytes written or %NULL. | 
| 446 |  |  *                 The value here stored does not include the trailing 0 | 
| 447 |  |  *                 byte. | 
| 448 |  |  * @error: location to store the error occurring, or %NULL to ignore | 
| 449 |  |  *         errors. Any of the errors in #GConvertError other than | 
| 450 |  |  *         %G_CONVERT_ERROR_NO_CONVERSION may occur. | 
| 451 |  |  * | 
| 452 |  |  * Convert a string from a 32-bit fixed width representation as UCS-4. | 
| 453 |  |  * to UTF-8. The result will be terminated with a 0 byte. | 
| 454 |  |  * | 
| 455 |  |  * Return value: a pointer to a newly allocated UTF-8 string. | 
| 456 |  |  *               This value must be freed with g_free(). If an | 
| 457 |  |  *               error occurs, %NULL will be returned and | 
| 458 |  |  *               @error set. In that case, @items_read will be | 
| 459 |  |  *               set to the position of the first invalid input | 
| 460 |  |  *               character. | 
| 461 |  |  **/ | 
| 462 |  | static gchar * | 
| 463 |  | g_ucs4_to_utf8 (const gunichar *str, | 
| 464 |  |     gsize len, gsize *items_read, gsize *items_written) | 
| 465 | 19.8k | { | 
| 466 | 19.8k |   gint result_length; | 
| 467 | 19.8k |   gchar *result = NULL; | 
| 468 | 19.8k |   gchar *p; | 
| 469 | 19.8k |   gsize i; | 
| 470 |  |  | 
| 471 | 19.8k |   result_length = 0; | 
| 472 | 1.00M |   for (i = 0; i < len; i++) | 
| 473 | 982k |     { | 
| 474 | 982k |       if (!str[i]) | 
| 475 | 1.70k |   break; | 
| 476 |  |  | 
| 477 | 981k |       if (str[i] >= 0x80000000) | 
| 478 | 405 |   goto err_out; | 
| 479 |  |  | 
| 480 | 980k |       result_length += UTF8_LENGTH (str[i]); | 
| 481 | 980k |     } | 
| 482 |  |  | 
| 483 | 19.4k |   result = g_malloc (result_length + 1); | 
| 484 | 19.4k |   if (!result) | 
| 485 | 0 |     return NULL; | 
| 486 | 19.4k |   p = result; | 
| 487 |  |  | 
| 488 | 19.4k |   i = 0; | 
| 489 | 1.00M |   while (p < result + result_length) | 
| 490 | 980k |     p += g_unichar_to_utf8 (str[i++], p); | 
| 491 |  |  | 
| 492 | 19.4k |   *p = '\0'; | 
| 493 |  |  | 
| 494 | 19.4k |   if (items_written) | 
| 495 | 0 |     *items_written = p - result; | 
| 496 |  |  | 
| 497 | 19.8k | err_out: | 
| 498 | 19.8k |   if (items_read) | 
| 499 | 0 |     *items_read = i; | 
| 500 |  |  | 
| 501 | 19.8k |   return result; | 
| 502 | 19.4k | } | 
| 503 |  |  | 
| 504 |  | /* Code from GLIB gunidecomp.c starts here. */ | 
| 505 |  |  | 
| 506 |  | /* decomp.c - Character decomposition. | 
| 507 |  |  * | 
| 508 |  |  *  Copyright (C) 1999, 2000 Tom Tromey | 
| 509 |  |  *  Copyright 2000 Red Hat, Inc. | 
| 510 |  |  * | 
| 511 |  |  * The Gnome Library is free software; you can redistribute it and/or | 
| 512 |  |  * modify it under the terms of the GNU Lesser General Public License as | 
| 513 |  |  * published by the Free Software Foundation; either version 2 of the | 
| 514 |  |  * License, or (at your option) any later version. | 
| 515 |  |  * | 
| 516 |  |  * The Gnome Library is distributed in the hope that it will be useful, | 
| 517 |  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 518 |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
| 519 |  |  * Lesser General Public License for more details. | 
| 520 |  |  * | 
| 521 |  |  * You should have received a copy of the GNU Lesser General Public | 
| 522 |  |  * License along with the Gnome Library; see the file COPYING.LIB.  If not, | 
| 523 |  |  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 
| 524 |  |  *   Boston, MA 02111-1307, USA. | 
| 525 |  |  */ | 
| 526 |  |  | 
| 527 |  | #include "gunidecomp.h" | 
| 528 |  | #include "gunicomp.h" | 
| 529 |  |  | 
/* Combining class lookup in the first page table.  A page entry below
   G_UNICODE_MAX_TABLE_INDEX selects a row of cclass_data, indexed by
   Char; any other entry stands for the whole page and yields the entry
   minus G_UNICODE_MAX_TABLE_INDEX.  */
#define CC_PART1(Page, Char)                                            \
  ((combining_class_table_part1[Page] < G_UNICODE_MAX_TABLE_INDEX)      \
   ? (cclass_data[combining_class_table_part1[Page]][Char])             \
   : (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Same lookup in the second page table.  */
#define CC_PART2(Page, Char)                                            \
  ((combining_class_table_part2[Page] < G_UNICODE_MAX_TABLE_INDEX)      \
   ? (cclass_data[combining_class_table_part2[Page]][Char])             \
   : (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Combining class of Char.  Characters up to G_UNICODE_LAST_CHAR_PART1
   use the first table, characters from 0xe0000 up to G_UNICODE_LAST_CHAR
   use the second, and everything else has class 0.  */
#define COMBINING_CLASS(Char)                                           \
  (((Char) > G_UNICODE_LAST_CHAR_PART1)                                 \
   ? (((Char) < 0xe0000 || (Char) > G_UNICODE_LAST_CHAR)                \
      ? 0                                                               \
      : CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff))              \
   : CC_PART1 ((Char) >> 8, (Char) & 0xff))
| 546 |  |  | 
/* Constants for Hangul syllable [de]composition.  */

/* First code point of each range.  */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7

/* Size of each range.  */
#define LCount 19
#define VCount 21
#define TCount 28

/* Derived sizes.  */
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
| 557 |  |  | 
| 558 |  | /* | 
| 559 |  |  * g_unicode_canonical_ordering: | 
| 560 |  |  * @string: a UCS-4 encoded string. | 
| 561 |  |  * @len: the maximum length of @string to use. | 
| 562 |  |  * | 
| 563 |  |  * Computes the canonical ordering of a string in-place. | 
| 564 |  |  * This rearranges decomposed characters in the string | 
| 565 |  |  * according to their combining classes.  See the Unicode | 
| 566 |  |  * manual for more information. | 
| 567 |  |  **/ | 
| 568 |  | static void | 
| 569 |  | g_unicode_canonical_ordering (gunichar *string, gsize len) | 
| 570 | 418k | { | 
| 571 | 418k |   gsize i; | 
| 572 | 418k |   int swap = 1; | 
| 573 |  |  | 
| 574 | 838k |   while (swap) | 
| 575 | 420k |     { | 
| 576 | 420k |       int last; | 
| 577 | 420k |       swap = 0; | 
| 578 | 420k |       last = COMBINING_CLASS (string[0]); | 
| 579 | 8.88M |       for (i = 0; i < len - 1; ++i) | 
| 580 | 8.46M |   { | 
| 581 | 8.46M |     int next = COMBINING_CLASS (string[i + 1]); | 
| 582 | 8.46M |     if (next != 0 && last > next) | 
| 583 | 8.01k |       { | 
| 584 | 8.01k |         gsize j; | 
| 585 |  |         /* Percolate item leftward through string.  */ | 
| 586 | 23.5k |         for (j = i + 1; j > 0; --j) | 
| 587 | 23.1k |     { | 
| 588 | 23.1k |       gunichar t; | 
| 589 | 23.1k |       if (COMBINING_CLASS (string[j - 1]) <= next) | 
| 590 | 7.59k |         break; | 
| 591 | 15.5k |       t = string[j]; | 
| 592 | 15.5k |       string[j] = string[j - 1]; | 
| 593 | 15.5k |       string[j - 1] = t; | 
| 594 | 15.5k |       swap = 1; | 
| 595 | 15.5k |     } | 
| 596 |  |         /* We're re-entering the loop looking at the old | 
| 597 |  |            character again.  */ | 
| 598 | 8.01k |         next = last; | 
| 599 | 8.01k |       } | 
| 600 | 8.46M |     last = next; | 
| 601 | 8.46M |   } | 
| 602 | 420k |     } | 
| 603 | 418k | } | 
| 604 |  |  | 
| 605 |  | /* http://www.unicode.org/unicode/reports/tr15/#Hangul | 
| 606 |  |  * r should be null or have sufficient space. Calling with r == NULL will | 
| 607 |  |  * only calculate the result_len; however, a buffer with space for three | 
| 608 |  |  * characters will always be big enough. */ | 
| 609 |  | static void | 
| 610 |  | decompose_hangul (gunichar s, gunichar *r, gsize *result_len) | 
| 611 | 8.29k | { | 
| 612 | 8.29k |   gint SIndex = s - SBase; | 
| 613 | 8.29k |   gint TIndex = SIndex % TCount; | 
| 614 |  |  | 
| 615 | 8.29k |   if (r) | 
| 616 | 4.14k |     { | 
| 617 | 4.14k |       r[0] = LBase + SIndex / NCount; | 
| 618 | 4.14k |       r[1] = VBase + (SIndex % NCount) / TCount; | 
| 619 | 4.14k |     } | 
| 620 |  |  | 
| 621 | 8.29k |   if (TIndex) | 
| 622 | 2.26k |     { | 
| 623 | 2.26k |       if (r) | 
| 624 | 1.13k |   r[2] = TBase + TIndex; | 
| 625 | 2.26k |       *result_len = 3; | 
| 626 | 2.26k |     } | 
| 627 | 6.02k |   else | 
| 628 | 6.02k |     *result_len = 2; | 
| 629 | 8.29k | } | 
| 630 |  |  | 
| 631 |  | /* returns a pointer to a null-terminated UTF-8 string */ | 
| 632 |  | static const gchar * | 
| 633 |  | find_decomposition (gunichar ch, gboolean compat) | 
| 634 | 963k | { | 
| 635 | 963k |   int start = 0; | 
| 636 | 963k |   int end = G_N_ELEMENTS (decomp_table); | 
| 637 |  |  | 
| 638 | 963k |   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch) | 
| 639 | 738k |     { | 
| 640 | 8.73M |       while (TRUE) | 
| 641 | 8.73M |   { | 
| 642 | 8.73M |     int half = (start + end) / 2; | 
| 643 | 8.73M |     if (ch == decomp_table[half].ch) | 
| 644 | 713k |       { | 
| 645 | 713k |         int offset; | 
| 646 |  |  | 
| 647 | 713k |         if (compat) | 
| 648 | 713k |     { | 
| 649 | 713k |       offset = decomp_table[half].compat_offset; | 
| 650 | 713k |       if (offset == G_UNICODE_NOT_PRESENT_OFFSET) | 
| 651 | 161k |         offset = decomp_table[half].canon_offset; | 
| 652 | 713k |     } | 
| 653 | 0 |         else | 
| 654 | 0 |     { | 
| 655 | 0 |       offset = decomp_table[half].canon_offset; | 
| 656 | 0 |       if (offset == G_UNICODE_NOT_PRESENT_OFFSET) | 
| 657 | 0 |         return NULL; | 
| 658 | 0 |     } | 
| 659 |  |  | 
| 660 | 713k |         return &(decomp_expansion_string[offset]); | 
| 661 | 713k |       } | 
| 662 | 8.01M |     else if (half == start) | 
| 663 | 24.6k |       break; | 
| 664 | 7.99M |     else if (ch > decomp_table[half].ch) | 
| 665 | 4.01M |       start = half; | 
| 666 | 3.97M |     else | 
| 667 | 3.97M |       end = half; | 
| 668 | 8.73M |   } | 
| 669 | 738k |     } | 
| 670 |  |  | 
| 671 | 249k |   return NULL; | 
| 672 | 963k | } | 
| 673 |  |  | 
| 674 |  | /* L,V => LV and LV,T => LVT  */ | 
| 675 |  | static gboolean | 
| 676 |  | combine_hangul (gunichar a, gunichar b, gunichar *result) | 
| 677 | 4.33M | { | 
| 678 | 4.33M |   if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase) | 
| 679 | 4.38k |     { | 
| 680 | 4.38k |       gint LIndex = a - LBase; | 
| 681 | 4.38k |       gint VIndex = b - VBase; | 
| 682 |  |  | 
| 683 | 4.38k |       *result = SBase + (LIndex * VCount + VIndex) * TCount; | 
| 684 | 4.38k |       return TRUE; | 
| 685 | 4.38k |     } | 
| 686 |  |  | 
| 687 | 4.33M |   if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase) | 
| 688 | 1.38k |     { | 
| 689 | 1.38k |       gint SIndex = a - SBase; | 
| 690 |  |  | 
| 691 | 1.38k |       if ((SIndex % TCount) == 0) | 
| 692 | 1.18k |   { | 
| 693 | 1.18k |     gint TIndex = b - TBase; | 
| 694 |  |  | 
| 695 | 1.18k |     *result = a + TIndex; | 
| 696 | 1.18k |     return TRUE; | 
| 697 | 1.18k |   } | 
| 698 | 1.38k |     } | 
| 699 |  |  | 
| 700 | 4.33M |   return FALSE; | 
| 701 | 4.33M | } | 
| 702 |  |  | 
/* Composition index stored for character Char of page Page.  A page
   entry of G_UNICODE_MAX_TABLE_INDEX or more stands for the whole page
   and yields the entry minus G_UNICODE_MAX_TABLE_INDEX; any other entry
   selects a row of compose_data.  */
#define CI(Page, Char)                                  \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX)   \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX)  \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of Char, or 0 when Char lies beyond the last page
   of compose_table.  Char is parenthesized at every use so that an
   expression argument is shifted and masked as a whole.  */
#define COMPOSE_INDEX(Char)                                             \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST))                               \
   ? 0                                                                  \
   : CI ((Char) >> 8, (Char) & 0xff))
| 710 |  |  | 
| 711 |  | static gboolean | 
| 712 |  | combine (gunichar a, gunichar b, gunichar *result) | 
| 713 | 4.33M | { | 
| 714 | 4.33M |   gushort index_a, index_b; | 
| 715 |  |  | 
| 716 | 4.33M |   if (combine_hangul (a, b, result)) | 
| 717 | 5.57k |     return TRUE; | 
| 718 |  |  | 
| 719 | 4.33M |   index_a = COMPOSE_INDEX (a); | 
| 720 |  |  | 
| 721 | 4.33M |   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) | 
| 722 | 538k |     { | 
| 723 | 538k |       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) | 
| 724 | 1.72k |   { | 
| 725 | 1.72k |     *result = | 
| 726 | 1.72k |       compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; | 
| 727 | 1.72k |     return TRUE; | 
| 728 | 1.72k |   } | 
| 729 | 537k |       else | 
| 730 | 537k |   return FALSE; | 
| 731 | 538k |     } | 
| 732 |  |  | 
| 733 | 3.79M |   index_b = COMPOSE_INDEX (b); | 
| 734 |  |  | 
| 735 | 3.79M |   if (index_b >= COMPOSE_SECOND_SINGLE_START) | 
| 736 | 689 |     { | 
| 737 | 689 |       if (a == | 
| 738 | 689 |     compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) | 
| 739 | 301 |   { | 
| 740 | 301 |     *result = | 
| 741 | 301 |       compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; | 
| 742 | 301 |     return TRUE; | 
| 743 | 301 |   } | 
| 744 | 388 |       else | 
| 745 | 388 |   return FALSE; | 
| 746 | 689 |     } | 
| 747 |  |  | 
| 748 | 3.79M |   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START | 
| 749 | 3.79M |       && index_b >= COMPOSE_SECOND_START | 
| 750 | 3.79M |       && index_b < COMPOSE_SECOND_SINGLE_START) | 
| 751 | 5.33k |     { | 
| 752 | 5.33k |       gunichar res = | 
| 753 | 5.33k |   compose_array[index_a - COMPOSE_FIRST_START][index_b - | 
| 754 | 5.33k |                  COMPOSE_SECOND_START]; | 
| 755 |  |  | 
| 756 | 5.33k |       if (res) | 
| 757 | 4.94k |   { | 
| 758 | 4.94k |     *result = res; | 
| 759 | 4.94k |     return TRUE; | 
| 760 | 4.94k |   } | 
| 761 | 5.33k |     } | 
| 762 |  |  | 
| 763 | 3.78M |   return FALSE; | 
| 764 | 3.79M | } | 
| 765 |  |  | 
| 766 |  | static gunichar * | 
| 767 |  | _g_utf8_normalize_wc (const gchar *str, gssize max_len, GNormalizeMode mode) | 
| 768 | 14.5k | { | 
| 769 | 14.5k |   gsize n_wc; | 
| 770 | 14.5k |   gunichar *wc_buffer; | 
| 771 | 14.5k |   const char *p; | 
| 772 | 14.5k |   gsize last_start; | 
| 773 | 14.5k |   gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD); | 
| 774 | 14.5k |   gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC); | 
| 775 |  |  | 
| 776 | 14.5k |   n_wc = 0; | 
| 777 | 14.5k |   p = str; | 
| 778 | 500k |   while ((max_len < 0 || p < str + max_len) && *p) | 
| 779 | 485k |     { | 
| 780 | 485k |       const gchar *decomp; | 
| 781 | 485k |       gunichar wc = g_utf8_get_char (p); | 
| 782 |  |  | 
| 783 | 485k |       if (wc >= SBase && wc < SBase + SCount) | 
| 784 | 4.14k |   { | 
| 785 | 4.14k |     gsize result_len; | 
| 786 | 4.14k |     decompose_hangul (wc, NULL, &result_len); | 
| 787 | 4.14k |     n_wc += result_len; | 
| 788 | 4.14k |   } | 
| 789 | 481k |       else | 
| 790 | 481k |   { | 
| 791 | 481k |     decomp = find_decomposition (wc, do_compat); | 
| 792 |  |  | 
| 793 | 481k |     if (decomp) | 
| 794 | 356k |       n_wc += g_utf8_strlen (decomp); | 
| 795 | 124k |     else | 
| 796 | 124k |       n_wc++; | 
| 797 | 481k |   } | 
| 798 |  |  | 
| 799 | 485k |       p = g_utf8_next_char (p); | 
| 800 | 485k |     } | 
| 801 |  |  | 
| 802 | 14.5k |   wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1)); | 
| 803 | 14.5k |   if (!wc_buffer) | 
| 804 | 0 |     return NULL; | 
| 805 |  |  | 
| 806 | 14.5k |   last_start = 0; | 
| 807 | 14.5k |   n_wc = 0; | 
| 808 | 14.5k |   p = str; | 
| 809 | 500k |   while ((max_len < 0 || p < str + max_len) && *p) | 
| 810 | 485k |     { | 
| 811 | 485k |       gunichar wc = g_utf8_get_char (p); | 
| 812 | 485k |       const gchar *decomp; | 
| 813 | 485k |       int cc; | 
| 814 | 485k |       gsize old_n_wc = n_wc; | 
| 815 |  |  | 
| 816 | 485k |       if (wc >= SBase && wc < SBase + SCount) | 
| 817 | 4.14k |   { | 
| 818 | 4.14k |     gsize result_len; | 
| 819 | 4.14k |     decompose_hangul (wc, wc_buffer + n_wc, &result_len); | 
| 820 | 4.14k |     n_wc += result_len; | 
| 821 | 4.14k |   } | 
| 822 | 481k |       else | 
| 823 | 481k |   { | 
| 824 | 481k |     decomp = find_decomposition (wc, do_compat); | 
| 825 |  |  | 
| 826 | 481k |     if (decomp) | 
| 827 | 356k |       { | 
| 828 | 356k |         const char *pd; | 
| 829 | 4.72M |         for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd)) | 
| 830 | 4.37M |     wc_buffer[n_wc++] = g_utf8_get_char (pd); | 
| 831 | 356k |       } | 
| 832 | 124k |     else | 
| 833 | 124k |       wc_buffer[n_wc++] = wc; | 
| 834 | 481k |   } | 
| 835 |  |  | 
| 836 | 485k |       if (n_wc > 0) | 
| 837 | 485k |   { | 
| 838 | 485k |     cc = COMBINING_CLASS (wc_buffer[old_n_wc]); | 
| 839 |  |  | 
| 840 | 485k |     if (cc == 0) | 
| 841 | 403k |       { | 
| 842 | 403k |         g_unicode_canonical_ordering (wc_buffer + last_start, | 
| 843 | 403k |               n_wc - last_start); | 
| 844 | 403k |         last_start = old_n_wc; | 
| 845 | 403k |       } | 
| 846 | 485k |   } | 
| 847 |  |  | 
| 848 | 485k |       p = g_utf8_next_char (p); | 
| 849 | 485k |     } | 
| 850 |  |  | 
| 851 | 14.5k |   if (n_wc > 0) | 
| 852 | 14.1k |     { | 
| 853 | 14.1k |       g_unicode_canonical_ordering (wc_buffer + last_start, | 
| 854 | 14.1k |             n_wc - last_start); | 
| 855 |  |       /* dead assignment: last_start = n_wc; */ | 
| 856 | 14.1k |     } | 
| 857 |  |  | 
| 858 | 14.5k |   wc_buffer[n_wc] = 0; | 
| 859 |  |  | 
| 860 |  |   /* All decomposed and reordered */ | 
| 861 |  |  | 
| 862 | 14.5k |   if (do_compose && n_wc > 0) | 
| 863 | 14.1k |     { | 
| 864 | 14.1k |       gsize i, j; | 
| 865 | 14.1k |       int last_cc = 0; | 
| 866 | 14.1k |       last_start = 0; | 
| 867 |  |  | 
| 868 | 4.51M |       for (i = 0; i < n_wc; i++) | 
| 869 | 4.50M |   { | 
| 870 | 4.50M |     int cc = COMBINING_CLASS (wc_buffer[i]); | 
| 871 |  |  | 
| 872 | 4.50M |     if (i > 0 && | 
| 873 | 4.50M |         (last_cc == 0 || last_cc != cc) && | 
| 874 | 4.50M |         combine (wc_buffer[last_start], wc_buffer[i], | 
| 875 | 4.33M |            &wc_buffer[last_start])) | 
| 876 | 12.5k |       { | 
| 877 | 1.10M |         for (j = i + 1; j < n_wc; j++) | 
| 878 | 1.08M |     wc_buffer[j - 1] = wc_buffer[j]; | 
| 879 | 12.5k |         n_wc--; | 
| 880 | 12.5k |         i--; | 
| 881 |  |  | 
| 882 | 12.5k |         if (i == last_start) | 
| 883 | 11.2k |     last_cc = 0; | 
| 884 | 1.24k |         else | 
| 885 | 1.24k |     last_cc = COMBINING_CLASS (wc_buffer[i - 1]); | 
| 886 |  |  | 
| 887 | 12.5k |         continue; | 
| 888 | 12.5k |       } | 
| 889 |  |  | 
| 890 | 4.49M |     if (cc == 0) | 
| 891 | 4.33M |       last_start = i; | 
| 892 |  |  | 
| 893 | 4.49M |     last_cc = cc; | 
| 894 | 4.49M |   } | 
| 895 | 14.1k |     } | 
| 896 |  |  | 
| 897 | 14.5k |   wc_buffer[n_wc] = 0; | 
| 898 |  |  | 
| 899 | 14.5k |   return wc_buffer; | 
| 900 | 14.5k | } | 
| 901 |  |  | 
| 902 |  | /* | 
| 903 |  |  * g_utf8_normalize: | 
| 904 |  |  * @str: a UTF-8 encoded string. | 
| 905 |  |  * @len: length of @str, in bytes, or -1 if @str is nul-terminated. | 
| 906 |  |  * @mode: the type of normalization to perform. | 
| 907 |  |  * | 
| 908 |  |  * Converts a string into canonical form, standardizing | 
| 909 |  |  * such issues as whether a character with an accent | 
| 910 |  |  * is represented as a base character and combining | 
| 911 |  |  * accent or as a single precomposed character. The | 
| 912 |  |  * string has to be valid UTF-8, otherwise %NULL is | 
| 913 |  |  * returned. You should generally call g_utf8_normalize() | 
| 914 |  |  * before comparing two Unicode strings. | 
| 915 |  |  * | 
| 916 |  |  * The normalization mode %G_NORMALIZE_DEFAULT only | 
| 917 |  |  * standardizes differences that do not affect the | 
| 918 |  |  * text content, such as the above-mentioned accent | 
| 919 |  |  * representation. %G_NORMALIZE_ALL also standardizes | 
| 920 |  |  * the "compatibility" characters in Unicode, such | 
| 921 |  |  * as SUPERSCRIPT THREE to the standard forms | 
| 922 |  |  * (in this case DIGIT THREE). Formatting information | 
| 923 |  |  * may be lost but for most text operations such | 
| 924 |  |  * characters should be considered the same. | 
| 925 |  |  * | 
| 926 |  |  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE | 
| 927 |  |  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, | 
| 928 |  |  * but return a result with composed forms rather | 
| 929 |  |  * than a maximally decomposed form. This is often | 
| 930 |  |  * useful if you intend to convert the string to | 
| 931 |  |  * a legacy encoding or pass it to a system with | 
| 932 |  |  * less capable Unicode handling. | 
| 933 |  |  * | 
| 934 |  |  * Return value: a newly allocated string, that is the | 
| 935 |  |  *   normalized form of @str, or %NULL if @str is not | 
| 936 |  |  *   valid UTF-8. | 
| 937 |  |  **/ | 
| 938 |  | static gchar * | 
| 939 |  | g_utf8_normalize (const gchar *str, gssize len, GNormalizeMode mode) | 
| 940 | 1.70k | { | 
| 941 | 1.70k |   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); | 
| 942 | 1.70k |   gchar *result = NULL; | 
| 943 |  |  | 
| 944 | 1.70k |   if (result_wc) | 
| 945 | 1.70k |     result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL); | 
| 946 |  |  | 
| 947 | 1.70k |   g_free (result_wc); | 
| 948 |  |  | 
| 949 | 1.70k |   return result; | 
| 950 | 1.70k | } | 
| 951 |  |  | 
| 952 |  | /* Public Libidn API starts here. */ | 
| 953 |  |  | 
| 954 |  | /** | 
| 955 |  |  * stringprep_utf8_to_unichar: | 
| 956 |  |  * @p: a pointer to Unicode character encoded as UTF-8 | 
| 957 |  |  * | 
| 958 |  |  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. | 
| 959 |  |  * If @p does not point to a valid UTF-8 encoded character, results are | 
| 960 |  |  * undefined. | 
| 961 |  |  * | 
| 962 |  |  * Return value: the resulting character. | 
| 963 |  |  **/ | 
| 964 |  | uint32_t | 
| 965 |  | stringprep_utf8_to_unichar (const char *p) | 
| 966 | 2.51k | { | 
| 967 | 2.51k |   return g_utf8_get_char (p); | 
| 968 | 2.51k | } | 
| 969 |  |  | 
| 970 |  | /** | 
| 971 |  |  * stringprep_unichar_to_utf8: | 
| 972 |  |  * @c: an ISO 10646 character code | 
| 973 |  |  * @outbuf: output buffer, must have at least 6 bytes of space. | 
| 974 |  |  *       If %NULL, the length will be computed and returned | 
| 975 |  |  *       and nothing will be written to @outbuf. | 
| 976 |  |  * | 
| 977 |  |  * Converts a single character to UTF-8. | 
| 978 |  |  * | 
| 979 |  |  * Return value: number of bytes written, or the number that would be written if @outbuf is %NULL. | 
| 980 |  |  **/ | 
| 981 |  | int | 
| 982 |  | stringprep_unichar_to_utf8 (uint32_t c, char *outbuf) | 
| 983 | 2.51k | { | 
| 984 | 2.51k |   return g_unichar_to_utf8 (c, outbuf); | 
| 985 | 2.51k | } | 
| 986 |  |  | 
| 987 |  | #include <unistr.h> | 
| 988 |  |  | 
| 989 |  | /** | 
| 990 |  |  * stringprep_utf8_to_ucs4: | 
| 991 |  |  * @str: a UTF-8 encoded string | 
| 992 |  |  * @len: the maximum length of @str to use. If @len < 0, then | 
| 993 |  |  *       the string is nul-terminated. | 
| 994 |  |  * @items_written: location to store the number of characters in the | 
| 995 |  |  *                 result, or %NULL. | 
| 996 |  |  * | 
| 997 |  |  * Convert a string from UTF-8 to a 32-bit fixed width representation | 
| 998 |  |  * as UCS-4.  The function now performs error checking to verify that | 
| 999 |  |  * the input is valid UTF-8 (before it was documented to not do error | 
| 1000 |  |  * checking). | 
| 1001 |  |  * | 
| 1002 |  |  * Return value: a pointer to a newly allocated UCS-4 string. | 
| 1003 |  |  *               This value must be deallocated by the caller.  %NULL is returned if @str is not valid UTF-8. | 
| 1004 |  |  **/ | 
| 1005 |  | uint32_t * | 
| 1006 |  | stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written) | 
| 1007 | 20.0k | { | 
| 1008 | 20.0k |   size_t n; | 
| 1009 |  |  | 
| 1010 | 20.0k |   if (len < 0) | 
| 1011 | 20.0k |     n = strlen (str); | 
| 1012 | 0 |   else | 
| 1013 | 0 |     n = len; | 
| 1014 |  |  | 
| 1015 | 20.0k |   if (u8_check ((const uint8_t *) str, n)) | 
| 1016 | 3.33k |     return NULL; | 
| 1017 |  |  | 
| 1018 | 16.6k |   return g_utf8_to_ucs4_fast (str, len, items_written); | 
| 1019 | 20.0k | } | 
| 1020 |  |  | 
| 1021 |  | /** | 
| 1022 |  |  * stringprep_ucs4_to_utf8: | 
| 1023 |  |  * @str: a UCS-4 encoded string | 
| 1024 |  |  * @len: the maximum length of @str to use. If @len < 0, then | 
| 1025 |  |  *       the string is terminated with a 0 character. | 
| 1026 |  |  * @items_read: location to store number of characters read, or %NULL. | 
| 1027 |  |  * @items_written: location to store number of bytes written or %NULL. | 
| 1028 |  |  *                 The value here stored does not include the trailing 0 | 
| 1029 |  |  *                 byte. | 
| 1030 |  |  * | 
| 1031 |  |  * Convert a string from a 32-bit fixed width representation as UCS-4 | 
| 1032 |  |  * to UTF-8. The result will be terminated with a 0 byte. | 
| 1033 |  |  * | 
| 1034 |  |  * Return value: a pointer to a newly allocated UTF-8 string. | 
| 1035 |  |  *               This value must be deallocated by the caller. | 
| 1036 |  |  *               If an error occurs, %NULL will be returned. | 
| 1037 |  |  **/ | 
| 1038 |  | char * | 
| 1039 |  | stringprep_ucs4_to_utf8 (const uint32_t *str, ssize_t len, | 
| 1040 |  |        size_t *items_read, size_t *items_written) | 
| 1041 | 18.1k | { | 
| 1042 | 18.1k |   return g_ucs4_to_utf8 (str, len, items_read, items_written); | 
| 1043 | 18.1k | } | 
| 1044 |  |  | 
| 1045 |  | /** | 
| 1046 |  |  * stringprep_utf8_nfkc_normalize: | 
| 1047 |  |  * @str: a UTF-8 encoded string. | 
| 1048 |  |  * @len: length of @str, in bytes, or -1 if @str is nul-terminated. | 
| 1049 |  |  * | 
| 1050 |  |  * Converts a string into canonical form, standardizing | 
| 1051 |  |  * such issues as whether a character with an accent | 
| 1052 |  |  * is represented as a base character and combining | 
| 1053 |  |  * accent or as a single precomposed character. | 
| 1054 |  |  * | 
| 1055 |  |  * The normalization mode is NFKC (ALL COMPOSE).  It standardizes | 
| 1056 |  |  * differences that do not affect the text content, such as the | 
| 1057 |  |  * above-mentioned accent representation. It standardizes the | 
| 1058 |  |  * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to | 
| 1059 |  |  * the standard forms (in this case DIGIT THREE). Formatting | 
| 1060 |  |  * information may be lost but for most text operations such | 
| 1061 |  |  * characters should be considered the same. It returns a result with | 
| 1062 |  |  * composed forms rather than a maximally decomposed form. | 
| 1063 |  |  * | 
| 1064 |  |  * Return value: a newly allocated string, that is the | 
| 1065 |  |  *   NFKC normalized form of @str, or %NULL if @str is not valid UTF-8. | 
| 1066 |  |  **/ | 
| 1067 |  | char * | 
| 1068 |  | stringprep_utf8_nfkc_normalize (const char *str, ssize_t len) | 
| 1069 | 2.51k | { | 
| 1070 | 2.51k |   size_t n; | 
| 1071 |  |  | 
| 1072 | 2.51k |   if (len < 0) | 
| 1073 | 0 |     n = strlen (str); | 
| 1074 | 2.51k |   else | 
| 1075 | 2.51k |     n = len; | 
| 1076 |  |  | 
| 1077 | 2.51k |   if (u8_check ((const uint8_t *) str, n)) | 
| 1078 | 812 |     return NULL; | 
| 1079 |  |  | 
| 1080 | 1.70k |   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC); | 
| 1081 | 2.51k | } | 
| 1082 |  |  | 
| 1083 |  | #include <stdio.h> | 
| 1084 |  | /** | 
| 1085 |  |  * stringprep_ucs4_nfkc_normalize: | 
| 1086 |  |  * @str: a Unicode string. | 
| 1087 |  |  * @len: length of @str array, or -1 if @str is nul-terminated. | 
| 1088 |  |  * | 
| 1089 |  |  * Converts a UCS-4 string into canonical form, see | 
| 1090 |  |  * stringprep_utf8_nfkc_normalize() for more information. | 
| 1091 |  |  * | 
| 1092 |  |  * Return value: a newly allocated Unicode string, that is the NFKC | 
| 1093 |  |  *   normalized form of @str, or %NULL on error. | 
| 1094 |  |  **/ | 
| 1095 |  | uint32_t * | 
| 1096 |  | stringprep_ucs4_nfkc_normalize (const uint32_t *str, ssize_t len) | 
| 1097 | 13.2k | { | 
| 1098 | 13.2k |   char *p; | 
| 1099 | 13.2k |   uint32_t *result_wc; | 
| 1100 |  |  | 
| 1101 | 13.2k |   p = stringprep_ucs4_to_utf8 (str, len, 0, 0); | 
| 1102 | 13.2k |   if (!p) | 
| 1103 | 405 |     return NULL; | 
| 1104 |  |  | 
| 1105 | 12.8k |   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC); | 
| 1106 | 12.8k |   free (p); | 
| 1107 |  |  | 
| 1108 | 12.8k |   return result_wc; | 
| 1109 | 13.2k | } |