Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* nfkc.c --- Unicode normalization utilities.  | 
2  |  |    Copyright (C) 2002-2023 Simon Josefsson  | 
3  |  |  | 
4  |  |    This file is part of GNU Libidn.  | 
5  |  |  | 
6  |  |    GNU Libidn is free software: you can redistribute it and/or  | 
7  |  |    modify it under the terms of either:  | 
8  |  |  | 
9  |  |      * the GNU Lesser General Public License as published by the Free  | 
10  |  |        Software Foundation; either version 3 of the License, or (at  | 
11  |  |        your option) any later version.  | 
12  |  |  | 
13  |  |    or  | 
14  |  |  | 
15  |  |      * the GNU General Public License as published by the Free  | 
16  |  |        Software Foundation; either version 2 of the License, or (at  | 
17  |  |        your option) any later version.  | 
18  |  |  | 
19  |  |    or both in parallel, as here.  | 
20  |  |  | 
21  |  |    GNU Libidn is distributed in the hope that it will be useful,  | 
22  |  |    but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
23  |  |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU  | 
24  |  |    General Public License for more details.  | 
25  |  |  | 
26  |  |    You should have received copies of the GNU General Public License and  | 
27  |  |    the GNU Lesser General Public License along with this program.  If  | 
28  |  |    not, see <https://www.gnu.org/licenses/>. */  | 
29  |  |  | 
30  |  | #ifdef HAVE_CONFIG_H  | 
31  |  | # include "config.h"  | 
32  |  | #endif  | 
33  |  |  | 
34  |  | #include <stdlib.h>  | 
35  |  | #include <string.h>  | 
36  |  |  | 
37  |  | #include "stringprep.h"  | 
38  |  |  | 
/* Hacks to make syncing with GLIB code easier.  Every GLIB type or
   allocator name used by the imported code is mapped onto its plain C
   equivalent.  */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free

/* Return VAL from the enclosing function when EXPR is false.  The body is
   wrapped in do { } while (0) so that the expansion followed by a
   semicolon forms exactly one statement and therefore stays correct
   inside an unbraced if/else.  */
#define g_return_val_if_fail(expr,val)          \
  do                                            \
    {                                           \
      if (!(expr))                              \
        return (val);                           \
    }                                           \
  while (0)
58  |  |  | 
59  |  | /* Code from GLIB gmacros.h starts here. */  | 
60  |  |  | 
61  |  | /* GLIB - Library of useful routines for C programming  | 
62  |  |  * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald  | 
63  |  |  *  | 
64  |  |  * This library is free software; you can redistribute it and/or  | 
65  |  |  * modify it under the terms of the GNU Lesser General Public  | 
66  |  |  * License as published by the Free Software Foundation; either  | 
67  |  |  * version 2 of the License, or (at your option) any later version.  | 
68  |  |  *  | 
69  |  |  * This library is distributed in the hope that it will be useful,  | 
70  |  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
71  |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU  | 
72  |  |  * Lesser General Public License for more details.  | 
73  |  |  *  | 
74  |  |  * You should have received a copy of the GNU Lesser General Public  | 
75  |  |  * License along with this library; if not, write to the  | 
76  |  |  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,  | 
77  |  |  * Boston, MA 02111-1307, USA.  | 
78  |  |  */  | 
79  |  |  | 
/* Boolean constants, unless an earlier header already supplied them.  */
#if !defined (FALSE)
# define FALSE (0)
#endif

#if !defined (TRUE)
# define TRUE (!FALSE)
#endif

/* Number of elements of ARR, which must be a real array and not a
   pointer.  */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof (*(arr)))

/* A branch-prediction hint in GLIB; here it just yields EXPR.  */
#define G_UNLIKELY(expr) (expr)
91  |  |  | 
92  |  | /* Code from GLIB gunicode.h starts here. */  | 
93  |  |  | 
94  |  | /* gunicode.h - Unicode manipulation functions  | 
95  |  |  *  | 
96  |  |  *  Copyright (C) 1999, 2000 Tom Tromey  | 
97  |  |  *  Copyright 2000, 2005 Red Hat, Inc.  | 
98  |  |  *  | 
99  |  |  * The Gnome Library is free software; you can redistribute it and/or  | 
100  |  |  * modify it under the terms of the GNU Lesser General Public License as  | 
101  |  |  * published by the Free Software Foundation; either version 2 of the  | 
102  |  |  * License, or (at your option) any later version.  | 
103  |  |  *  | 
104  |  |  * The Gnome Library is distributed in the hope that it will be useful,  | 
105  |  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
106  |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU  | 
107  |  |  * Lesser General Public License for more details.  | 
108  |  |  *  | 
109  |  |  * You should have received a copy of the GNU Lesser General Public  | 
110  |  |  * License along with the Gnome Library; see the file COPYING.LIB.  If not,  | 
111  |  |  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,  | 
112  |  |  *   Boston, MA 02111-1307, USA.  | 
113  |  |  */  | 
114  |  |  | 
/* Normalization forms.  The GLIB names come first with their numeric
   values spelled out; each Unicode name (NFD, NFC, NFKD, NFKC) is an
   alias of the GLIB name listed on the same row.  */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_ALL_COMPOSE = 3,

  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
127  |  |  | 
/* Address of the character following the one at P: P advanced by the
   g_utf8_skip entry selected by the byte P points at.  */
#define g_utf8_next_char(p) (&(p)[g_utf8_skip[*(const guchar *) (p)]])
129  |  |  | 
130  |  | /* Code from GLIB gutf8.c starts here. */  | 
131  |  |  | 
132  |  | /* gutf8.c - Operations on UTF-8 strings.  | 
133  |  |  *  | 
134  |  |  * Copyright (C) 1999 Tom Tromey  | 
135  |  |  * Copyright (C) 2000 Red Hat, Inc.  | 
136  |  |  *  | 
137  |  |  * This library is free software; you can redistribute it and/or  | 
138  |  |  * modify it under the terms of the GNU Lesser General Public  | 
139  |  |  * License as published by the Free Software Foundation; either  | 
140  |  |  * version 2 of the License, or (at your option) any later version.  | 
141  |  |  *  | 
142  |  |  * This library is distributed in the hope that it will be useful,  | 
143  |  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
144  |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU  | 
145  |  |  * Lesser General Public License for more details.  | 
146  |  |  *  | 
147  |  |  * You should have received a copy of the GNU Lesser General Public  | 
148  |  |  * License along with this library; if not, write to the  | 
149  |  |  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,  | 
150  |  |  * Boston, MA 02111-1307, USA.  | 
151  |  |  */  | 
152  |  |  | 
/* Classify CHAR, the first byte of a UTF-8 sequence: store the sequence
   length (1 to 6) in LEN and the mask selecting the payload bits of that
   first byte in MASK.  When CHAR cannot start a sequence, LEN is set to
   -1 and MASK is left untouched.

   The expansion is wrapped in do { } while (0) and every argument is
   parenthesized, so "UTF8_COMPUTE (c, mask, len);" is one statement that
   is safe inside an unbraced if/else.  */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  do                                            \
    {                                           \
      if ((Char) < 128)                         \
        {                                       \
          (Len) = 1;                            \
          (Mask) = 0x7f;                        \
        }                                       \
      else if (((Char) & 0xe0) == 0xc0)         \
        {                                       \
          (Len) = 2;                            \
          (Mask) = 0x1f;                        \
        }                                       \
      else if (((Char) & 0xf0) == 0xe0)         \
        {                                       \
          (Len) = 3;                            \
          (Mask) = 0x0f;                        \
        }                                       \
      else if (((Char) & 0xf8) == 0xf0)         \
        {                                       \
          (Len) = 4;                            \
          (Mask) = 0x07;                        \
        }                                       \
      else if (((Char) & 0xfc) == 0xf8)         \
        {                                       \
          (Len) = 5;                            \
          (Mask) = 0x03;                        \
        }                                       \
      else if (((Char) & 0xfe) == 0xfc)         \
        {                                       \
          (Len) = 6;                            \
          (Mask) = 0x01;                        \
        }                                       \
      else                                      \
        (Len) = -1;                             \
    }                                           \
  while (0)
186  |  |  | 
/* Number of bytes (1 to 6) used to encode the code point CHAR in UTF-8.
   CHAR is evaluated once per threshold that is tested.  */
#define UTF8_LENGTH(Char)               \
  ((Char) < 0x80 ? 1                    \
   : (Char) < 0x800 ? 2                 \
   : (Char) < 0x10000 ? 3               \
   : (Char) < 0x200000 ? 4              \
   : (Char) < 0x4000000 ? 5             \
   : 6)
193  |  |  | 
/* Decode the LEN-byte UTF-8 sequence starting at CHARS into RESULT.  MASK
   selects the payload bits of the first byte and COUNT is a scratch loop
   counter.  If a byte that is not of the form 10xxxxxx is met where a
   continuation byte is expected, RESULT is set to -1 and decoding stops
   with COUNT indexing the offending byte.

   The expansion is wrapped in do { } while (0) so that the assignment
   and the loop together form one statement; the `break' only leaves the
   inner for loop.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len)               \
  do                                                            \
    {                                                           \
      (Result) = (Chars)[0] & (Mask);                           \
      for ((Count) = 1; (Count) < (Len); ++(Count))             \
        {                                                       \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                \
            {                                                   \
              (Result) = -1;                                    \
              break;                                            \
            }                                                   \
          (Result) <<= 6;                                       \
          (Result) |= ((Chars)[(Count)] & 0x3f);                \
        }                                                       \
    }                                                           \
  while (0)
206  |  |  | 
207  |  | static const gchar utf8_skip_data[256] = { | 
208  |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  | 
209  |  |   1, 1, 1, 1, 1, 1, 1,  | 
210  |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  | 
211  |  |   1, 1, 1, 1, 1, 1, 1,  | 
212  |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  | 
213  |  |   1, 1, 1, 1, 1, 1, 1,  | 
214  |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  | 
215  |  |   1, 1, 1, 1, 1, 1, 1,  | 
216  |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  | 
217  |  |   1, 1, 1, 1, 1, 1, 1,  | 
218  |  |   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  | 
219  |  |   1, 1, 1, 1, 1, 1, 1,  | 
220  |  |   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  | 
221  |  |   2, 2, 2, 2, 2, 2, 2,  | 
222  |  |   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,  | 
223  |  |   5, 5, 5, 6, 6, 1, 1  | 
224  |  | };  | 
225  |  |  | 
226  |  | static const gchar *const g_utf8_skip = utf8_skip_data;  | 
227  |  |  | 
228  |  | /*  | 
229  |  |  * g_utf8_strlen:  | 
230  |  |  * @p: pointer to the start of a UTF-8 encoded string  | 
231  |  |  * @max: the maximum number of bytes to examine. If @max  | 
232  |  |  *       is less than 0, then the string is assumed to be  | 
233  |  |  *       nul-terminated. If @max is 0, @p will not be examined and  | 
234  |  |  *       may be %NULL.  | 
235  |  |  *  | 
236  |  |  * Computes the length of the string in characters, not including  | 
237  |  |  * the terminating nul character.  | 
238  |  |  *  | 
239  |  |  * Return value: the length of the string in characters  | 
240  |  |  **/  | 
241  |  | static glong  | 
242  |  | g_utf8_strlen (const gchar * p)  | 
243  | 841k  | { | 
244  | 841k  |   glong len = 0;  | 
245  |  |  | 
246  | 841k  |   g_return_val_if_fail (p != NULL, 0);  | 
247  |  |  | 
248  | 3.39M  |   while (*p)  | 
249  | 2.55M  |     { | 
250  | 2.55M  |       p = g_utf8_next_char (p);  | 
251  | 2.55M  |       ++len;  | 
252  | 2.55M  |     }  | 
253  |  |  | 
254  | 841k  |   return len;  | 
255  | 841k  | }  | 
256  |  |  | 
257  |  | /*  | 
258  |  |  * g_utf8_get_char:  | 
259  |  |  * @p: a pointer to Unicode character encoded as UTF-8  | 
260  |  |  *  | 
261  |  |  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.  | 
262  |  |  * If @p does not point to a valid UTF-8 encoded character, results are  | 
263  |  |  * undefined. If you are not sure that the bytes are complete  | 
264  |  |  * valid Unicode characters, you should use g_utf8_get_char_validated()  | 
265  |  |  * instead.  | 
266  |  |  *  | 
267  |  |  * Return value: the resulting character  | 
268  |  |  **/  | 
269  |  | static gunichar  | 
270  |  | g_utf8_get_char (const gchar * p)  | 
271  | 15.9M  | { | 
272  | 15.9M  |   int i, mask = 0, len;  | 
273  | 15.9M  |   gunichar result;  | 
274  | 15.9M  |   unsigned char c = (unsigned char) *p;  | 
275  |  |  | 
276  | 15.9M  |   UTF8_COMPUTE (c, mask, len);  | 
277  | 15.9M  |   if (len == -1)  | 
278  | 0  |     return (gunichar) - 1;  | 
279  | 15.9M  |   UTF8_GET (result, p, i, mask, len);  | 
280  |  |  | 
281  | 15.9M  |   return result;  | 
282  | 15.9M  | }  | 
283  |  |  | 
284  |  | /*  | 
285  |  |  * g_unichar_to_utf8:  | 
286  |  |  * @c: a Unicode character code  | 
287  |  |  * @outbuf: output buffer, must have at least 6 bytes of space.  | 
288  |  |  *       If %NULL, the length will be computed and returned  | 
289  |  |  *       and nothing will be written to @outbuf.  | 
290  |  |  *  | 
291  |  |  * Converts a single character to UTF-8.  | 
292  |  |  *  | 
293  |  |  * Return value: number of bytes written  | 
294  |  |  **/  | 
295  |  | static int  | 
296  |  | g_unichar_to_utf8 (gunichar c, gchar * outbuf)  | 
297  | 19.7M  | { | 
298  |  |   /* If this gets modified, also update the copy in g_string_insert_unichar() */  | 
299  | 19.7M  |   guint len = 0;  | 
300  | 19.7M  |   int first;  | 
301  | 19.7M  |   int i;  | 
302  |  |  | 
303  | 19.7M  |   if (c < 0x80)  | 
304  | 638k  |     { | 
305  | 638k  |       first = 0;  | 
306  | 638k  |       len = 1;  | 
307  | 638k  |     }  | 
308  | 19.1M  |   else if (c < 0x800)  | 
309  | 18.8M  |     { | 
310  | 18.8M  |       first = 0xc0;  | 
311  | 18.8M  |       len = 2;  | 
312  | 18.8M  |     }  | 
313  | 300k  |   else if (c < 0x10000)  | 
314  | 287k  |     { | 
315  | 287k  |       first = 0xe0;  | 
316  | 287k  |       len = 3;  | 
317  | 287k  |     }  | 
318  | 12.6k  |   else if (c < 0x200000)  | 
319  | 12.6k  |     { | 
320  | 12.6k  |       first = 0xf0;  | 
321  | 12.6k  |       len = 4;  | 
322  | 12.6k  |     }  | 
323  | 0  |   else if (c < 0x4000000)  | 
324  | 0  |     { | 
325  | 0  |       first = 0xf8;  | 
326  | 0  |       len = 5;  | 
327  | 0  |     }  | 
328  | 0  |   else  | 
329  | 0  |     { | 
330  | 0  |       first = 0xfc;  | 
331  | 0  |       len = 6;  | 
332  | 0  |     }  | 
333  |  |  | 
334  | 19.7M  |   if (outbuf)  | 
335  | 19.7M  |     { | 
336  | 39.2M  |       for (i = len - 1; i > 0; --i)  | 
337  | 19.4M  |   { | 
338  | 19.4M  |     outbuf[i] = (c & 0x3f) | 0x80;  | 
339  | 19.4M  |     c >>= 6;  | 
340  | 19.4M  |   }  | 
341  | 19.7M  |       outbuf[0] = c | first;  | 
342  | 19.7M  |     }  | 
343  |  |  | 
344  | 19.7M  |   return len;  | 
345  | 19.7M  | }  | 
346  |  |  | 
347  |  | /*  | 
348  |  |  * g_utf8_to_ucs4_fast:  | 
349  |  |  * @str: a UTF-8 encoded string  | 
350  |  |  * @len: the maximum length of @str to use, in bytes. If @len < 0,  | 
351  |  |  *       then the string is nul-terminated.  | 
352  |  |  * @items_written: location to store the number of characters in the  | 
353  |  |  *                 result, or %NULL.  | 
354  |  |  *  | 
355  |  |  * Convert a string from UTF-8 to a 32-bit fixed width  | 
356  |  |  * representation as UCS-4, assuming valid UTF-8 input.  | 
357  |  |  * This function is roughly twice as fast as g_utf8_to_ucs4()  | 
358  |  |  * but does no error checking on the input. A trailing 0 character  | 
359  |  |  * will be added to the string after the converted text.  | 
360  |  |  *  | 
361  |  |  * Return value: a pointer to a newly allocated UCS-4 string.  | 
362  |  |  *               This value must be freed with g_free().  | 
363  |  |  **/  | 
364  |  | static gunichar *  | 
365  |  | g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)  | 
366  | 18.6M  | { | 
367  | 18.6M  |   gunichar *result;  | 
368  | 18.6M  |   gsize n_chars, i;  | 
369  | 18.6M  |   const gchar *p;  | 
370  |  |  | 
371  | 18.6M  |   g_return_val_if_fail (str != NULL, NULL);  | 
372  |  |  | 
373  | 18.6M  |   p = str;  | 
374  | 18.6M  |   n_chars = 0;  | 
375  | 18.6M  |   if (len < 0)  | 
376  | 18.6M  |     { | 
377  | 38.4M  |       while (*p)  | 
378  | 19.8M  |   { | 
379  | 19.8M  |     p = g_utf8_next_char (p);  | 
380  | 19.8M  |     ++n_chars;  | 
381  | 19.8M  |   }  | 
382  | 18.6M  |     }  | 
383  | 0  |   else  | 
384  | 0  |     { | 
385  | 0  |       while (p < str + len && *p)  | 
386  | 0  |   { | 
387  | 0  |     p = g_utf8_next_char (p);  | 
388  | 0  |     ++n_chars;  | 
389  | 0  |   }  | 
390  | 0  |     }  | 
391  |  |  | 
392  | 18.6M  |   result = g_malloc (sizeof (gunichar) * (n_chars + 1));  | 
393  | 18.6M  |   if (!result)  | 
394  | 0  |     return NULL;  | 
395  |  |  | 
396  | 18.6M  |   p = str;  | 
397  | 38.4M  |   for (i = 0; i < n_chars; i++)  | 
398  | 19.8M  |     { | 
399  | 19.8M  |       gunichar wc = (guchar) * p++;  | 
400  |  |  | 
401  | 19.8M  |       if (wc < 0x80)  | 
402  | 683k  |   { | 
403  | 683k  |     result[i] = wc;  | 
404  | 683k  |   }  | 
405  | 19.1M  |       else  | 
406  | 19.1M  |   { | 
407  | 19.1M  |     gunichar mask = 0x40;  | 
408  |  |  | 
409  | 19.1M  |     if (G_UNLIKELY ((wc & mask) == 0))  | 
410  | 0  |       { | 
411  |  |         /* It's an out-of-sequence 10xxxxxxx byte.  | 
412  |  |          * Rather than making an ugly hash of this and the next byte  | 
413  |  |          * and overrunning the buffer, it's more useful to treat it  | 
414  |  |          * with a replacement character */  | 
415  | 0  |         result[i] = 0xfffd;  | 
416  | 0  |         continue;  | 
417  | 0  |       }  | 
418  |  |  | 
419  | 19.1M  |     do  | 
420  | 19.4M  |       { | 
421  | 19.4M  |         wc <<= 6;  | 
422  | 19.4M  |         wc |= (guchar) (*p++) & 0x3f;  | 
423  | 19.4M  |         mask <<= 5;  | 
424  | 19.4M  |       }  | 
425  | 19.4M  |     while ((wc & mask) != 0);  | 
426  |  |  | 
427  | 19.1M  |     wc &= mask - 1;  | 
428  |  |  | 
429  | 19.1M  |     result[i] = wc;  | 
430  | 19.1M  |   }  | 
431  | 19.8M  |     }  | 
432  | 18.6M  |   result[i] = 0;  | 
433  |  |  | 
434  | 18.6M  |   if (items_written)  | 
435  | 12.5M  |     *items_written = i;  | 
436  |  |  | 
437  | 18.6M  |   return result;  | 
438  | 18.6M  | }  | 
439  |  |  | 
440  |  | /*  | 
441  |  |  * g_ucs4_to_utf8:  | 
442  |  |  * @str: a UCS-4 encoded string  | 
443  |  |  * @len: the maximum length (number of characters) of @str to use.  | 
444  |  |  *       If @len < 0, then the string is nul-terminated.  | 
445  |  |  * @items_read: location to store number of characters read, or %NULL.  | 
446  |  |  * @items_written: location to store number of bytes written or %NULL.  | 
447  |  |  *                 The value here stored does not include the trailing 0  | 
448  |  |  *                 byte.  | 
449  |  |  * @error: location to store the error occurring, or %NULL to ignore  | 
450  |  |  *         errors. Any of the errors in #GConvertError other than  | 
451  |  |  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.  | 
452  |  |  *  | 
453  |  |  * Convert a string from a 32-bit fixed width representation as UCS-4.  | 
454  |  |  * to UTF-8. The result will be terminated with a 0 byte.  | 
455  |  |  *  | 
456  |  |  * Return value: a pointer to a newly allocated UTF-8 string.  | 
457  |  |  *               This value must be freed with g_free(). If an  | 
458  |  |  *               error occurs, %NULL will be returned and  | 
459  |  |  *               @error set. In that case, @items_read will be  | 
460  |  |  *               set to the position of the first invalid input  | 
461  |  |  *               character.  | 
462  |  |  **/  | 
463  |  | static gchar *  | 
464  |  | g_ucs4_to_utf8 (const gunichar * str,  | 
465  |  |     glong len, glong * items_read, glong * items_written)  | 
466  | 18.6M  | { | 
467  | 18.6M  |   gint result_length;  | 
468  | 18.6M  |   gchar *result = NULL;  | 
469  | 18.6M  |   gchar *p;  | 
470  | 18.6M  |   gint i;  | 
471  |  |  | 
472  | 18.6M  |   result_length = 0;  | 
473  | 38.4M  |   for (i = 0; len < 0 || i < len; i++)  | 
474  | 19.7M  |     { | 
475  | 19.7M  |       if (!str[i])  | 
476  | 0  |   break;  | 
477  |  |  | 
478  | 19.7M  |       if (str[i] >= 0x80000000)  | 
479  | 0  |   goto err_out;  | 
480  |  |  | 
481  | 19.7M  |       result_length += UTF8_LENGTH (str[i]);  | 
482  | 19.7M  |     }  | 
483  |  |  | 
484  | 18.6M  |   result = g_malloc (result_length + 1);  | 
485  | 18.6M  |   if (!result)  | 
486  | 0  |     return NULL;  | 
487  | 18.6M  |   p = result;  | 
488  |  |  | 
489  | 18.6M  |   i = 0;  | 
490  | 38.4M  |   while (p < result + result_length)  | 
491  | 19.7M  |     p += g_unichar_to_utf8 (str[i++], p);  | 
492  |  |  | 
493  | 18.6M  |   *p = '\0';  | 
494  |  |  | 
495  | 18.6M  |   if (items_written)  | 
496  | 0  |     *items_written = p - result;  | 
497  |  |  | 
498  | 18.6M  | err_out:  | 
499  | 18.6M  |   if (items_read)  | 
500  | 0  |     *items_read = i;  | 
501  |  |  | 
502  | 18.6M  |   return result;  | 
503  | 18.6M  | }  | 
504  |  |  | 
505  |  | /* Code from GLIB gunidecomp.c starts here. */  | 
506  |  |  | 
507  |  | /* decomp.c - Character decomposition.  | 
508  |  |  *  | 
509  |  |  *  Copyright (C) 1999, 2000 Tom Tromey  | 
510  |  |  *  Copyright 2000 Red Hat, Inc.  | 
511  |  |  *  | 
512  |  |  * The Gnome Library is free software; you can redistribute it and/or  | 
513  |  |  * modify it under the terms of the GNU Lesser General Public License as  | 
514  |  |  * published by the Free Software Foundation; either version 2 of the  | 
515  |  |  * License, or (at your option) any later version.  | 
516  |  |  *  | 
517  |  |  * The Gnome Library is distributed in the hope that it will be useful,  | 
518  |  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
519  |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU  | 
520  |  |  * Lesser General Public License for more details.  | 
521  |  |  *  | 
522  |  |  * You should have received a copy of the GNU Lesser General Public  | 
523  |  |  * License along with the Gnome Library; see the file COPYING.LIB.  If not,  | 
524  |  |  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,  | 
525  |  |  *   Boston, MA 02111-1307, USA.  | 
526  |  |  */  | 
527  |  |  | 
528  |  | #include "gunidecomp.h"  | 
529  |  | #include "gunicomp.h"  | 
530  |  |  | 
/* Combining class found through the page table TABLE for the character
   with page number PAGE and low byte CHAR.  A TABLE entry at or above
   G_UNICODE_MAX_TABLE_INDEX stands for the class (entry minus that
   constant) shared by the whole page; any other entry selects a row of
   cclass_data that is indexed by CHAR.  */
#define CC_LOOKUP(Table, Page, Char)                            \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX)                 \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX)                \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char)                                    \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char)                                    \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of CHAR.  Characters up to G_UNICODE_LAST_CHAR_PART1
   use the part 1 table, characters from 0xe0000 to G_UNICODE_LAST_CHAR
   use the part 2 table, and everything else has class 0.  */
#define COMBINING_CLASS(Char)                                   \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1)                        \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff)                      \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR)      \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)       \
      : 0))
547  |  |  | 
/* constants for hangul syllable [de]composition: the first code point of
   the syllable, leading-consonant, vowel and trailing-consonant ranges,
   followed by the sizes of those ranges */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,
  SCount = LCount * NCount
};
558  |  |  | 
559  |  | /*  | 
560  |  |  * g_unicode_canonical_ordering:  | 
561  |  |  * @string: a UCS-4 encoded string.  | 
562  |  |  * @len: the maximum length of @string to use.  | 
563  |  |  *  | 
564  |  |  * Computes the canonical ordering of a string in-place.  | 
565  |  |  * This rearranges decomposed characters in the string  | 
566  |  |  * according to their combining classes.  See the Unicode  | 
567  |  |  * manual for more information.  | 
568  |  |  **/  | 
569  |  | static void  | 
570  |  | g_unicode_canonical_ordering (gunichar * string, gsize len)  | 
571  | 12.9M  | { | 
572  | 12.9M  |   gsize i;  | 
573  | 12.9M  |   int swap = 1;  | 
574  |  |  | 
575  | 25.9M  |   while (swap)  | 
576  | 12.9M  |     { | 
577  | 12.9M  |       int last;  | 
578  | 12.9M  |       swap = 0;  | 
579  | 12.9M  |       last = COMBINING_CLASS (string[0]);  | 
580  | 16.7M  |       for (i = 0; i < len - 1; ++i)  | 
581  | 3.83M  |   { | 
582  | 3.83M  |     int next = COMBINING_CLASS (string[i + 1]);  | 
583  | 3.83M  |     if (next != 0 && last > next)  | 
584  | 2.63k  |       { | 
585  | 2.63k  |         gsize j;  | 
586  |  |         /* Percolate item leftward through string.  */  | 
587  | 11.0k  |         for (j = i + 1; j > 0; --j)  | 
588  | 10.8k  |     { | 
589  | 10.8k  |       gunichar t;  | 
590  | 10.8k  |       if (COMBINING_CLASS (string[j - 1]) <= next)  | 
591  | 2.42k  |         break;  | 
592  | 8.42k  |       t = string[j];  | 
593  | 8.42k  |       string[j] = string[j - 1];  | 
594  | 8.42k  |       string[j - 1] = t;  | 
595  | 8.42k  |       swap = 1;  | 
596  | 8.42k  |     }  | 
597  |  |         /* We're re-entering the loop looking at the old  | 
598  |  |            character again.  */  | 
599  | 2.63k  |         next = last;  | 
600  | 2.63k  |       }  | 
601  | 3.83M  |     last = next;  | 
602  | 3.83M  |   }  | 
603  | 12.9M  |     }  | 
604  | 12.9M  | }  | 
605  |  |  | 
606  |  | /* http://www.unicode.org/unicode/reports/tr15/#Hangul  | 
607  |  |  * r should be null or have sufficient space. Calling with r == NULL will  | 
608  |  |  * only calculate the result_len; however, a buffer with space for three  | 
609  |  |  * characters will always be big enough. */  | 
610  |  | static void  | 
611  |  | decompose_hangul (gunichar s, gunichar * r, gsize * result_len)  | 
612  | 2.31k  | { | 
613  | 2.31k  |   gint SIndex = s - SBase;  | 
614  | 2.31k  |   gint TIndex = SIndex % TCount;  | 
615  |  |  | 
616  | 2.31k  |   if (r)  | 
617  | 1.15k  |     { | 
618  | 1.15k  |       r[0] = LBase + SIndex / NCount;  | 
619  | 1.15k  |       r[1] = VBase + (SIndex % NCount) / TCount;  | 
620  | 1.15k  |     }  | 
621  |  |  | 
622  | 2.31k  |   if (TIndex)  | 
623  | 1.80k  |     { | 
624  | 1.80k  |       if (r)  | 
625  | 901  |   r[2] = TBase + TIndex;  | 
626  | 1.80k  |       *result_len = 3;  | 
627  | 1.80k  |     }  | 
628  | 516  |   else  | 
629  | 516  |     *result_len = 2;  | 
630  | 2.31k  | }  | 
631  |  |  | 
632  |  | /* returns a pointer to a null-terminated UTF-8 string */  | 
633  |  | static const gchar *  | 
634  |  | find_decomposition (gunichar ch, gboolean compat)  | 
635  | 13.3M  | { | 
636  | 13.3M  |   int start = 0;  | 
637  | 13.3M  |   int end = G_N_ELEMENTS (decomp_table);  | 
638  |  |  | 
639  | 13.3M  |   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)  | 
640  | 12.9M  |     { | 
641  | 168M  |       while (TRUE)  | 
642  | 168M  |   { | 
643  | 168M  |     int half = (start + end) / 2;  | 
644  | 168M  |     if (ch == decomp_table[half].ch)  | 
645  | 1.68M  |       { | 
646  | 1.68M  |         int offset;  | 
647  |  |  | 
648  | 1.68M  |         if (compat)  | 
649  | 1.68M  |     { | 
650  | 1.68M  |       offset = decomp_table[half].compat_offset;  | 
651  | 1.68M  |       if (offset == G_UNICODE_NOT_PRESENT_OFFSET)  | 
652  | 1.14M  |         offset = decomp_table[half].canon_offset;  | 
653  | 1.68M  |     }  | 
654  | 0  |         else  | 
655  | 0  |     { | 
656  | 0  |       offset = decomp_table[half].canon_offset;  | 
657  | 0  |       if (offset == G_UNICODE_NOT_PRESENT_OFFSET)  | 
658  | 0  |         return NULL;  | 
659  | 0  |     }  | 
660  |  |  | 
661  | 1.68M  |         return &(decomp_expansion_string[offset]);  | 
662  | 1.68M  |       }  | 
663  | 167M  |     else if (half == start)  | 
664  | 11.3M  |       break;  | 
665  | 155M  |     else if (ch > decomp_table[half].ch)  | 
666  | 49.6M  |       start = half;  | 
667  | 106M  |     else  | 
668  | 106M  |       end = half;  | 
669  | 168M  |   }  | 
670  | 12.9M  |     }  | 
671  |  |  | 
672  | 11.6M  |   return NULL;  | 
673  | 13.3M  | }  | 
674  |  |  | 
675  |  | /* L,V => LV and LV,T => LVT  */  | 
676  |  | static gboolean  | 
677  |  | combine_hangul (gunichar a, gunichar b, gunichar * result)  | 
678  | 2.10M  | { | 
679  | 2.10M  |   if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)  | 
680  | 1.54k  |     { | 
681  | 1.54k  |       gint LIndex = a - LBase;  | 
682  | 1.54k  |       gint VIndex = b - VBase;  | 
683  |  |  | 
684  | 1.54k  |       *result = SBase + (LIndex * VCount + VIndex) * TCount;  | 
685  | 1.54k  |       return TRUE;  | 
686  | 1.54k  |     }  | 
687  |  |  | 
688  | 2.10M  |   if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)  | 
689  | 1.09k  |     { | 
690  | 1.09k  |       gint SIndex = a - SBase;  | 
691  |  |  | 
692  | 1.09k  |       if ((SIndex % TCount) == 0)  | 
693  | 901  |   { | 
694  | 901  |     gint TIndex = b - TBase;  | 
695  |  |  | 
696  | 901  |     *result = a + TIndex;  | 
697  | 901  |     return TRUE;  | 
698  | 901  |   }  | 
699  | 1.09k  |     }  | 
700  |  |  | 
701  | 2.10M  |   return FALSE;  | 
702  | 2.10M  | }  | 
703  |  |  | 
/* Composition index stored for the character with page number PAGE and
   low byte CHAR.  A compose_table entry at or above
   G_UNICODE_MAX_TABLE_INDEX stands for the index (entry minus that
   constant) shared by the whole page; any other entry selects a row of
   compose_data that is indexed by CHAR.  */
#define CI(Page, Char)                                          \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX)           \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX)          \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of CHAR, or 0 when its page lies beyond
   COMPOSE_TABLE_LAST.  CHAR is parenthesized at every use so that an
   expression argument such as "a | b" is shifted as a whole.  */
#define COMPOSE_INDEX(Char)                                     \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST))                       \
   ? 0                                                          \
   : CI ((Char) >> 8, (Char) & 0xff))
711  |  |  | 
712  |  | static gboolean  | 
713  |  | combine (gunichar a, gunichar b, gunichar * result)  | 
714  | 2.10M  | { | 
715  | 2.10M  |   gushort index_a, index_b;  | 
716  |  |  | 
717  | 2.10M  |   if (combine_hangul (a, b, result))  | 
718  | 2.45k  |     return TRUE;  | 
719  |  |  | 
720  | 2.10M  |   index_a = COMPOSE_INDEX (a);  | 
721  |  |  | 
722  | 2.10M  |   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)  | 
723  | 252k  |     { | 
724  | 252k  |       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])  | 
725  | 17.8k  |   { | 
726  | 17.8k  |     *result =  | 
727  | 17.8k  |       compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];  | 
728  | 17.8k  |     return TRUE;  | 
729  | 17.8k  |   }  | 
730  | 234k  |       else  | 
731  | 234k  |   return FALSE;  | 
732  | 252k  |     }  | 
733  |  |  | 
734  | 1.84M  |   index_b = COMPOSE_INDEX (b);  | 
735  |  |  | 
736  | 1.84M  |   if (index_b >= COMPOSE_SECOND_SINGLE_START)  | 
737  | 1.18k  |     { | 
738  | 1.18k  |       if (a ==  | 
739  | 1.18k  |     compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])  | 
740  | 741  |   { | 
741  | 741  |     *result =  | 
742  | 741  |       compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];  | 
743  | 741  |     return TRUE;  | 
744  | 741  |   }  | 
745  | 443  |       else  | 
746  | 443  |   return FALSE;  | 
747  | 1.18k  |     }  | 
748  |  |  | 
749  | 1.84M  |   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START  | 
750  | 1.84M  |       && index_b >= COMPOSE_SECOND_START  | 
751  | 1.84M  |       && index_b < COMPOSE_SECOND_SINGLE_START)  | 
752  | 561k  |     { | 
753  | 561k  |       gunichar res =  | 
754  | 561k  |   compose_array[index_a - COMPOSE_FIRST_START][index_b -  | 
755  | 561k  |                  COMPOSE_SECOND_START];  | 
756  |  |  | 
757  | 561k  |       if (res)  | 
758  | 560k  |   { | 
759  | 560k  |     *result = res;  | 
760  | 560k  |     return TRUE;  | 
761  | 560k  |   }  | 
762  | 561k  |     }  | 
763  |  |  | 
764  | 1.28M  |   return FALSE;  | 
765  | 1.84M  | }  | 
766  |  |  | 
767  |  | static gunichar *  | 
768  |  | _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)  | 
769  | 6.28M  | { | 
770  | 6.28M  |   gsize n_wc;  | 
771  | 6.28M  |   gunichar *wc_buffer;  | 
772  | 6.28M  |   const char *p;  | 
773  | 6.28M  |   gsize last_start;  | 
774  | 6.28M  |   gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);  | 
775  | 6.28M  |   gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);  | 
776  |  |  | 
777  | 6.28M  |   n_wc = 0;  | 
778  | 6.28M  |   p = str;  | 
779  | 12.9M  |   while ((max_len < 0 || p < str + max_len) && *p)  | 
780  | 6.68M  |     { | 
781  | 6.68M  |       const gchar *decomp;  | 
782  | 6.68M  |       gunichar wc = g_utf8_get_char (p);  | 
783  |  |  | 
784  | 6.68M  |       if (wc >= SBase && wc < SBase + SCount)  | 
785  | 1.15k  |   { | 
786  | 1.15k  |     gsize result_len;  | 
787  | 1.15k  |     decompose_hangul (wc, NULL, &result_len);  | 
788  | 1.15k  |     n_wc += result_len;  | 
789  | 1.15k  |   }  | 
790  | 6.68M  |       else  | 
791  | 6.68M  |   { | 
792  | 6.68M  |     decomp = find_decomposition (wc, do_compat);  | 
793  |  |  | 
794  | 6.68M  |     if (decomp)  | 
795  | 841k  |       n_wc += g_utf8_strlen (decomp);  | 
796  | 5.84M  |     else  | 
797  | 5.84M  |       n_wc++;  | 
798  | 6.68M  |   }  | 
799  |  |  | 
800  | 6.68M  |       p = g_utf8_next_char (p);  | 
801  | 6.68M  |     }  | 
802  |  |  | 
803  | 6.28M  |   wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));  | 
804  | 6.28M  |   if (!wc_buffer)  | 
805  | 0  |     return NULL;  | 
806  |  |  | 
807  | 6.28M  |   last_start = 0;  | 
808  | 6.28M  |   n_wc = 0;  | 
809  | 6.28M  |   p = str;  | 
810  | 12.9M  |   while ((max_len < 0 || p < str + max_len) && *p)  | 
811  | 6.68M  |     { | 
812  | 6.68M  |       gunichar wc = g_utf8_get_char (p);  | 
813  | 6.68M  |       const gchar *decomp;  | 
814  | 6.68M  |       int cc;  | 
815  | 6.68M  |       gsize old_n_wc = n_wc;  | 
816  |  |  | 
817  | 6.68M  |       if (wc >= SBase && wc < SBase + SCount)  | 
818  | 1.15k  |   { | 
819  | 1.15k  |     gsize result_len;  | 
820  | 1.15k  |     decompose_hangul (wc, wc_buffer + n_wc, &result_len);  | 
821  | 1.15k  |     n_wc += result_len;  | 
822  | 1.15k  |   }  | 
823  | 6.68M  |       else  | 
824  | 6.68M  |   { | 
825  | 6.68M  |     decomp = find_decomposition (wc, do_compat);  | 
826  |  |  | 
827  | 6.68M  |     if (decomp)  | 
828  | 841k  |       { | 
829  | 841k  |         const char *pd;  | 
830  | 3.39M  |         for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))  | 
831  | 2.55M  |     wc_buffer[n_wc++] = g_utf8_get_char (pd);  | 
832  | 841k  |       }  | 
833  | 5.84M  |     else  | 
834  | 5.84M  |       wc_buffer[n_wc++] = wc;  | 
835  | 6.68M  |   }  | 
836  |  |  | 
837  | 6.68M  |       if (n_wc > 0)  | 
838  | 6.68M  |   { | 
839  | 6.68M  |     cc = COMBINING_CLASS (wc_buffer[old_n_wc]);  | 
840  |  |  | 
841  | 6.68M  |     if (cc == 0)  | 
842  | 6.67M  |       { | 
843  | 6.67M  |         g_unicode_canonical_ordering (wc_buffer + last_start,  | 
844  | 6.67M  |               n_wc - last_start);  | 
845  | 6.67M  |         last_start = old_n_wc;  | 
846  | 6.67M  |       }  | 
847  | 6.68M  |   }  | 
848  |  |  | 
849  | 6.68M  |       p = g_utf8_next_char (p);  | 
850  | 6.68M  |     }  | 
851  |  |  | 
852  | 6.28M  |   if (n_wc > 0)  | 
853  | 6.28M  |     { | 
854  | 6.28M  |       g_unicode_canonical_ordering (wc_buffer + last_start,  | 
855  | 6.28M  |             n_wc - last_start);  | 
856  |  |       /* dead assignment: last_start = n_wc; */  | 
857  | 6.28M  |     }  | 
858  |  |  | 
859  | 6.28M  |   wc_buffer[n_wc] = 0;  | 
860  |  |  | 
861  |  |   /* All decomposed and reordered */  | 
862  |  |  | 
863  | 6.28M  |   if (do_compose && n_wc > 0)  | 
864  | 6.28M  |     { | 
865  | 6.28M  |       gsize i, j;  | 
866  | 6.28M  |       int last_cc = 0;  | 
867  | 6.28M  |       last_start = 0;  | 
868  |  |  | 
869  | 14.6M  |       for (i = 0; i < n_wc; i++)  | 
870  | 8.39M  |   { | 
871  | 8.39M  |     int cc = COMBINING_CLASS (wc_buffer[i]);  | 
872  |  |  | 
873  | 8.39M  |     if (i > 0 &&  | 
874  | 8.39M  |         (last_cc == 0 || last_cc != cc) &&  | 
875  | 8.39M  |         combine (wc_buffer[last_start], wc_buffer[i],  | 
876  | 2.10M  |            &wc_buffer[last_start]))  | 
877  | 581k  |       { | 
878  | 1.03M  |         for (j = i + 1; j < n_wc; j++)  | 
879  | 448k  |     wc_buffer[j - 1] = wc_buffer[j];  | 
880  | 581k  |         n_wc--;  | 
881  | 581k  |         i--;  | 
882  |  |  | 
883  | 581k  |         if (i == last_start)  | 
884  | 581k  |     last_cc = 0;  | 
885  | 622  |         else  | 
886  | 622  |     last_cc = COMBINING_CLASS (wc_buffer[i - 1]);  | 
887  |  |  | 
888  | 581k  |         continue;  | 
889  | 581k  |       }  | 
890  |  |  | 
891  | 7.81M  |     if (cc == 0)  | 
892  | 7.79M  |       last_start = i;  | 
893  |  |  | 
894  | 7.81M  |     last_cc = cc;  | 
895  | 7.81M  |   }  | 
896  | 6.28M  |     }  | 
897  |  |  | 
898  | 6.28M  |   wc_buffer[n_wc] = 0;  | 
899  |  |  | 
900  | 6.28M  |   return wc_buffer;  | 
901  | 6.28M  | }  | 
902  |  |  | 
903  |  | /*  | 
904  |  |  * g_utf8_normalize:  | 
905  |  |  * @str: a UTF-8 encoded string.  | 
906  |  |  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.  | 
907  |  |  * @mode: the type of normalization to perform.  | 
908  |  |  *  | 
909  |  |  * Converts a string into canonical form, standardizing  | 
910  |  |  * such issues as whether a character with an accent  | 
911  |  |  * is represented as a base character and combining  | 
912  |  |  * accent or as a single precomposed character. The  | 
913  |  |  * string has to be valid UTF-8, otherwise %NULL is  | 
914  |  |  * returned. You should generally call g_utf8_normalize()  | 
915  |  |  * before comparing two Unicode strings.  | 
916  |  |  *  | 
917  |  |  * The normalization mode %G_NORMALIZE_DEFAULT only  | 
918  |  |  * standardizes differences that do not affect the  | 
919  |  |  * text content, such as the above-mentioned accent  | 
920  |  |  * representation. %G_NORMALIZE_ALL also standardizes  | 
921  |  |  * the "compatibility" characters in Unicode, such  | 
922  |  |  * as SUPERSCRIPT THREE to the standard forms  | 
923  |  |  * (in this case DIGIT THREE). Formatting information  | 
924  |  |  * may be lost but for most text operations such  | 
925  |  |  * characters should be considered the same.  | 
926  |  |  *  | 
927  |  |  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE  | 
928  |  |  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,  | 
929  |  |  * but returned a result with composed forms rather  | 
930  |  |  * than a maximally decomposed form. This is often  | 
931  |  |  * useful if you intend to convert the string to  | 
932  |  |  * a legacy encoding or pass it to a system with  | 
933  |  |  * less capable Unicode handling.  | 
934  |  |  *  | 
935  |  |  * Return value: a newly allocated string, that is the  | 
936  |  |  *   normalized form of @str, or %NULL if @str is not  | 
937  |  |  *   valid UTF-8.  | 
938  |  |  **/  | 
939  |  | static gchar *  | 
940  |  | g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)  | 
941  | 0  | { | 
942  | 0  |   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);  | 
943  | 0  |   gchar *result = NULL;  | 
944  |  | 
  | 
945  | 0  |   if (result_wc)  | 
946  | 0  |     result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);  | 
947  |  | 
  | 
948  | 0  |   g_free (result_wc);  | 
949  |  | 
  | 
950  | 0  |   return result;  | 
951  | 0  | }  | 
952  |  |  | 
953  |  | /* Public Libidn API starts here. */  | 
954  |  |  | 
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to a Unicode character encoded as UTF-8
 *
 * Decodes the UTF-8 byte sequence starting at @p into a single Unicode
 * character.  No validation is performed: if @p does not point to a
 * valid UTF-8 encoded character, results are undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  const uint32_t decoded = g_utf8_get_char (p);

  return decoded;
}
970  |  |  | 
/**
 * stringprep_unichar_to_utf8:
 * @c: an ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Encodes a single character as UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int nbytes = g_unichar_to_utf8 (c, outbuf);

  return nbytes;
}
987  |  |  | 
988  |  | #include <unistr.h>  | 
989  |  |  | 
990  |  | /**  | 
991  |  |  * stringprep_utf8_to_ucs4:  | 
992  |  |  * @str: a UTF-8 encoded string  | 
993  |  |  * @len: the maximum length of @str to use. If @len < 0, then  | 
994  |  |  *       the string is nul-terminated.  | 
995  |  |  * @items_written: location to store the number of characters in the  | 
996  |  |  *                 result, or %NULL.  | 
997  |  |  *  | 
998  |  |  * Convert a string from UTF-8 to a 32-bit fixed width representation  | 
999  |  |  * as UCS-4.  The function now performs error checking to verify that  | 
1000  |  |  * the input is valid UTF-8 (before it was documented to not do error  | 
1001  |  |  * checking).  | 
1002  |  |  *  | 
1003  |  |  * Return value: a pointer to a newly allocated UCS-4 string.  | 
1004  |  |  *               This value must be deallocated by the caller.  | 
1005  |  |  **/  | 
1006  |  | uint32_t *  | 
1007  |  | stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)  | 
1008  | 18.6M  | { | 
1009  | 18.6M  |   size_t n;  | 
1010  |  |  | 
1011  | 18.6M  |   if (len < 0)  | 
1012  | 18.6M  |     n = strlen (str);  | 
1013  | 0  |   else  | 
1014  | 0  |     n = len;  | 
1015  |  |  | 
1016  | 18.6M  |   if (u8_check ((const uint8_t *) str, n))  | 
1017  | 2.02k  |     return NULL;  | 
1018  |  |  | 
1019  | 18.6M  |   return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);  | 
1020  | 18.6M  | }  | 
1021  |  |  | 
1022  |  | /**  | 
1023  |  |  * stringprep_ucs4_to_utf8:  | 
1024  |  |  * @str: a UCS-4 encoded string  | 
1025  |  |  * @len: the maximum length of @str to use. If @len < 0, then  | 
1026  |  |  *       the string is terminated with a 0 character.  | 
1027  |  |  * @items_read: location to store number of characters read read, or %NULL.  | 
1028  |  |  * @items_written: location to store number of bytes written or %NULL.  | 
1029  |  |  *                 The value here stored does not include the trailing 0  | 
1030  |  |  *                 byte.  | 
1031  |  |  *  | 
1032  |  |  * Convert a string from a 32-bit fixed width representation as UCS-4.  | 
1033  |  |  * to UTF-8. The result will be terminated with a 0 byte.  | 
1034  |  |  *  | 
1035  |  |  * Return value: a pointer to a newly allocated UTF-8 string.  | 
1036  |  |  *               This value must be deallocated by the caller.  | 
1037  |  |  *               If an error occurs, %NULL will be returned.  | 
1038  |  |  **/  | 
1039  |  | char *  | 
1040  |  | stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,  | 
1041  |  |        size_t *items_read, size_t *items_written)  | 
1042  | 18.6M  | { | 
1043  | 18.6M  |   return g_ucs4_to_utf8 (str, len, (glong *) items_read,  | 
1044  | 18.6M  |        (glong *) items_written);  | 
1045  | 18.6M  | }  | 
1046  |  |  | 
1047  |  | /**  | 
1048  |  |  * stringprep_utf8_nfkc_normalize:  | 
1049  |  |  * @str: a UTF-8 encoded string.  | 
1050  |  |  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.  | 
1051  |  |  *  | 
1052  |  |  * Converts a string into canonical form, standardizing  | 
1053  |  |  * such issues as whether a character with an accent  | 
1054  |  |  * is represented as a base character and combining  | 
1055  |  |  * accent or as a single precomposed character.  | 
1056  |  |  *  | 
1057  |  |  * The normalization mode is NFKC (ALL COMPOSE).  It standardizes  | 
1058  |  |  * differences that do not affect the text content, such as the  | 
1059  |  |  * above-mentioned accent representation. It standardizes the  | 
1060  |  |  * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to  | 
1061  |  |  * the standard forms (in this case DIGIT THREE). Formatting  | 
1062  |  |  * information may be lost but for most text operations such  | 
1063  |  |  * characters should be considered the same. It returns a result with  | 
1064  |  |  * composed forms rather than a maximally decomposed form.  | 
1065  |  |  *  | 
1066  |  |  * Return value: a newly allocated string, that is the  | 
1067  |  |  *   NFKC normalized form of @str.  | 
1068  |  |  **/  | 
1069  |  | char *  | 
1070  |  | stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)  | 
1071  | 0  | { | 
1072  | 0  |   size_t n;  | 
1073  |  | 
  | 
1074  | 0  |   if (len < 0)  | 
1075  | 0  |     n = strlen (str);  | 
1076  | 0  |   else  | 
1077  | 0  |     n = len;  | 
1078  |  | 
  | 
1079  | 0  |   if (u8_check ((const uint8_t *) str, n))  | 
1080  | 0  |     return NULL;  | 
1081  |  |  | 
1082  | 0  |   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);  | 
1083  | 0  | }  | 
1084  |  |  | 
1085  |  | #include <stdio.h>  | 
1086  |  | /**  | 
1087  |  |  * stringprep_ucs4_nfkc_normalize:  | 
1088  |  |  * @str: a Unicode string.  | 
1089  |  |  * @len: length of @str array, or -1 if @str is nul-terminated.  | 
1090  |  |  *  | 
1091  |  |  * Converts a UCS4 string into canonical form, see  | 
1092  |  |  * stringprep_utf8_nfkc_normalize() for more information.  | 
1093  |  |  *  | 
1094  |  |  * Return value: a newly allocated Unicode string, that is the NFKC  | 
1095  |  |  *   normalized form of @str.  | 
1096  |  |  **/  | 
1097  |  | uint32_t *  | 
1098  |  | stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)  | 
1099  | 6.28M  | { | 
1100  | 6.28M  |   char *p;  | 
1101  | 6.28M  |   uint32_t *result_wc;  | 
1102  |  |  | 
1103  | 6.28M  |   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);  | 
1104  | 6.28M  |   if (!p)  | 
1105  | 0  |     return NULL;  | 
1106  |  |  | 
1107  | 6.28M  |   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);  | 
1108  | 6.28M  |   free (p);  | 
1109  |  |  | 
1110  | 6.28M  |   return result_wc;  | 
1111  | 6.28M  | }  |