/src/glib/glib/ghostutils.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */  | 
2  |  |  | 
3  |  | /* GLIB - Library of useful routines for C programming  | 
4  |  |  * Copyright (C) 2008 Red Hat, Inc.  | 
5  |  |  *  | 
6  |  |  * SPDX-License-Identifier: LGPL-2.1-or-later  | 
7  |  |  *  | 
8  |  |  * This library is free software; you can redistribute it and/or  | 
9  |  |  * modify it under the terms of the GNU Lesser General Public  | 
10  |  |  * License as published by the Free Software Foundation; either  | 
11  |  |  * version 2.1 of the License, or (at your option) any later version.  | 
12  |  |  *  | 
13  |  |  * This library is distributed in the hope that it will be useful,  | 
14  |  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
15  |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU  | 
16  |  |  * Lesser General Public License for more details.  | 
17  |  |  *  | 
18  |  |  * You should have received a copy of the GNU Lesser General  | 
19  |  |  * Public License along with this library; if not, see <http://www.gnu.org/licenses/>.  | 
20  |  |  */  | 
21  |  |  | 
22  |  | #include "config.h"  | 
23  |  | #include "glibconfig.h"  | 
24  |  |  | 
25  |  | #include <string.h>  | 
26  |  |  | 
27  |  | #ifdef G_OS_UNIX  | 
28  |  | #include <unistd.h>  | 
29  |  | #endif  | 
30  |  |  | 
31  |  | #include "ghostutils.h"  | 
32  |  |  | 
33  |  | #include "garray.h"  | 
34  |  | #include "gmem.h"  | 
35  |  | #include "gstring.h"  | 
36  |  | #include "gstrfuncs.h"  | 
37  |  | #include "glibintl.h"  | 
38  |  |  | 
39  |  | #ifdef G_PLATFORM_WIN32  | 
40  |  | #include <windows.h>  | 
41  |  | #endif  | 
42  |  |  | 
43  |  |  | 
44  |  | /**  | 
45  |  |  * SECTION:ghostutils  | 
46  |  |  * @short_description: Internet hostname utilities  | 
47  |  |  *  | 
48  |  |  * Functions for manipulating internet hostnames; in particular, for  | 
49  |  |  * converting between Unicode and ASCII-encoded forms of  | 
50  |  |  * Internationalized Domain Names (IDNs).  | 
51  |  |  *  | 
52  |  |  * The  | 
53  |  |  * [Internationalized Domain Names for Applications (IDNA)](http://www.ietf.org/rfc/rfc3490.txt)  | 
54  |  |  * standards allow for the use  | 
55  |  |  * of Unicode domain names in applications, while providing  | 
56  |  |  * backward-compatibility with the old ASCII-only DNS, by defining an  | 
57  |  |  * ASCII-Compatible Encoding of any given Unicode name, which can be  | 
58  |  |  * used with non-IDN-aware applications and protocols. (For example,  | 
59  |  |  * "Παν語.org" maps to "xn--4wa8awb4637h.org".)  | 
60  |  |  **/  | 
61  |  |  | 
/* Prefix that marks a hostname label as ASCII-Compatible Encoding (ACE),
 * and the length of that prefix in bytes (not counting the trailing NUL).
 */
62  | 0  | #define IDNA_ACE_PREFIX     "xn--"  | 
63  | 0  | #define IDNA_ACE_PREFIX_LEN 4  | 
64  |  |  | 
65  |  | /* Punycode constants, from RFC 3492. */  | 
66  |  |  | 
67  | 0  | #define PUNYCODE_BASE          36  | 
68  | 0  | #define PUNYCODE_TMIN           1  | 
69  | 0  | #define PUNYCODE_TMAX          26  | 
70  | 0  | #define PUNYCODE_SKEW          38  | 
71  | 0  | #define PUNYCODE_DAMP         700  | 
72  | 0  | #define PUNYCODE_INITIAL_BIAS  72  | 
73  | 0  | #define PUNYCODE_INITIAL_N   0x80  | 
74  |  |  | 
/* Evaluates to true when @cp, converted to guint, is below 0x80. */
75  | 0  | #define PUNYCODE_IS_BASIC(cp) ((guint)(cp) < 0x80)  | 
76  |  |  | 
77  |  | /* Encode/decode a single base-36 digit */  | 
78  |  | static inline gchar  | 
79  |  | encode_digit (guint dig)  | 
80  | 0  | { | 
81  | 0  |   if (dig < 26)  | 
82  | 0  |     return dig + 'a';  | 
83  | 0  |   else  | 
84  | 0  |     return dig - 26 + '0';  | 
85  | 0  | }  | 
86  |  |  | 
87  |  | static inline guint  | 
88  |  | decode_digit (gchar dig)  | 
89  | 0  | { | 
90  | 0  |   if (dig >= 'A' && dig <= 'Z')  | 
91  | 0  |     return dig - 'A';  | 
92  | 0  |   else if (dig >= 'a' && dig <= 'z')  | 
93  | 0  |     return dig - 'a';  | 
94  | 0  |   else if (dig >= '0' && dig <= '9')  | 
95  | 0  |     return dig - '0' + 26;  | 
96  | 0  |   else  | 
97  | 0  |     return G_MAXUINT;  | 
98  | 0  | }  | 
99  |  |  | 
100  |  | /* Punycode bias adaptation algorithm, RFC 3492 section 6.1 */  | 
101  |  | static guint  | 
102  |  | adapt (guint    delta,  | 
103  |  |        guint    numpoints,  | 
104  |  |        gboolean firsttime)  | 
105  | 0  | { | 
106  | 0  |   guint k;  | 
107  |  | 
  | 
108  | 0  |   delta = firsttime ? delta / PUNYCODE_DAMP : delta / 2;  | 
109  | 0  |   delta += delta / numpoints;  | 
110  |  | 
  | 
111  | 0  |   k = 0;  | 
112  | 0  |   while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2)  | 
113  | 0  |     { | 
114  | 0  |       delta /= PUNYCODE_BASE - PUNYCODE_TMIN;  | 
115  | 0  |       k += PUNYCODE_BASE;  | 
116  | 0  |     }  | 
117  |  | 
  | 
118  | 0  |   return k + ((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta /  | 
119  | 0  |         (delta + PUNYCODE_SKEW));  | 
120  | 0  | }  | 
121  |  |  | 
122  |  | /* Punycode encoder, RFC 3492 section 6.3. The algorithm is  | 
123  |  |  * sufficiently bizarre that it's not really worth trying to explain  | 
124  |  |  * here.  | 
125  |  |  */  | 
126  |  | static gboolean  | 
127  |  | punycode_encode (const gchar *input_utf8,  | 
128  |  |                  gsize        input_utf8_length,  | 
129  |  |      GString     *output)  | 
130  | 0  | { | 
131  | 0  |   guint delta, handled_chars, num_basic_chars, bias, j, q, k, t, digit;  | 
132  | 0  |   gunichar n, m, *input;  | 
133  | 0  |   glong written_chars;  | 
134  | 0  |   gsize input_length;  | 
135  | 0  |   gboolean success = FALSE;  | 
136  |  |  | 
137  |  |   /* Convert from UTF-8 to Unicode code points */  | 
138  | 0  |   input = g_utf8_to_ucs4 (input_utf8, input_utf8_length, NULL,  | 
139  | 0  |         &written_chars, NULL);  | 
140  | 0  |   if (!input)  | 
141  | 0  |     return FALSE;  | 
142  |  |  | 
143  | 0  |   input_length = (gsize) (written_chars > 0 ? written_chars : 0);  | 
144  |  |  | 
145  |  |   /* Copy basic chars */  | 
146  | 0  |   for (j = num_basic_chars = 0; j < input_length; j++)  | 
147  | 0  |     { | 
148  | 0  |       if (PUNYCODE_IS_BASIC (input[j]))  | 
149  | 0  |   { | 
150  | 0  |     g_string_append_c (output, g_ascii_tolower (input[j]));  | 
151  | 0  |     num_basic_chars++;  | 
152  | 0  |   }  | 
153  | 0  |     }  | 
154  | 0  |   if (num_basic_chars)  | 
155  | 0  |     g_string_append_c (output, '-');  | 
156  |  | 
  | 
157  | 0  |   handled_chars = num_basic_chars;  | 
158  |  |  | 
159  |  |   /* Encode non-basic chars */  | 
160  | 0  |   delta = 0;  | 
161  | 0  |   bias = PUNYCODE_INITIAL_BIAS;  | 
162  | 0  |   n = PUNYCODE_INITIAL_N;  | 
163  | 0  |   while (handled_chars < input_length)  | 
164  | 0  |     { | 
165  |  |       /* let m = the minimum {non-basic} code point >= n in the input */ | 
166  | 0  |       for (m = G_MAXUINT, j = 0; j < input_length; j++)  | 
167  | 0  |   { | 
168  | 0  |     if (input[j] >= n && input[j] < m)  | 
169  | 0  |       m = input[j];  | 
170  | 0  |   }  | 
171  |  | 
  | 
172  | 0  |       if (m - n > (G_MAXUINT - delta) / (handled_chars + 1))  | 
173  | 0  |   goto fail;  | 
174  | 0  |       delta += (m - n) * (handled_chars + 1);  | 
175  | 0  |       n = m;  | 
176  |  | 
  | 
177  | 0  |       for (j = 0; j < input_length; j++)  | 
178  | 0  |   { | 
179  | 0  |     if (input[j] < n)  | 
180  | 0  |       { | 
181  | 0  |         if (++delta == 0)  | 
182  | 0  |     goto fail;  | 
183  | 0  |       }  | 
184  | 0  |     else if (input[j] == n)  | 
185  | 0  |       { | 
186  | 0  |         q = delta;  | 
187  | 0  |         for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE)  | 
188  | 0  |     { | 
189  | 0  |       if (k <= bias)  | 
190  | 0  |         t = PUNYCODE_TMIN;  | 
191  | 0  |       else if (k >= bias + PUNYCODE_TMAX)  | 
192  | 0  |         t = PUNYCODE_TMAX;  | 
193  | 0  |       else  | 
194  | 0  |         t = k - bias;  | 
195  | 0  |       if (q < t)  | 
196  | 0  |         break;  | 
197  | 0  |       digit = t + (q - t) % (PUNYCODE_BASE - t);  | 
198  | 0  |       g_string_append_c (output, encode_digit (digit));  | 
199  | 0  |       q = (q - t) / (PUNYCODE_BASE - t);  | 
200  | 0  |     }  | 
201  |  | 
  | 
202  | 0  |         g_string_append_c (output, encode_digit (q));  | 
203  | 0  |         bias = adapt (delta, handled_chars + 1, handled_chars == num_basic_chars);  | 
204  | 0  |         delta = 0;  | 
205  | 0  |         handled_chars++;  | 
206  | 0  |       }  | 
207  | 0  |   }  | 
208  |  |  | 
209  | 0  |       delta++;  | 
210  | 0  |       n++;  | 
211  | 0  |     }  | 
212  |  |  | 
213  | 0  |   success = TRUE;  | 
214  |  | 
  | 
215  | 0  |  fail:  | 
216  | 0  |   g_free (input);  | 
217  | 0  |   return success;  | 
218  | 0  | }  | 
219  |  |  | 
220  |  | /* From RFC 3454, Table B.1 */  | 
221  | 0  | #define idna_is_junk(ch) ((ch) == 0x00AD || (ch) == 0x1806 || (ch) == 0x200B || (ch) == 0x2060 || (ch) == 0xFEFF || (ch) == 0x034F || (ch) == 0x180B || (ch) == 0x180C || (ch) == 0x180D || (ch) == 0x200C || (ch) == 0x200D || ((ch) >= 0xFE00 && (ch) <= 0xFE0F))  | 
222  |  |  | 
223  |  | /* Scan @str for "junk" and return a cleaned-up string if any junk  | 
224  |  |  * is found. Else return %NULL.  | 
225  |  |  */  | 
226  |  | static gchar *  | 
227  |  | remove_junk (const gchar *str,  | 
228  |  |              gint         len)  | 
229  | 0  | { | 
230  | 0  |   GString *cleaned = NULL;  | 
231  | 0  |   const gchar *p;  | 
232  | 0  |   gunichar ch;  | 
233  |  | 
  | 
234  | 0  |   for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p))  | 
235  | 0  |     { | 
236  | 0  |       ch = g_utf8_get_char (p);  | 
237  | 0  |       if (idna_is_junk (ch))  | 
238  | 0  |   { | 
239  | 0  |     if (!cleaned)  | 
240  | 0  |       { | 
241  | 0  |         cleaned = g_string_new (NULL);  | 
242  | 0  |         g_string_append_len (cleaned, str, p - str);  | 
243  | 0  |       }  | 
244  | 0  |   }  | 
245  | 0  |       else if (cleaned)  | 
246  | 0  |   g_string_append_unichar (cleaned, ch);  | 
247  | 0  |     }  | 
248  |  | 
  | 
249  | 0  |   if (cleaned)  | 
250  | 0  |     return g_string_free (cleaned, FALSE);  | 
251  | 0  |   else  | 
252  | 0  |     return NULL;  | 
253  | 0  | }  | 
254  |  |  | 
255  |  | static inline gboolean  | 
256  |  | contains_uppercase_letters (const gchar *str,  | 
257  |  |                             gint         len)  | 
258  | 0  | { | 
259  | 0  |   const gchar *p;  | 
260  |  | 
  | 
261  | 0  |   for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p))  | 
262  | 0  |     { | 
263  | 0  |       if (g_unichar_isupper (g_utf8_get_char (p)))  | 
264  | 0  |   return TRUE;  | 
265  | 0  |     }  | 
266  | 0  |   return FALSE;  | 
267  | 0  | }  | 
268  |  |  | 
269  |  | static inline gboolean  | 
270  |  | contains_non_ascii (const gchar *str,  | 
271  |  |                     gint         len)  | 
272  | 0  | { | 
273  | 0  |   const gchar *p;  | 
274  |  | 
  | 
275  | 0  |   for (p = str; len == -1 ? *p : p < str + len; p++)  | 
276  | 0  |     { | 
277  | 0  |       if ((guchar)*p > 0x80)  | 
278  | 0  |   return TRUE;  | 
279  | 0  |     }  | 
280  | 0  |   return FALSE;  | 
281  | 0  | }  | 
282  |  |  | 
283  |  | /* RFC 3454, Appendix C. ish. */  | 
284  |  | static inline gboolean  | 
285  |  | idna_is_prohibited (gunichar ch)  | 
286  | 0  | { | 
287  | 0  |   switch (g_unichar_type (ch))  | 
288  | 0  |     { | 
289  | 0  |     case G_UNICODE_CONTROL:  | 
290  | 0  |     case G_UNICODE_FORMAT:  | 
291  | 0  |     case G_UNICODE_UNASSIGNED:  | 
292  | 0  |     case G_UNICODE_PRIVATE_USE:  | 
293  | 0  |     case G_UNICODE_SURROGATE:  | 
294  | 0  |     case G_UNICODE_LINE_SEPARATOR:  | 
295  | 0  |     case G_UNICODE_PARAGRAPH_SEPARATOR:  | 
296  | 0  |     case G_UNICODE_SPACE_SEPARATOR:  | 
297  | 0  |       return TRUE;  | 
298  |  |  | 
299  | 0  |     case G_UNICODE_OTHER_SYMBOL:  | 
300  | 0  |       if (ch == 0xFFFC || ch == 0xFFFD ||  | 
301  | 0  |     (ch >= 0x2FF0 && ch <= 0x2FFB))  | 
302  | 0  |   return TRUE;  | 
303  | 0  |       return FALSE;  | 
304  |  |  | 
305  | 0  |     case G_UNICODE_NON_SPACING_MARK:  | 
306  | 0  |       if (ch == 0x0340 || ch == 0x0341)  | 
307  | 0  |   return TRUE;  | 
308  | 0  |       return FALSE;  | 
309  |  |  | 
310  | 0  |     default:  | 
311  | 0  |       return FALSE;  | 
312  | 0  |     }  | 
313  | 0  | }  | 
314  |  |  | 
315  |  | /* RFC 3491 IDN cleanup algorithm. */  | 
316  |  | static gchar *  | 
317  |  | nameprep (const gchar *hostname,  | 
318  |  |           gint         len,  | 
319  |  |           gboolean    *is_unicode)  | 
320  | 0  | { | 
321  | 0  |   gchar *name, *tmp = NULL, *p;  | 
322  |  |  | 
323  |  |   /* It would be nice if we could do this without repeatedly  | 
324  |  |    * allocating strings and converting back and forth between  | 
325  |  |    * gunichars and UTF-8... The code does at least avoid doing most of  | 
326  |  |    * the sub-operations when they would just be equivalent to a  | 
327  |  |    * g_strdup().  | 
328  |  |    */  | 
329  |  |  | 
330  |  |   /* Remove presentation-only characters */  | 
331  | 0  |   name = remove_junk (hostname, len);  | 
332  | 0  |   if (name)  | 
333  | 0  |     { | 
334  | 0  |       tmp = name;  | 
335  | 0  |       len = -1;  | 
336  | 0  |     }  | 
337  | 0  |   else  | 
338  | 0  |     name = (gchar *)hostname;  | 
339  |  |  | 
340  |  |   /* Convert to lowercase */  | 
341  | 0  |   if (contains_uppercase_letters (name, len))  | 
342  | 0  |     { | 
343  | 0  |       name = g_utf8_strdown (name, len);  | 
344  | 0  |       g_free (tmp);  | 
345  | 0  |       tmp = name;  | 
346  | 0  |       len = -1;  | 
347  | 0  |     }  | 
348  |  |  | 
349  |  |   /* If there are no UTF8 characters, we're done. */  | 
350  | 0  |   if (!contains_non_ascii (name, len))  | 
351  | 0  |     { | 
352  | 0  |       *is_unicode = FALSE;  | 
353  | 0  |       if (name == (gchar *)hostname)  | 
354  | 0  |         return len == -1 ? g_strdup (hostname) : g_strndup (hostname, len);  | 
355  | 0  |       else  | 
356  | 0  |         return name;  | 
357  | 0  |     }  | 
358  |  |  | 
359  | 0  |   *is_unicode = TRUE;  | 
360  |  |  | 
361  |  |   /* Normalize */  | 
362  | 0  |   name = g_utf8_normalize (name, len, G_NORMALIZE_NFKC);  | 
363  | 0  |   g_free (tmp);  | 
364  | 0  |   tmp = name;  | 
365  |  | 
  | 
366  | 0  |   if (!name)  | 
367  | 0  |     return NULL;  | 
368  |  |  | 
369  |  |   /* KC normalization may have created more capital letters (eg,  | 
370  |  |    * angstrom -> capital A with ring). So we have to lowercasify a  | 
371  |  |    * second time. (This is more-or-less how the nameprep algorithm  | 
372  |  |    * does it. If tolower(nfkc(tolower(X))) is guaranteed to be the  | 
373  |  |    * same as tolower(nfkc(X)), then we could skip the first tolower,  | 
374  |  |    * but I'm not sure it is.)  | 
375  |  |    */  | 
376  | 0  |   if (contains_uppercase_letters (name, -1))  | 
377  | 0  |     { | 
378  | 0  |       name = g_utf8_strdown (name, -1);  | 
379  | 0  |       g_free (tmp);  | 
380  | 0  |       tmp = name;  | 
381  | 0  |     }  | 
382  |  |  | 
383  |  |   /* Check for prohibited characters */  | 
384  | 0  |   for (p = name; *p; p = g_utf8_next_char (p))  | 
385  | 0  |     { | 
386  | 0  |       if (idna_is_prohibited (g_utf8_get_char (p)))  | 
387  | 0  |   { | 
388  | 0  |     name = NULL;  | 
389  | 0  |           g_free (tmp);  | 
390  | 0  |     goto done;  | 
391  | 0  |   }  | 
392  | 0  |     }  | 
393  |  |  | 
394  |  |   /* FIXME: We're supposed to verify certain constraints on bidi  | 
395  |  |    * characters, but glib does not appear to have that information.  | 
396  |  |    */  | 
397  |  |  | 
398  | 0  |  done:  | 
399  | 0  |   return name;  | 
400  | 0  | }  | 
401  |  |  | 
402  |  | /* RFC 3490, section 3.1 says '.', 0x3002, 0xFF0E, and 0xFF61 count as  | 
403  |  |  * label-separating dots. @str must be '\0'-terminated.  | 
404  |  |  */  | 
405  | 0  | #define idna_is_dot(str) ( \  | 
406  | 0  |   ((guchar)(str)[0] == '.') ||                                                 \  | 
407  | 0  |   ((guchar)(str)[0] == 0xE3 && (guchar)(str)[1] == 0x80 && (guchar)(str)[2] == 0x82) || \  | 
408  | 0  |   ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBC && (guchar)(str)[2] == 0x8E) || \  | 
409  | 0  |   ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBD && (guchar)(str)[2] == 0xA1) )  | 
410  |  |  | 
411  |  | static const gchar *  | 
412  |  | idna_end_of_label (const gchar *str)  | 
413  | 0  | { | 
414  | 0  |   for (; *str; str = g_utf8_next_char (str))  | 
415  | 0  |     { | 
416  | 0  |       if (idna_is_dot (str))  | 
417  | 0  |         return str;  | 
418  | 0  |     }  | 
419  | 0  |   return str;  | 
420  | 0  | }  | 
421  |  |  | 
422  |  | static gsize  | 
423  |  | get_hostname_max_length_bytes (void)  | 
424  | 0  | { | 
425  |  | #if defined(G_OS_WIN32)  | 
426  |  |   wchar_t tmp[MAX_COMPUTERNAME_LENGTH];  | 
427  |  |   return sizeof (tmp) / sizeof (tmp[0]);  | 
428  |  | #elif defined(_SC_HOST_NAME_MAX)  | 
429  |  |   glong max = sysconf (_SC_HOST_NAME_MAX);  | 
430  | 0  |   if (max > 0)  | 
431  | 0  |     return (gsize) max;  | 
432  |  |  | 
433  | 0  | #ifdef HOST_NAME_MAX  | 
434  | 0  |   return HOST_NAME_MAX;  | 
435  |  | #else  | 
436  |  |   return _POSIX_HOST_NAME_MAX;  | 
437  |  | #endif /* HOST_NAME_MAX */  | 
438  |  | #else  | 
439  |  |   /* Fallback to some reasonable value  | 
440  |  |    * See https://stackoverflow.com/questions/8724954/what-is-the-maximum-number-of-characters-for-a-host-name-in-unix/28918017#28918017 */  | 
441  |  |   return 255;  | 
442  |  | #endif  | 
443  | 0  | }  | 
444  |  |  | 
445  |  | /* Returns %TRUE if `strlen (str) > comparison_length`, but without actually  | 
446  |  |  * running `strlen(str)`, as that would take a very long time for long  | 
447  |  |  * (untrusted) input strings. */  | 
448  |  | static gboolean  | 
449  |  | strlen_greater_than (const gchar *str,  | 
450  |  |                      gsize        comparison_length)  | 
451  | 0  | { | 
452  | 0  |   gsize i;  | 
453  |  | 
  | 
454  | 0  |   for (i = 0; str[i] != '\0'; i++)  | 
455  | 0  |     if (i > comparison_length)  | 
456  | 0  |       return TRUE;  | 
457  |  |  | 
458  | 0  |   return FALSE;  | 
459  | 0  | }  | 
460  |  |  | 
461  |  | /**  | 
462  |  |  * g_hostname_to_ascii:  | 
463  |  |  * @hostname: a valid UTF-8 or ASCII hostname  | 
464  |  |  *  | 
465  |  |  * Converts @hostname to its canonical ASCII form; an ASCII-only  | 
466  |  |  * string containing no uppercase letters and not ending with a  | 
467  |  |  * trailing dot.  | 
468  |  |  *  | 
469  |  |  * Returns: (nullable) (transfer full): an ASCII hostname, which must be freed,  | 
470  |  |  *    or %NULL if @hostname is in some way invalid.  | 
471  |  |  *  | 
472  |  |  * Since: 2.22  | 
473  |  |  **/  | 
474  |  | gchar *  | 
475  |  | g_hostname_to_ascii (const gchar *hostname)  | 
476  | 0  | { | 
477  | 0  |   gchar *name, *label, *p;  | 
478  | 0  |   GString *out;  | 
479  | 0  |   gssize llen, oldlen;  | 
480  | 0  |   gboolean unicode;  | 
481  | 0  |   gsize hostname_max_length_bytes = get_hostname_max_length_bytes ();  | 
482  |  |  | 
483  |  |   /* Do an initial check on the hostname length, as overlong hostnames take a  | 
484  |  |    * long time in the IDN cleanup algorithm in nameprep(). The ultimate  | 
485  |  |    * restriction is that the IDN-decoded (i.e. pure ASCII) hostname cannot be  | 
486  |  |    * longer than 255 bytes. That’s the least restrictive limit on hostname  | 
487  |  |    * length of all the ways hostnames can be interpreted. Typically, the  | 
488  |  |    * hostname will be an FQDN, which is limited to 253 bytes long. POSIX  | 
489  |  |    * hostnames are limited to `get_hostname_max_length_bytes()` (typically 255  | 
490  |  |    * bytes).  | 
491  |  |    *  | 
492  |  |    * See https://stackoverflow.com/a/28918017/2931197  | 
493  |  |    *  | 
494  |  |    * It’s possible for a hostname to be %-encoded, in which case its decoded  | 
495  |  |    * length will be as much as 3× shorter.  | 
496  |  |    *  | 
497  |  |    * It’s also possible for a hostname to use overlong UTF-8 encodings, in which  | 
498  |  |    * case its decoded length will be as much as 4× shorter.  | 
499  |  |    *  | 
500  |  |    * Note: This check is not intended as an absolute guarantee that a hostname  | 
501  |  |    * is the right length and will be accepted by other systems. It’s intended to  | 
502  |  |    * stop wildly-invalid hostnames from taking forever in nameprep().  | 
503  |  |    */  | 
504  | 0  |   if (hostname_max_length_bytes <= G_MAXSIZE / 4 &&  | 
505  | 0  |       strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes)))  | 
506  | 0  |     return NULL;  | 
507  |  |  | 
508  | 0  |   label = name = nameprep (hostname, -1, &unicode);  | 
509  | 0  |   if (!name || !unicode)  | 
510  | 0  |     return name;  | 
511  |  |  | 
512  | 0  |   out = g_string_new (NULL);  | 
513  |  | 
  | 
514  | 0  |   do  | 
515  | 0  |     { | 
516  | 0  |       unicode = FALSE;  | 
517  | 0  |       for (p = label; *p && !idna_is_dot (p); p++)  | 
518  | 0  |   { | 
519  | 0  |     if ((guchar)*p > 0x80)  | 
520  | 0  |       unicode = TRUE;  | 
521  | 0  |   }  | 
522  |  | 
  | 
523  | 0  |       oldlen = out->len;  | 
524  | 0  |       llen = p - label;  | 
525  | 0  |       if (unicode)  | 
526  | 0  |   { | 
527  | 0  |           if (!strncmp (label, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))  | 
528  | 0  |             goto fail;  | 
529  |  |  | 
530  | 0  |     g_string_append (out, IDNA_ACE_PREFIX);  | 
531  | 0  |     if (!punycode_encode (label, llen, out))  | 
532  | 0  |       goto fail;  | 
533  | 0  |   }  | 
534  | 0  |       else  | 
535  | 0  |         g_string_append_len (out, label, llen);  | 
536  |  |  | 
537  | 0  |       if (out->len - oldlen > 63)  | 
538  | 0  |   goto fail;  | 
539  |  |  | 
540  | 0  |       label += llen;  | 
541  | 0  |       if (*label)  | 
542  | 0  |         label = g_utf8_next_char (label);  | 
543  | 0  |       if (*label)  | 
544  | 0  |         g_string_append_c (out, '.');  | 
545  | 0  |     }  | 
546  | 0  |   while (*label);  | 
547  |  |  | 
548  | 0  |   g_free (name);  | 
549  | 0  |   return g_string_free (out, FALSE);  | 
550  |  |  | 
551  | 0  |  fail:  | 
552  | 0  |   g_free (name);  | 
553  | 0  |   g_string_free (out, TRUE);  | 
554  | 0  |   return NULL;  | 
555  | 0  | }  | 
556  |  |  | 
557  |  | /**  | 
558  |  |  * g_hostname_is_non_ascii:  | 
559  |  |  * @hostname: a hostname  | 
560  |  |  *  | 
561  |  |  * Tests if @hostname contains Unicode characters. If this returns  | 
562  |  |  * %TRUE, you need to encode the hostname with g_hostname_to_ascii()  | 
563  |  |  * before using it in non-IDN-aware contexts.  | 
564  |  |  *  | 
565  |  |  * Note that a hostname might contain a mix of encoded and unencoded  | 
566  |  |  * segments, and so it is possible for g_hostname_is_non_ascii() and  | 
567  |  |  * g_hostname_is_ascii_encoded() to both return %TRUE for a name.  | 
568  |  |  *  | 
569  |  |  * Returns: %TRUE if @hostname contains any non-ASCII characters  | 
570  |  |  *  | 
571  |  |  * Since: 2.22  | 
572  |  |  **/  | 
573  |  | gboolean  | 
574  |  | g_hostname_is_non_ascii (const gchar *hostname)  | 
575  | 0  | { | 
576  | 0  |   return contains_non_ascii (hostname, -1);  | 
577  | 0  | }  | 
578  |  |  | 
579  |  | /* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(),  | 
580  |  |  * read the RFC if you want to understand what this is actually doing.  | 
581  |  |  */  | 
582  |  | static gboolean  | 
583  |  | punycode_decode (const gchar *input,  | 
584  |  |                  gsize        input_length,  | 
585  |  |                  GString     *output)  | 
586  | 0  | { | 
587  | 0  |   GArray *output_chars;  | 
588  | 0  |   gunichar n;  | 
589  | 0  |   guint i, bias;  | 
590  | 0  |   guint oldi, w, k, digit, t;  | 
591  | 0  |   const gchar *split;  | 
592  |  | 
  | 
593  | 0  |   n = PUNYCODE_INITIAL_N;  | 
594  | 0  |   i = 0;  | 
595  | 0  |   bias = PUNYCODE_INITIAL_BIAS;  | 
596  |  | 
  | 
597  | 0  |   split = input + input_length - 1;  | 
598  | 0  |   while (split > input && *split != '-')  | 
599  | 0  |     split--;  | 
600  | 0  |   if (split > input)  | 
601  | 0  |     { | 
602  | 0  |       output_chars = g_array_sized_new (FALSE, FALSE, sizeof (gunichar),  | 
603  | 0  |           split - input);  | 
604  | 0  |       input_length -= (split - input) + 1;  | 
605  | 0  |       while (input < split)  | 
606  | 0  |   { | 
607  | 0  |     gunichar ch = (gunichar)*input++;  | 
608  | 0  |     if (!PUNYCODE_IS_BASIC (ch))  | 
609  | 0  |       goto fail;  | 
610  | 0  |     g_array_append_val (output_chars, ch);  | 
611  | 0  |   }  | 
612  | 0  |       input++;  | 
613  | 0  |     }  | 
614  | 0  |   else  | 
615  | 0  |     output_chars = g_array_new (FALSE, FALSE, sizeof (gunichar));  | 
616  |  |  | 
617  | 0  |   while (input_length)  | 
618  | 0  |     { | 
619  | 0  |       oldi = i;  | 
620  | 0  |       w = 1;  | 
621  | 0  |       for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE)  | 
622  | 0  |   { | 
623  | 0  |     if (!input_length--)  | 
624  | 0  |       goto fail;  | 
625  | 0  |     digit = decode_digit (*input++);  | 
626  | 0  |     if (digit >= PUNYCODE_BASE)  | 
627  | 0  |       goto fail;  | 
628  | 0  |     if (digit > (G_MAXUINT - i) / w)  | 
629  | 0  |       goto fail;  | 
630  | 0  |     i += digit * w;  | 
631  | 0  |     if (k <= bias)  | 
632  | 0  |       t = PUNYCODE_TMIN;  | 
633  | 0  |     else if (k >= bias + PUNYCODE_TMAX)  | 
634  | 0  |       t = PUNYCODE_TMAX;  | 
635  | 0  |     else  | 
636  | 0  |       t = k - bias;  | 
637  | 0  |     if (digit < t)  | 
638  | 0  |       break;  | 
639  | 0  |     if (w > G_MAXUINT / (PUNYCODE_BASE - t))  | 
640  | 0  |       goto fail;  | 
641  | 0  |     w *= (PUNYCODE_BASE - t);  | 
642  | 0  |   }  | 
643  |  |  | 
644  | 0  |       bias = adapt (i - oldi, output_chars->len + 1, oldi == 0);  | 
645  |  | 
  | 
646  | 0  |       if (i / (output_chars->len + 1) > G_MAXUINT - n)  | 
647  | 0  |   goto fail;  | 
648  | 0  |       n += i / (output_chars->len + 1);  | 
649  | 0  |       i %= (output_chars->len + 1);  | 
650  |  | 
  | 
651  | 0  |       g_array_insert_val (output_chars, i++, n);  | 
652  | 0  |     }  | 
653  |  |  | 
654  | 0  |   for (i = 0; i < output_chars->len; i++)  | 
655  | 0  |     g_string_append_unichar (output, g_array_index (output_chars, gunichar, i));  | 
656  | 0  |   g_array_free (output_chars, TRUE);  | 
657  | 0  |   return TRUE;  | 
658  |  |  | 
659  | 0  |  fail:  | 
660  | 0  |   g_array_free (output_chars, TRUE);  | 
661  | 0  |   return FALSE;  | 
662  | 0  | }  | 
663  |  |  | 
664  |  | /**  | 
665  |  |  * g_hostname_to_unicode:  | 
666  |  |  * @hostname: a valid UTF-8 or ASCII hostname  | 
667  |  |  *  | 
668  |  |  * Converts @hostname to its canonical presentation form; a UTF-8  | 
669  |  |  * string in Unicode normalization form C, containing no uppercase  | 
670  |  |  * letters, no forbidden characters, and no ASCII-encoded segments,  | 
671  |  |  * and not ending with a trailing dot.  | 
672  |  |  *  | 
673  |  |  * Of course if @hostname is not an internationalized hostname, then  | 
674  |  |  * the canonical presentation form will be entirely ASCII.  | 
675  |  |  *  | 
676  |  |  * Returns: (nullable) (transfer full): a UTF-8 hostname, which must be freed,  | 
677  |  |  *    or %NULL if @hostname is in some way invalid.  | 
678  |  |  *  | 
679  |  |  * Since: 2.22  | 
680  |  |  **/  | 
681  |  | gchar *  | 
682  |  | g_hostname_to_unicode (const gchar *hostname)  | 
683  | 0  | { | 
684  | 0  |   GString *out;  | 
685  | 0  |   gssize llen;  | 
686  | 0  |   gsize hostname_max_length_bytes = get_hostname_max_length_bytes ();  | 
687  |  |  | 
688  |  |   /* See the comment at the top of g_hostname_to_ascii(). */  | 
689  | 0  |   if (hostname_max_length_bytes <= G_MAXSIZE / 4 &&  | 
690  | 0  |       strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes)))  | 
691  | 0  |     return NULL;  | 
692  |  |  | 
693  | 0  |   out = g_string_new (NULL);  | 
694  |  | 
  | 
695  | 0  |   do  | 
696  | 0  |     { | 
697  | 0  |       llen = idna_end_of_label (hostname) - hostname;  | 
698  | 0  |       if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))  | 
699  | 0  |   { | 
700  | 0  |     hostname += IDNA_ACE_PREFIX_LEN;  | 
701  | 0  |     llen -= IDNA_ACE_PREFIX_LEN;  | 
702  | 0  |     if (!punycode_decode (hostname, llen, out))  | 
703  | 0  |       { | 
704  | 0  |         g_string_free (out, TRUE);  | 
705  | 0  |         return NULL;  | 
706  | 0  |       }  | 
707  | 0  |   }  | 
708  | 0  |       else  | 
709  | 0  |         { | 
710  | 0  |           gboolean unicode;  | 
711  | 0  |           gchar *canonicalized = nameprep (hostname, llen, &unicode);  | 
712  |  | 
  | 
713  | 0  |           if (!canonicalized)  | 
714  | 0  |             { | 
715  | 0  |               g_string_free (out, TRUE);  | 
716  | 0  |               return NULL;  | 
717  | 0  |             }  | 
718  | 0  |           g_string_append (out, canonicalized);  | 
719  | 0  |           g_free (canonicalized);  | 
720  | 0  |         }  | 
721  |  |  | 
722  | 0  |       hostname += llen;  | 
723  | 0  |       if (*hostname)  | 
724  | 0  |         hostname = g_utf8_next_char (hostname);  | 
725  | 0  |       if (*hostname)  | 
726  | 0  |         g_string_append_c (out, '.');  | 
727  | 0  |     }  | 
728  | 0  |   while (*hostname);  | 
729  |  |  | 
730  | 0  |   return g_string_free (out, FALSE);  | 
731  | 0  | }  | 
732  |  |  | 
733  |  | /**  | 
734  |  |  * g_hostname_is_ascii_encoded:  | 
735  |  |  * @hostname: a hostname  | 
736  |  |  *  | 
737  |  |  * Tests if @hostname contains segments with an ASCII-compatible  | 
738  |  |  * encoding of an Internationalized Domain Name. If this returns  | 
739  |  |  * %TRUE, you should decode the hostname with g_hostname_to_unicode()  | 
740  |  |  * before displaying it to the user.  | 
741  |  |  *  | 
742  |  |  * Note that a hostname might contain a mix of encoded and unencoded  | 
743  |  |  * segments, and so it is possible for g_hostname_is_non_ascii() and  | 
744  |  |  * g_hostname_is_ascii_encoded() to both return %TRUE for a name.  | 
745  |  |  *  | 
746  |  |  * Returns: %TRUE if @hostname contains any ASCII-encoded  | 
747  |  |  * segments.  | 
748  |  |  *  | 
749  |  |  * Since: 2.22  | 
750  |  |  **/  | 
751  |  | gboolean  | 
752  |  | g_hostname_is_ascii_encoded (const gchar *hostname)  | 
753  | 0  | { | 
754  | 0  |   while (1)  | 
755  | 0  |     { | 
756  | 0  |       if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))  | 
757  | 0  |   return TRUE;  | 
758  | 0  |       hostname = idna_end_of_label (hostname);  | 
759  | 0  |       if (*hostname)  | 
760  | 0  |         hostname = g_utf8_next_char (hostname);  | 
761  | 0  |       if (!*hostname)  | 
762  | 0  |   return FALSE;  | 
763  | 0  |     }  | 
764  | 0  | }  | 
765  |  |  | 
766  |  | /**  | 
767  |  |  * g_hostname_is_ip_address:  | 
768  |  |  * @hostname: a hostname (or IP address in string form)  | 
769  |  |  *  | 
770  |  |  * Tests if @hostname is the string form of an IPv4 or IPv6 address.  | 
771  |  |  * (Eg, "192.168.0.1".)  | 
772  |  |  *  | 
773  |  |  * Since 2.66, IPv6 addresses with a zone-id are accepted (RFC6874).  | 
774  |  |  *  | 
775  |  |  * Returns: %TRUE if @hostname is an IP address  | 
776  |  |  *  | 
777  |  |  * Since: 2.22  | 
778  |  |  **/  | 
779  |  | gboolean  | 
780  |  | g_hostname_is_ip_address (const gchar *hostname)  | 
781  | 0  | { | 
782  | 0  |   gchar *p, *end;  | 
783  | 0  |   gint nsegments, octet;  | 
784  |  |  | 
785  |  |   /* On Linux we could implement this using inet_pton, but the Windows  | 
786  |  |    * equivalent of that requires linking against winsock, so we just  | 
787  |  |    * figure this out ourselves. Tested by tests/hostutils.c.  | 
788  |  |    */  | 
789  |  | 
  | 
790  | 0  |   p = (char *)hostname;  | 
791  |  | 
  | 
792  | 0  |   if (strchr (p, ':'))  | 
793  | 0  |     { | 
794  | 0  |       gboolean skipped;  | 
795  |  |  | 
796  |  |       /* If it contains a ':', it's an IPv6 address (assuming it's an  | 
797  |  |        * IP address at all). This consists of eight ':'-separated  | 
798  |  |        * segments, each containing a 1-4 digit hex number, except that  | 
799  |  |        * optionally: (a) the last two segments can be replaced by an  | 
800  |  |        * IPv4 address, and (b) a single span of 1 to 8 "0000" segments  | 
801  |  |        * can be replaced with just "::".  | 
802  |  |        */  | 
803  |  | 
  | 
804  | 0  |       nsegments = 0;  | 
805  | 0  |       skipped = FALSE;  | 
806  | 0  |       while (*p && *p != '%' && nsegments < 8)  | 
807  | 0  |         { | 
808  |  |           /* Each segment after the first must be preceded by a ':'.  | 
809  |  |            * (We also handle half of the "string starts with ::" case  | 
810  |  |            * here.)  | 
811  |  |            */  | 
812  | 0  |           if (p != (char *)hostname || (p[0] == ':' && p[1] == ':'))  | 
813  | 0  |             { | 
814  | 0  |               if (*p != ':')  | 
815  | 0  |                 return FALSE;  | 
816  | 0  |               p++;  | 
817  | 0  |             }  | 
818  |  |  | 
819  |  |           /* If there's another ':', it means we're skipping some segments */  | 
820  | 0  |           if (*p == ':' && !skipped)  | 
821  | 0  |             { | 
822  | 0  |               skipped = TRUE;  | 
823  | 0  |               nsegments++;  | 
824  |  |  | 
825  |  |               /* Handle the "string ends with ::" case */  | 
826  | 0  |               if (!p[1])  | 
827  | 0  |                 p++;  | 
828  |  | 
  | 
829  | 0  |               continue;  | 
830  | 0  |             }  | 
831  |  |  | 
832  |  |           /* Read the segment, make sure it's valid. */  | 
833  | 0  |           for (end = p; g_ascii_isxdigit (*end); end++)  | 
834  | 0  |             ;  | 
835  | 0  |           if (end == p || end > p + 4)  | 
836  | 0  |             return FALSE;  | 
837  |  |  | 
838  | 0  |           if (*end == '.')  | 
839  | 0  |             { | 
840  | 0  |               if ((nsegments == 6 && !skipped) || (nsegments <= 6 && skipped))  | 
841  | 0  |                 goto parse_ipv4;  | 
842  | 0  |               else  | 
843  | 0  |                 return FALSE;  | 
844  | 0  |             }  | 
845  |  |  | 
846  | 0  |           nsegments++;  | 
847  | 0  |           p = end;  | 
848  | 0  |         }  | 
849  |  |  | 
850  | 0  |       return (!*p || (p[0] == '%' && p[1])) && (nsegments == 8 || skipped);  | 
851  | 0  |     }  | 
852  |  |  | 
853  | 0  |  parse_ipv4:  | 
854  |  |  | 
855  |  |   /* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */  | 
856  | 0  |   for (nsegments = 0; nsegments < 4; nsegments++)  | 
857  | 0  |     { | 
858  | 0  |       if (nsegments != 0)  | 
859  | 0  |         { | 
860  | 0  |           if (*p != '.')  | 
861  | 0  |             return FALSE;  | 
862  | 0  |           p++;  | 
863  | 0  |         }  | 
864  |  |  | 
865  |  |       /* Check the segment; a little tricker than the IPv6 case since  | 
866  |  |        * we can't allow extra leading 0s, and we can't assume that all  | 
867  |  |        * strings of valid length are within range.  | 
868  |  |        */  | 
869  | 0  |       octet = 0;  | 
870  | 0  |       if (*p == '0')  | 
871  | 0  |         end = p + 1;  | 
872  | 0  |       else  | 
873  | 0  |         { | 
874  | 0  |           for (end = p; g_ascii_isdigit (*end); end++)  | 
875  | 0  |             { | 
876  | 0  |               octet = 10 * octet + (*end - '0');  | 
877  |  | 
  | 
878  | 0  |               if (octet > 255)  | 
879  | 0  |                 break;  | 
880  | 0  |             }  | 
881  | 0  |         }  | 
882  | 0  |       if (end == p || end > p + 3 || octet > 255)  | 
883  | 0  |         return FALSE;  | 
884  |  |  | 
885  | 0  |       p = end;  | 
886  | 0  |     }  | 
887  |  |  | 
888  |  |   /* If there's nothing left to parse, then it's ok. */  | 
889  | 0  |   return !*p;  | 
890  | 0  | }  |