/src/glib-2.80.0/glib/ghostutils.c
Line | Count | Source |
1 | | /* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */ |
2 | | |
3 | | /* GLIB - Library of useful routines for C programming |
4 | | * Copyright (C) 2008 Red Hat, Inc. |
5 | | * |
6 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
7 | | * |
8 | | * This library is free software; you can redistribute it and/or |
9 | | * modify it under the terms of the GNU Lesser General Public |
10 | | * License as published by the Free Software Foundation; either |
11 | | * version 2.1 of the License, or (at your option) any later version. |
12 | | * |
13 | | * This library is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 | | * Lesser General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU Lesser General |
19 | | * Public License along with this library; if not, see <http://www.gnu.org/licenses/>. |
20 | | */ |
21 | | |
22 | | #include "config.h" |
23 | | #include "glibconfig.h" |
24 | | |
25 | | #include <string.h> |
26 | | |
27 | | #ifdef G_OS_UNIX |
28 | | #include <unistd.h> |
29 | | #endif |
30 | | |
31 | | #include "ghostutils.h" |
32 | | |
33 | | #include "garray.h" |
34 | | #include "gmem.h" |
35 | | #include "gstring.h" |
36 | | #include "gstrfuncs.h" |
37 | | #include "glibintl.h" |
38 | | |
39 | | #ifdef G_PLATFORM_WIN32 |
40 | | #include <windows.h> |
41 | | #endif |
42 | | |
43 | | |
/* Prefix that marks a label as ASCII-compatible-encoded, and its length
 * in bytes (not counting the terminating NUL).
 */
#define IDNA_ACE_PREFIX     "xn--"
#define IDNA_ACE_PREFIX_LEN 4

/* Punycode constants, from RFC 3492. */

#define PUNYCODE_BASE         36
#define PUNYCODE_TMIN         1
#define PUNYCODE_TMAX         26
#define PUNYCODE_SKEW         38
#define PUNYCODE_DAMP         700
#define PUNYCODE_INITIAL_BIAS 72
#define PUNYCODE_INITIAL_N    0x80

/* A "basic" code point is one whose value is below 0x80. */
#define PUNYCODE_IS_BASIC(cp) ((guint)(cp) < 0x80)
58 | | |
59 | | /* Encode/decode a single base-36 digit */ |
60 | | static inline gchar |
61 | | encode_digit (guint dig) |
62 | 0 | { |
63 | 0 | if (dig < 26) |
64 | 0 | return dig + 'a'; |
65 | 0 | else |
66 | 0 | return dig - 26 + '0'; |
67 | 0 | } |
68 | | |
69 | | static inline guint |
70 | | decode_digit (gchar dig) |
71 | 0 | { |
72 | 0 | if (dig >= 'A' && dig <= 'Z') |
73 | 0 | return dig - 'A'; |
74 | 0 | else if (dig >= 'a' && dig <= 'z') |
75 | 0 | return dig - 'a'; |
76 | 0 | else if (dig >= '0' && dig <= '9') |
77 | 0 | return dig - '0' + 26; |
78 | 0 | else |
79 | 0 | return G_MAXUINT; |
80 | 0 | } |
81 | | |
82 | | /* Punycode bias adaptation algorithm, RFC 3492 section 6.1 */ |
83 | | static guint |
84 | | adapt (guint delta, |
85 | | guint numpoints, |
86 | | gboolean firsttime) |
87 | 0 | { |
88 | 0 | guint k; |
89 | |
|
90 | 0 | delta = firsttime ? delta / PUNYCODE_DAMP : delta / 2; |
91 | 0 | delta += delta / numpoints; |
92 | |
|
93 | 0 | k = 0; |
94 | 0 | while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2) |
95 | 0 | { |
96 | 0 | delta /= PUNYCODE_BASE - PUNYCODE_TMIN; |
97 | 0 | k += PUNYCODE_BASE; |
98 | 0 | } |
99 | |
|
100 | 0 | return k + ((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta / |
101 | 0 | (delta + PUNYCODE_SKEW)); |
102 | 0 | } |
103 | | |
104 | | /* Punycode encoder, RFC 3492 section 6.3. The algorithm is |
105 | | * sufficiently bizarre that it's not really worth trying to explain |
106 | | * here. |
107 | | */ |
108 | | static gboolean |
109 | | punycode_encode (const gchar *input_utf8, |
110 | | gsize input_utf8_length, |
111 | | GString *output) |
112 | 0 | { |
113 | 0 | guint delta, handled_chars, num_basic_chars, bias, j, q, k, t, digit; |
114 | 0 | gunichar n, m, *input; |
115 | 0 | glong written_chars; |
116 | 0 | gsize input_length; |
117 | 0 | gboolean success = FALSE; |
118 | | |
119 | | /* Convert from UTF-8 to Unicode code points */ |
120 | 0 | input = g_utf8_to_ucs4 (input_utf8, input_utf8_length, NULL, |
121 | 0 | &written_chars, NULL); |
122 | 0 | if (!input) |
123 | 0 | return FALSE; |
124 | | |
125 | 0 | input_length = (gsize) (written_chars > 0 ? written_chars : 0); |
126 | | |
127 | | /* Copy basic chars */ |
128 | 0 | for (j = num_basic_chars = 0; j < input_length; j++) |
129 | 0 | { |
130 | 0 | if (PUNYCODE_IS_BASIC (input[j])) |
131 | 0 | { |
132 | 0 | g_string_append_c (output, g_ascii_tolower (input[j])); |
133 | 0 | num_basic_chars++; |
134 | 0 | } |
135 | 0 | } |
136 | 0 | if (num_basic_chars) |
137 | 0 | g_string_append_c (output, '-'); |
138 | |
|
139 | 0 | handled_chars = num_basic_chars; |
140 | | |
141 | | /* Encode non-basic chars */ |
142 | 0 | delta = 0; |
143 | 0 | bias = PUNYCODE_INITIAL_BIAS; |
144 | 0 | n = PUNYCODE_INITIAL_N; |
145 | 0 | while (handled_chars < input_length) |
146 | 0 | { |
147 | | /* let m = the minimum {non-basic} code point >= n in the input */ |
148 | 0 | for (m = G_MAXUINT, j = 0; j < input_length; j++) |
149 | 0 | { |
150 | 0 | if (input[j] >= n && input[j] < m) |
151 | 0 | m = input[j]; |
152 | 0 | } |
153 | |
|
154 | 0 | if (m - n > (G_MAXUINT - delta) / (handled_chars + 1)) |
155 | 0 | goto fail; |
156 | 0 | delta += (m - n) * (handled_chars + 1); |
157 | 0 | n = m; |
158 | |
|
159 | 0 | for (j = 0; j < input_length; j++) |
160 | 0 | { |
161 | 0 | if (input[j] < n) |
162 | 0 | { |
163 | 0 | if (++delta == 0) |
164 | 0 | goto fail; |
165 | 0 | } |
166 | 0 | else if (input[j] == n) |
167 | 0 | { |
168 | 0 | q = delta; |
169 | 0 | for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE) |
170 | 0 | { |
171 | 0 | if (k <= bias) |
172 | 0 | t = PUNYCODE_TMIN; |
173 | 0 | else if (k >= bias + PUNYCODE_TMAX) |
174 | 0 | t = PUNYCODE_TMAX; |
175 | 0 | else |
176 | 0 | t = k - bias; |
177 | 0 | if (q < t) |
178 | 0 | break; |
179 | 0 | digit = t + (q - t) % (PUNYCODE_BASE - t); |
180 | 0 | g_string_append_c (output, encode_digit (digit)); |
181 | 0 | q = (q - t) / (PUNYCODE_BASE - t); |
182 | 0 | } |
183 | |
|
184 | 0 | g_string_append_c (output, encode_digit (q)); |
185 | 0 | bias = adapt (delta, handled_chars + 1, handled_chars == num_basic_chars); |
186 | 0 | delta = 0; |
187 | 0 | handled_chars++; |
188 | 0 | } |
189 | 0 | } |
190 | | |
191 | 0 | delta++; |
192 | 0 | n++; |
193 | 0 | } |
194 | | |
195 | 0 | success = TRUE; |
196 | |
|
197 | 0 | fail: |
198 | 0 | g_free (input); |
199 | 0 | return success; |
200 | 0 | } |
201 | | |
/* From RFC 3454, Table B.1. These are the code points that
 * remove_junk() drops from a string. @ch is evaluated several times,
 * so it must not have side effects.
 */
#define idna_is_junk(ch) ( \
  (ch) == 0x00AD || \
  (ch) == 0x034F || \
  (ch) == 0x1806 || \
  ((ch) >= 0x180B && (ch) <= 0x180D) || \
  ((ch) >= 0x200B && (ch) <= 0x200D) || \
  (ch) == 0x2060 || \
  ((ch) >= 0xFE00 && (ch) <= 0xFE0F) || \
  (ch) == 0xFEFF )
204 | | |
205 | | /* Scan @str for "junk" and return a cleaned-up string if any junk |
206 | | * is found. Else return %NULL. |
207 | | */ |
208 | | static gchar * |
209 | | remove_junk (const gchar *str, |
210 | | gint len) |
211 | 0 | { |
212 | 0 | GString *cleaned = NULL; |
213 | 0 | const gchar *p; |
214 | 0 | gunichar ch; |
215 | |
|
216 | 0 | for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p)) |
217 | 0 | { |
218 | 0 | ch = g_utf8_get_char (p); |
219 | 0 | if (idna_is_junk (ch)) |
220 | 0 | { |
221 | 0 | if (!cleaned) |
222 | 0 | { |
223 | 0 | cleaned = g_string_new (NULL); |
224 | 0 | g_string_append_len (cleaned, str, p - str); |
225 | 0 | } |
226 | 0 | } |
227 | 0 | else if (cleaned) |
228 | 0 | g_string_append_unichar (cleaned, ch); |
229 | 0 | } |
230 | |
|
231 | 0 | if (cleaned) |
232 | 0 | return g_string_free (cleaned, FALSE); |
233 | 0 | else |
234 | 0 | return NULL; |
235 | 0 | } |
236 | | |
237 | | static inline gboolean |
238 | | contains_uppercase_letters (const gchar *str, |
239 | | gint len) |
240 | 0 | { |
241 | 0 | const gchar *p; |
242 | |
|
243 | 0 | for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p)) |
244 | 0 | { |
245 | 0 | if (g_unichar_isupper (g_utf8_get_char (p))) |
246 | 0 | return TRUE; |
247 | 0 | } |
248 | 0 | return FALSE; |
249 | 0 | } |
250 | | |
251 | | static inline gboolean |
252 | | contains_non_ascii (const gchar *str, |
253 | | gint len) |
254 | 0 | { |
255 | 0 | const gchar *p; |
256 | |
|
257 | 0 | for (p = str; len == -1 ? *p : p < str + len; p++) |
258 | 0 | { |
259 | 0 | if ((guchar)*p > 0x80) |
260 | 0 | return TRUE; |
261 | 0 | } |
262 | 0 | return FALSE; |
263 | 0 | } |
264 | | |
265 | | /* RFC 3454, Appendix C. ish. */ |
266 | | static inline gboolean |
267 | | idna_is_prohibited (gunichar ch) |
268 | 0 | { |
269 | 0 | switch (g_unichar_type (ch)) |
270 | 0 | { |
271 | 0 | case G_UNICODE_CONTROL: |
272 | 0 | case G_UNICODE_FORMAT: |
273 | 0 | case G_UNICODE_UNASSIGNED: |
274 | 0 | case G_UNICODE_PRIVATE_USE: |
275 | 0 | case G_UNICODE_SURROGATE: |
276 | 0 | case G_UNICODE_LINE_SEPARATOR: |
277 | 0 | case G_UNICODE_PARAGRAPH_SEPARATOR: |
278 | 0 | case G_UNICODE_SPACE_SEPARATOR: |
279 | 0 | return TRUE; |
280 | | |
281 | 0 | case G_UNICODE_OTHER_SYMBOL: |
282 | 0 | if (ch == 0xFFFC || ch == 0xFFFD || |
283 | 0 | (ch >= 0x2FF0 && ch <= 0x2FFB)) |
284 | 0 | return TRUE; |
285 | 0 | return FALSE; |
286 | | |
287 | 0 | case G_UNICODE_NON_SPACING_MARK: |
288 | 0 | if (ch == 0x0340 || ch == 0x0341) |
289 | 0 | return TRUE; |
290 | 0 | return FALSE; |
291 | | |
292 | 0 | default: |
293 | 0 | return FALSE; |
294 | 0 | } |
295 | 0 | } |
296 | | |
297 | | /* RFC 3491 IDN cleanup algorithm. */ |
298 | | static gchar * |
299 | | nameprep (const gchar *hostname, |
300 | | gint len, |
301 | | gboolean *is_unicode) |
302 | 0 | { |
303 | 0 | gchar *name, *tmp = NULL, *p; |
304 | | |
305 | | /* It would be nice if we could do this without repeatedly |
306 | | * allocating strings and converting back and forth between |
307 | | * gunichars and UTF-8... The code does at least avoid doing most of |
308 | | * the sub-operations when they would just be equivalent to a |
309 | | * g_strdup(). |
310 | | */ |
311 | | |
312 | | /* Remove presentation-only characters */ |
313 | 0 | name = remove_junk (hostname, len); |
314 | 0 | if (name) |
315 | 0 | { |
316 | 0 | tmp = name; |
317 | 0 | len = -1; |
318 | 0 | } |
319 | 0 | else |
320 | 0 | name = (gchar *)hostname; |
321 | | |
322 | | /* Convert to lowercase */ |
323 | 0 | if (contains_uppercase_letters (name, len)) |
324 | 0 | { |
325 | 0 | name = g_utf8_strdown (name, len); |
326 | 0 | g_free (tmp); |
327 | 0 | tmp = name; |
328 | 0 | len = -1; |
329 | 0 | } |
330 | | |
331 | | /* If there are no UTF8 characters, we're done. */ |
332 | 0 | if (!contains_non_ascii (name, len)) |
333 | 0 | { |
334 | 0 | *is_unicode = FALSE; |
335 | 0 | if (name == (gchar *)hostname) |
336 | 0 | return len == -1 ? g_strdup (hostname) : g_strndup (hostname, len); |
337 | 0 | else |
338 | 0 | return name; |
339 | 0 | } |
340 | | |
341 | 0 | *is_unicode = TRUE; |
342 | | |
343 | | /* Normalize */ |
344 | 0 | name = g_utf8_normalize (name, len, G_NORMALIZE_NFKC); |
345 | 0 | g_free (tmp); |
346 | 0 | tmp = name; |
347 | |
|
348 | 0 | if (!name) |
349 | 0 | return NULL; |
350 | | |
351 | | /* KC normalization may have created more capital letters (eg, |
352 | | * angstrom -> capital A with ring). So we have to lowercasify a |
353 | | * second time. (This is more-or-less how the nameprep algorithm |
354 | | * does it. If tolower(nfkc(tolower(X))) is guaranteed to be the |
355 | | * same as tolower(nfkc(X)), then we could skip the first tolower, |
356 | | * but I'm not sure it is.) |
357 | | */ |
358 | 0 | if (contains_uppercase_letters (name, -1)) |
359 | 0 | { |
360 | 0 | name = g_utf8_strdown (name, -1); |
361 | 0 | g_free (tmp); |
362 | 0 | tmp = name; |
363 | 0 | } |
364 | | |
365 | | /* Check for prohibited characters */ |
366 | 0 | for (p = name; *p; p = g_utf8_next_char (p)) |
367 | 0 | { |
368 | 0 | if (idna_is_prohibited (g_utf8_get_char (p))) |
369 | 0 | { |
370 | 0 | name = NULL; |
371 | 0 | g_free (tmp); |
372 | 0 | goto done; |
373 | 0 | } |
374 | 0 | } |
375 | | |
376 | | /* FIXME: We're supposed to verify certain constraints on bidi |
377 | | * characters, but glib does not appear to have that information. |
378 | | */ |
379 | | |
380 | 0 | done: |
381 | 0 | return name; |
382 | 0 | } |
383 | | |
/* RFC 3490, section 3.1 says '.', 0x3002, 0xFF0E, and 0xFF61 count as
 * label-separating dots. The macro matches '.' directly and the other
 * three by their UTF-8 byte sequences (E3 80 82, EF BC 8E, EF BD A1).
 * @str must be '\0'-terminated, and is evaluated more than once.
 */
#define idna_is_dot(str) ( \
  ((guchar)(str)[0] == '.') || \
  ((guchar)(str)[0] == 0xE3 && (guchar)(str)[1] == 0x80 && (guchar)(str)[2] == 0x82) || \
  ((guchar)(str)[0] == 0xEF && \
   (((guchar)(str)[1] == 0xBC && (guchar)(str)[2] == 0x8E) || \
    ((guchar)(str)[1] == 0xBD && (guchar)(str)[2] == 0xA1))) )
392 | | |
393 | | static const gchar * |
394 | | idna_end_of_label (const gchar *str) |
395 | 0 | { |
396 | 0 | for (; *str; str = g_utf8_next_char (str)) |
397 | 0 | { |
398 | 0 | if (idna_is_dot (str)) |
399 | 0 | return str; |
400 | 0 | } |
401 | 0 | return str; |
402 | 0 | } |
403 | | |
404 | | static gsize |
405 | | get_hostname_max_length_bytes (void) |
406 | 0 | { |
407 | | #if defined(G_OS_WIN32) |
408 | | wchar_t tmp[MAX_COMPUTERNAME_LENGTH]; |
409 | | return sizeof (tmp) / sizeof (tmp[0]); |
410 | | #elif defined(_SC_HOST_NAME_MAX) |
411 | 0 | glong max = sysconf (_SC_HOST_NAME_MAX); |
412 | 0 | if (max > 0) |
413 | 0 | return (gsize) max; |
414 | | |
415 | 0 | #ifdef HOST_NAME_MAX |
416 | 0 | return HOST_NAME_MAX; |
417 | | #else |
418 | | return _POSIX_HOST_NAME_MAX; |
419 | | #endif /* HOST_NAME_MAX */ |
420 | | #else |
421 | | /* Fallback to some reasonable value |
422 | | * See https://stackoverflow.com/questions/8724954/what-is-the-maximum-number-of-characters-for-a-host-name-in-unix/28918017#28918017 */ |
423 | | return 255; |
424 | | #endif |
425 | 0 | } |
426 | | |
427 | | /* Returns %TRUE if `strlen (str) > comparison_length`, but without actually |
428 | | * running `strlen(str)`, as that would take a very long time for long |
429 | | * (untrusted) input strings. */ |
430 | | static gboolean |
431 | | strlen_greater_than (const gchar *str, |
432 | | gsize comparison_length) |
433 | 0 | { |
434 | 0 | gsize i; |
435 | |
|
436 | 0 | for (i = 0; str[i] != '\0'; i++) |
437 | 0 | if (i > comparison_length) |
438 | 0 | return TRUE; |
439 | | |
440 | 0 | return FALSE; |
441 | 0 | } |
442 | | |
443 | | /** |
444 | | * g_hostname_to_ascii: |
445 | | * @hostname: a valid UTF-8 or ASCII hostname |
446 | | * |
447 | | * Converts @hostname to its canonical ASCII form; an ASCII-only |
448 | | * string containing no uppercase letters and not ending with a |
449 | | * trailing dot. |
450 | | * |
451 | | * Returns: (nullable) (transfer full): an ASCII hostname, which must be freed, |
452 | | * or %NULL if @hostname is in some way invalid. |
453 | | * |
454 | | * Since: 2.22 |
455 | | **/ |
456 | | gchar * |
457 | | g_hostname_to_ascii (const gchar *hostname) |
458 | 0 | { |
459 | 0 | gchar *name, *label, *p; |
460 | 0 | GString *out; |
461 | 0 | gssize llen, oldlen; |
462 | 0 | gboolean unicode; |
463 | 0 | gsize hostname_max_length_bytes = get_hostname_max_length_bytes (); |
464 | | |
465 | | /* Do an initial check on the hostname length, as overlong hostnames take a |
466 | | * long time in the IDN cleanup algorithm in nameprep(). The ultimate |
467 | | * restriction is that the IDN-decoded (i.e. pure ASCII) hostname cannot be |
468 | | * longer than 255 bytes. That’s the least restrictive limit on hostname |
469 | | * length of all the ways hostnames can be interpreted. Typically, the |
470 | | * hostname will be an FQDN, which is limited to 253 bytes long. POSIX |
471 | | * hostnames are limited to `get_hostname_max_length_bytes()` (typically 255 |
472 | | * bytes). |
473 | | * |
474 | | * See https://stackoverflow.com/a/28918017/2931197 |
475 | | * |
476 | | * It’s possible for a hostname to be %-encoded, in which case its decoded |
477 | | * length will be as much as 3× shorter. |
478 | | * |
479 | | * It’s also possible for a hostname to use overlong UTF-8 encodings, in which |
480 | | * case its decoded length will be as much as 4× shorter. |
481 | | * |
482 | | * Note: This check is not intended as an absolute guarantee that a hostname |
483 | | * is the right length and will be accepted by other systems. It’s intended to |
484 | | * stop wildly-invalid hostnames from taking forever in nameprep(). |
485 | | */ |
486 | 0 | if (hostname_max_length_bytes <= G_MAXSIZE / 4 && |
487 | 0 | strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes))) |
488 | 0 | return NULL; |
489 | | |
490 | 0 | label = name = nameprep (hostname, -1, &unicode); |
491 | 0 | if (!name || !unicode) |
492 | 0 | return name; |
493 | | |
494 | 0 | out = g_string_new (NULL); |
495 | |
|
496 | 0 | do |
497 | 0 | { |
498 | 0 | unicode = FALSE; |
499 | 0 | for (p = label; *p && !idna_is_dot (p); p++) |
500 | 0 | { |
501 | 0 | if ((guchar)*p > 0x80) |
502 | 0 | unicode = TRUE; |
503 | 0 | } |
504 | |
|
505 | 0 | oldlen = out->len; |
506 | 0 | llen = p - label; |
507 | 0 | if (unicode) |
508 | 0 | { |
509 | 0 | if (!strncmp (label, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
510 | 0 | goto fail; |
511 | | |
512 | 0 | g_string_append (out, IDNA_ACE_PREFIX); |
513 | 0 | if (!punycode_encode (label, llen, out)) |
514 | 0 | goto fail; |
515 | 0 | } |
516 | 0 | else |
517 | 0 | g_string_append_len (out, label, llen); |
518 | | |
519 | 0 | if (out->len - oldlen > 63) |
520 | 0 | goto fail; |
521 | | |
522 | 0 | label += llen; |
523 | 0 | if (*label) |
524 | 0 | label = g_utf8_next_char (label); |
525 | 0 | if (*label) |
526 | 0 | g_string_append_c (out, '.'); |
527 | 0 | } |
528 | 0 | while (*label); |
529 | | |
530 | 0 | g_free (name); |
531 | 0 | return g_string_free (out, FALSE); |
532 | | |
533 | 0 | fail: |
534 | 0 | g_free (name); |
535 | 0 | g_string_free (out, TRUE); |
536 | 0 | return NULL; |
537 | 0 | } |
538 | | |
539 | | /** |
540 | | * g_hostname_is_non_ascii: |
541 | | * @hostname: a hostname |
542 | | * |
543 | | * Tests if @hostname contains Unicode characters. If this returns |
544 | | * %TRUE, you need to encode the hostname with g_hostname_to_ascii() |
545 | | * before using it in non-IDN-aware contexts. |
546 | | * |
547 | | * Note that a hostname might contain a mix of encoded and unencoded |
548 | | * segments, and so it is possible for g_hostname_is_non_ascii() and |
549 | | * g_hostname_is_ascii_encoded() to both return %TRUE for a name. |
550 | | * |
551 | | * Returns: %TRUE if @hostname contains any non-ASCII characters |
552 | | * |
553 | | * Since: 2.22 |
554 | | **/ |
555 | | gboolean |
556 | | g_hostname_is_non_ascii (const gchar *hostname) |
557 | 0 | { |
558 | 0 | return contains_non_ascii (hostname, -1); |
559 | 0 | } |
560 | | |
561 | | /* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(), |
562 | | * read the RFC if you want to understand what this is actually doing. |
563 | | */ |
564 | | static gboolean |
565 | | punycode_decode (const gchar *input, |
566 | | gsize input_length, |
567 | | GString *output) |
568 | 0 | { |
569 | 0 | GArray *output_chars; |
570 | 0 | gunichar n; |
571 | 0 | guint i, bias; |
572 | 0 | guint oldi, w, k, digit, t; |
573 | 0 | const gchar *split; |
574 | |
|
575 | 0 | n = PUNYCODE_INITIAL_N; |
576 | 0 | i = 0; |
577 | 0 | bias = PUNYCODE_INITIAL_BIAS; |
578 | |
|
579 | 0 | split = input + input_length - 1; |
580 | 0 | while (split > input && *split != '-') |
581 | 0 | split--; |
582 | 0 | if (split > input) |
583 | 0 | { |
584 | 0 | output_chars = g_array_sized_new (FALSE, FALSE, sizeof (gunichar), |
585 | 0 | split - input); |
586 | 0 | input_length -= (split - input) + 1; |
587 | 0 | while (input < split) |
588 | 0 | { |
589 | 0 | gunichar ch = (gunichar)*input++; |
590 | 0 | if (!PUNYCODE_IS_BASIC (ch)) |
591 | 0 | goto fail; |
592 | 0 | g_array_append_val (output_chars, ch); |
593 | 0 | } |
594 | 0 | input++; |
595 | 0 | } |
596 | 0 | else |
597 | 0 | output_chars = g_array_new (FALSE, FALSE, sizeof (gunichar)); |
598 | | |
599 | 0 | while (input_length) |
600 | 0 | { |
601 | 0 | oldi = i; |
602 | 0 | w = 1; |
603 | 0 | for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE) |
604 | 0 | { |
605 | 0 | if (!input_length--) |
606 | 0 | goto fail; |
607 | 0 | digit = decode_digit (*input++); |
608 | 0 | if (digit >= PUNYCODE_BASE) |
609 | 0 | goto fail; |
610 | 0 | if (digit > (G_MAXUINT - i) / w) |
611 | 0 | goto fail; |
612 | 0 | i += digit * w; |
613 | 0 | if (k <= bias) |
614 | 0 | t = PUNYCODE_TMIN; |
615 | 0 | else if (k >= bias + PUNYCODE_TMAX) |
616 | 0 | t = PUNYCODE_TMAX; |
617 | 0 | else |
618 | 0 | t = k - bias; |
619 | 0 | if (digit < t) |
620 | 0 | break; |
621 | 0 | if (w > G_MAXUINT / (PUNYCODE_BASE - t)) |
622 | 0 | goto fail; |
623 | 0 | w *= (PUNYCODE_BASE - t); |
624 | 0 | } |
625 | | |
626 | 0 | bias = adapt (i - oldi, output_chars->len + 1, oldi == 0); |
627 | |
|
628 | 0 | if (i / (output_chars->len + 1) > G_MAXUINT - n) |
629 | 0 | goto fail; |
630 | 0 | n += i / (output_chars->len + 1); |
631 | 0 | i %= (output_chars->len + 1); |
632 | |
|
633 | 0 | g_array_insert_val (output_chars, i++, n); |
634 | 0 | } |
635 | | |
636 | 0 | for (i = 0; i < output_chars->len; i++) |
637 | 0 | g_string_append_unichar (output, g_array_index (output_chars, gunichar, i)); |
638 | 0 | g_array_free (output_chars, TRUE); |
639 | 0 | return TRUE; |
640 | | |
641 | 0 | fail: |
642 | 0 | g_array_free (output_chars, TRUE); |
643 | 0 | return FALSE; |
644 | 0 | } |
645 | | |
646 | | /** |
647 | | * g_hostname_to_unicode: |
648 | | * @hostname: a valid UTF-8 or ASCII hostname |
649 | | * |
650 | | * Converts @hostname to its canonical presentation form; a UTF-8 |
651 | | * string in Unicode normalization form C, containing no uppercase |
652 | | * letters, no forbidden characters, and no ASCII-encoded segments, |
653 | | * and not ending with a trailing dot. |
654 | | * |
655 | | * Of course if @hostname is not an internationalized hostname, then |
656 | | * the canonical presentation form will be entirely ASCII. |
657 | | * |
658 | | * Returns: (nullable) (transfer full): a UTF-8 hostname, which must be freed, |
659 | | * or %NULL if @hostname is in some way invalid. |
660 | | * |
661 | | * Since: 2.22 |
662 | | **/ |
663 | | gchar * |
664 | | g_hostname_to_unicode (const gchar *hostname) |
665 | 0 | { |
666 | 0 | GString *out; |
667 | 0 | gssize llen; |
668 | 0 | gsize hostname_max_length_bytes = get_hostname_max_length_bytes (); |
669 | | |
670 | | /* See the comment at the top of g_hostname_to_ascii(). */ |
671 | 0 | if (hostname_max_length_bytes <= G_MAXSIZE / 4 && |
672 | 0 | strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes))) |
673 | 0 | return NULL; |
674 | | |
675 | 0 | out = g_string_new (NULL); |
676 | |
|
677 | 0 | do |
678 | 0 | { |
679 | 0 | llen = idna_end_of_label (hostname) - hostname; |
680 | 0 | if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
681 | 0 | { |
682 | 0 | hostname += IDNA_ACE_PREFIX_LEN; |
683 | 0 | llen -= IDNA_ACE_PREFIX_LEN; |
684 | 0 | if (!punycode_decode (hostname, llen, out)) |
685 | 0 | { |
686 | 0 | g_string_free (out, TRUE); |
687 | 0 | return NULL; |
688 | 0 | } |
689 | 0 | } |
690 | 0 | else |
691 | 0 | { |
692 | 0 | gboolean unicode; |
693 | 0 | gchar *canonicalized = nameprep (hostname, llen, &unicode); |
694 | |
|
695 | 0 | if (!canonicalized) |
696 | 0 | { |
697 | 0 | g_string_free (out, TRUE); |
698 | 0 | return NULL; |
699 | 0 | } |
700 | 0 | g_string_append (out, canonicalized); |
701 | 0 | g_free (canonicalized); |
702 | 0 | } |
703 | | |
704 | 0 | hostname += llen; |
705 | 0 | if (*hostname) |
706 | 0 | hostname = g_utf8_next_char (hostname); |
707 | 0 | if (*hostname) |
708 | 0 | g_string_append_c (out, '.'); |
709 | 0 | } |
710 | 0 | while (*hostname); |
711 | | |
712 | 0 | return g_string_free (out, FALSE); |
713 | 0 | } |
714 | | |
715 | | /** |
716 | | * g_hostname_is_ascii_encoded: |
717 | | * @hostname: a hostname |
718 | | * |
719 | | * Tests if @hostname contains segments with an ASCII-compatible |
720 | | * encoding of an Internationalized Domain Name. If this returns |
721 | | * %TRUE, you should decode the hostname with g_hostname_to_unicode() |
722 | | * before displaying it to the user. |
723 | | * |
724 | | * Note that a hostname might contain a mix of encoded and unencoded |
725 | | * segments, and so it is possible for g_hostname_is_non_ascii() and |
726 | | * g_hostname_is_ascii_encoded() to both return %TRUE for a name. |
727 | | * |
728 | | * Returns: %TRUE if @hostname contains any ASCII-encoded |
729 | | * segments. |
730 | | * |
731 | | * Since: 2.22 |
732 | | **/ |
733 | | gboolean |
734 | | g_hostname_is_ascii_encoded (const gchar *hostname) |
735 | 0 | { |
736 | 0 | while (1) |
737 | 0 | { |
738 | 0 | if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
739 | 0 | return TRUE; |
740 | 0 | hostname = idna_end_of_label (hostname); |
741 | 0 | if (*hostname) |
742 | 0 | hostname = g_utf8_next_char (hostname); |
743 | 0 | if (!*hostname) |
744 | 0 | return FALSE; |
745 | 0 | } |
746 | 0 | } |
747 | | |
748 | | /** |
749 | | * g_hostname_is_ip_address: |
750 | | * @hostname: a hostname (or IP address in string form) |
751 | | * |
752 | | * Tests if @hostname is the string form of an IPv4 or IPv6 address. |
753 | | * (Eg, "192.168.0.1".) |
754 | | * |
755 | | * Since 2.66, IPv6 addresses with a zone-id are accepted (RFC6874). |
756 | | * |
757 | | * Returns: %TRUE if @hostname is an IP address |
758 | | * |
759 | | * Since: 2.22 |
760 | | **/ |
761 | | gboolean |
762 | | g_hostname_is_ip_address (const gchar *hostname) |
763 | 0 | { |
764 | 0 | gchar *p, *end; |
765 | 0 | gint nsegments, octet; |
766 | | |
767 | | /* On Linux we could implement this using inet_pton, but the Windows |
768 | | * equivalent of that requires linking against winsock, so we just |
769 | | * figure this out ourselves. Tested by tests/hostutils.c. |
770 | | */ |
771 | |
|
772 | 0 | p = (char *)hostname; |
773 | |
|
774 | 0 | if (strchr (p, ':')) |
775 | 0 | { |
776 | 0 | gboolean skipped; |
777 | | |
778 | | /* If it contains a ':', it's an IPv6 address (assuming it's an |
779 | | * IP address at all). This consists of eight ':'-separated |
780 | | * segments, each containing a 1-4 digit hex number, except that |
781 | | * optionally: (a) the last two segments can be replaced by an |
782 | | * IPv4 address, and (b) a single span of 1 to 8 "0000" segments |
783 | | * can be replaced with just "::". |
784 | | */ |
785 | |
|
786 | 0 | nsegments = 0; |
787 | 0 | skipped = FALSE; |
788 | 0 | while (*p && *p != '%' && nsegments < 8) |
789 | 0 | { |
790 | | /* Each segment after the first must be preceded by a ':'. |
791 | | * (We also handle half of the "string starts with ::" case |
792 | | * here.) |
793 | | */ |
794 | 0 | if (p != (char *)hostname || (p[0] == ':' && p[1] == ':')) |
795 | 0 | { |
796 | 0 | if (*p != ':') |
797 | 0 | return FALSE; |
798 | 0 | p++; |
799 | 0 | } |
800 | | |
801 | | /* If there's another ':', it means we're skipping some segments */ |
802 | 0 | if (*p == ':' && !skipped) |
803 | 0 | { |
804 | 0 | skipped = TRUE; |
805 | 0 | nsegments++; |
806 | | |
807 | | /* Handle the "string ends with ::" case */ |
808 | 0 | if (!p[1]) |
809 | 0 | p++; |
810 | |
|
811 | 0 | continue; |
812 | 0 | } |
813 | | |
814 | | /* Read the segment, make sure it's valid. */ |
815 | 0 | for (end = p; g_ascii_isxdigit (*end); end++) |
816 | 0 | ; |
817 | 0 | if (end == p || end > p + 4) |
818 | 0 | return FALSE; |
819 | | |
820 | 0 | if (*end == '.') |
821 | 0 | { |
822 | 0 | if ((nsegments == 6 && !skipped) || (nsegments <= 6 && skipped)) |
823 | 0 | goto parse_ipv4; |
824 | 0 | else |
825 | 0 | return FALSE; |
826 | 0 | } |
827 | | |
828 | 0 | nsegments++; |
829 | 0 | p = end; |
830 | 0 | } |
831 | | |
832 | 0 | return (!*p || (p[0] == '%' && p[1])) && (nsegments == 8 || skipped); |
833 | 0 | } |
834 | | |
835 | 0 | parse_ipv4: |
836 | | |
837 | | /* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */ |
838 | 0 | for (nsegments = 0; nsegments < 4; nsegments++) |
839 | 0 | { |
840 | 0 | if (nsegments != 0) |
841 | 0 | { |
842 | 0 | if (*p != '.') |
843 | 0 | return FALSE; |
844 | 0 | p++; |
845 | 0 | } |
846 | | |
847 | | /* Check the segment; a little tricker than the IPv6 case since |
848 | | * we can't allow extra leading 0s, and we can't assume that all |
849 | | * strings of valid length are within range. |
850 | | */ |
851 | 0 | octet = 0; |
852 | 0 | if (*p == '0') |
853 | 0 | end = p + 1; |
854 | 0 | else |
855 | 0 | { |
856 | 0 | for (end = p; g_ascii_isdigit (*end); end++) |
857 | 0 | { |
858 | 0 | octet = 10 * octet + (*end - '0'); |
859 | |
|
860 | 0 | if (octet > 255) |
861 | 0 | break; |
862 | 0 | } |
863 | 0 | } |
864 | 0 | if (end == p || end > p + 3 || octet > 255) |
865 | 0 | return FALSE; |
866 | | |
867 | 0 | p = end; |
868 | 0 | } |
869 | | |
870 | | /* If there's nothing left to parse, then it's ok. */ |
871 | 0 | return !*p; |
872 | 0 | } |