/src/gstreamer/subprojects/glib-2.86.3/glib/ghostutils.c
Line | Count | Source |
1 | | /* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */ |
2 | | |
3 | | /* GLIB - Library of useful routines for C programming |
4 | | * Copyright (C) 2008 Red Hat, Inc. |
5 | | * |
6 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
7 | | * |
8 | | * This library is free software; you can redistribute it and/or |
9 | | * modify it under the terms of the GNU Lesser General Public |
10 | | * License as published by the Free Software Foundation; either |
11 | | * version 2.1 of the License, or (at your option) any later version. |
12 | | * |
13 | | * This library is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 | | * Lesser General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU Lesser General |
19 | | * Public License along with this library; if not, see <http://www.gnu.org/licenses/>. |
20 | | */ |
21 | | |
22 | | #include "config.h" |
23 | | #include "glibconfig.h" |
24 | | |
25 | | #include <string.h> |
26 | | |
27 | | #ifdef G_OS_UNIX |
28 | | #include <unistd.h> |
29 | | #endif |
30 | | |
31 | | #include "ghostutils.h" |
32 | | |
33 | | #include "garray.h" |
34 | | #include "gmem.h" |
35 | | #include "gmessages.h" |
36 | | #include "gstring.h" |
37 | | #include "gstrfuncs.h" |
38 | | #include "gtestutils.h" |
39 | | #include "glibintl.h" |
40 | | |
41 | | #ifdef G_PLATFORM_WIN32 |
42 | | #include <windows.h> |
43 | | #endif |
44 | | |
45 | | |
/* Prefix that marks an ASCII-compatible-encoded (Punycode) label, and
 * the number of bytes in that prefix.
 */
#define IDNA_ACE_PREFIX     "xn--"
#define IDNA_ACE_PREFIX_LEN 4

/* Punycode constants, from RFC 3492. */
#define PUNYCODE_BASE         36
#define PUNYCODE_TMIN         1
#define PUNYCODE_TMAX         26
#define PUNYCODE_SKEW         38
#define PUNYCODE_DAMP         700
#define PUNYCODE_INITIAL_BIAS 72
#define PUNYCODE_INITIAL_N    0x80

/* A value is ASCII when, taken as an unsigned integer, it is at most
 * 0x7F. The cast makes negative (signed char) bytes compare as large
 * values, so they are classed as non-ASCII. Punycode "basic" code
 * points are exactly the ASCII ones.
 */
#define IS_ASCII(cp)          (((guint) (cp)) <= 0x7F)
#define PUNYCODE_IS_BASIC(cp) IS_ASCII (cp)
61 | | |
62 | | /* Encode/decode a single base-36 digit */ |
63 | | static inline gchar |
64 | | encode_digit (guint dig) |
65 | 0 | { |
66 | 0 | if (dig < 26) |
67 | 0 | return dig + 'a'; |
68 | 0 | else |
69 | 0 | return dig - 26 + '0'; |
70 | 0 | } |
71 | | |
72 | | static inline guint |
73 | | decode_digit (gchar dig) |
74 | 0 | { |
75 | 0 | if (dig >= 'A' && dig <= 'Z') |
76 | 0 | return dig - 'A'; |
77 | 0 | else if (dig >= 'a' && dig <= 'z') |
78 | 0 | return dig - 'a'; |
79 | 0 | else if (dig >= '0' && dig <= '9') |
80 | 0 | return dig - '0' + 26; |
81 | 0 | else |
82 | 0 | return G_MAXUINT; |
83 | 0 | } |
84 | | |
85 | | /* Punycode bias adaptation algorithm, RFC 3492 section 6.1 */ |
86 | | static guint |
87 | | adapt (guint delta, |
88 | | guint numpoints, |
89 | | gboolean firsttime) |
90 | 0 | { |
91 | 0 | guint k; |
92 | |
|
93 | 0 | delta = firsttime ? delta / PUNYCODE_DAMP : delta / 2; |
94 | 0 | delta += delta / numpoints; |
95 | |
|
96 | 0 | k = 0; |
97 | 0 | while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2) |
98 | 0 | { |
99 | 0 | delta /= PUNYCODE_BASE - PUNYCODE_TMIN; |
100 | 0 | k += PUNYCODE_BASE; |
101 | 0 | } |
102 | |
|
103 | 0 | return k + ((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta / |
104 | 0 | (delta + PUNYCODE_SKEW)); |
105 | 0 | } |
106 | | |
107 | | /* Punycode encoder, RFC 3492 section 6.3. The algorithm is |
108 | | * sufficiently bizarre that it's not really worth trying to explain |
109 | | * here. |
110 | | */ |
111 | | static gboolean |
112 | | punycode_encode (const gchar *input_utf8, |
113 | | gsize input_utf8_length, |
114 | | GString *output) |
115 | 0 | { |
116 | 0 | guint delta, handled_chars, num_basic_chars, bias, j, q, k, t, digit; |
117 | 0 | gunichar n, m, *input; |
118 | 0 | glong written_chars; |
119 | 0 | gsize input_length; |
120 | 0 | gboolean success = FALSE; |
121 | | |
122 | | /* Convert from UTF-8 to Unicode code points */ |
123 | 0 | input = g_utf8_to_ucs4 (input_utf8, input_utf8_length, NULL, |
124 | 0 | &written_chars, NULL); |
125 | 0 | if (!input) |
126 | 0 | return FALSE; |
127 | | |
128 | 0 | input_length = (gsize) (written_chars > 0 ? written_chars : 0); |
129 | | |
130 | | /* Copy basic chars */ |
131 | 0 | for (j = num_basic_chars = 0; j < input_length; j++) |
132 | 0 | { |
133 | 0 | if (PUNYCODE_IS_BASIC (input[j])) |
134 | 0 | { |
135 | 0 | g_string_append_c (output, g_ascii_tolower (input[j])); |
136 | 0 | num_basic_chars++; |
137 | 0 | } |
138 | 0 | } |
139 | 0 | if (num_basic_chars) |
140 | 0 | g_string_append_c (output, '-'); |
141 | |
|
142 | 0 | handled_chars = num_basic_chars; |
143 | | |
144 | | /* Encode non-basic chars */ |
145 | 0 | delta = 0; |
146 | 0 | bias = PUNYCODE_INITIAL_BIAS; |
147 | 0 | n = PUNYCODE_INITIAL_N; |
148 | 0 | while (handled_chars < input_length) |
149 | 0 | { |
150 | | /* let m = the minimum {non-basic} code point >= n in the input */ |
151 | 0 | for (m = G_MAXUINT, j = 0; j < input_length; j++) |
152 | 0 | { |
153 | 0 | if (input[j] >= n && input[j] < m) |
154 | 0 | m = input[j]; |
155 | 0 | } |
156 | |
|
157 | 0 | if (m - n > (G_MAXUINT - delta) / (handled_chars + 1)) |
158 | 0 | goto fail; |
159 | 0 | delta += (m - n) * (handled_chars + 1); |
160 | 0 | n = m; |
161 | |
|
162 | 0 | for (j = 0; j < input_length; j++) |
163 | 0 | { |
164 | 0 | if (input[j] < n) |
165 | 0 | { |
166 | 0 | if (++delta == 0) |
167 | 0 | goto fail; |
168 | 0 | } |
169 | 0 | else if (input[j] == n) |
170 | 0 | { |
171 | 0 | q = delta; |
172 | 0 | for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE) |
173 | 0 | { |
174 | 0 | if (k <= bias) |
175 | 0 | t = PUNYCODE_TMIN; |
176 | 0 | else if (k >= bias + PUNYCODE_TMAX) |
177 | 0 | t = PUNYCODE_TMAX; |
178 | 0 | else |
179 | 0 | t = k - bias; |
180 | 0 | if (q < t) |
181 | 0 | break; |
182 | 0 | digit = t + (q - t) % (PUNYCODE_BASE - t); |
183 | 0 | g_string_append_c (output, encode_digit (digit)); |
184 | 0 | q = (q - t) / (PUNYCODE_BASE - t); |
185 | 0 | } |
186 | |
|
187 | 0 | g_string_append_c (output, encode_digit (q)); |
188 | 0 | bias = adapt (delta, handled_chars + 1, handled_chars == num_basic_chars); |
189 | 0 | delta = 0; |
190 | 0 | handled_chars++; |
191 | 0 | } |
192 | 0 | } |
193 | | |
194 | 0 | delta++; |
195 | 0 | n++; |
196 | 0 | } |
197 | | |
198 | 0 | success = TRUE; |
199 | |
|
200 | 0 | fail: |
201 | 0 | g_free (input); |
202 | 0 | return success; |
203 | 0 | } |
204 | | |
205 | | /* From RFC 3454, Table B.1 */ |
206 | 0 | #define idna_is_junk(ch) ((ch) == 0x00AD || (ch) == 0x1806 || (ch) == 0x200B || (ch) == 0x2060 || (ch) == 0xFEFF || (ch) == 0x034F || (ch) == 0x180B || (ch) == 0x180C || (ch) == 0x180D || (ch) == 0x200C || (ch) == 0x200D || ((ch) >= 0xFE00 && (ch) <= 0xFE0F)) |
207 | | |
208 | | /* Scan @str for "junk" and return a cleaned-up string if any junk |
209 | | * is found. Else return %NULL. |
210 | | */ |
211 | | static gchar * |
212 | | remove_junk (const gchar *str, |
213 | | gssize len) |
214 | 0 | { |
215 | 0 | GString *cleaned = NULL; |
216 | 0 | const gchar *p; |
217 | 0 | gunichar ch; |
218 | |
|
219 | 0 | for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p)) |
220 | 0 | { |
221 | 0 | ch = g_utf8_get_char (p); |
222 | 0 | if (idna_is_junk (ch)) |
223 | 0 | { |
224 | 0 | if (!cleaned) |
225 | 0 | { |
226 | 0 | cleaned = g_string_new (NULL); |
227 | 0 | g_string_append_len (cleaned, str, p - str); |
228 | 0 | } |
229 | 0 | } |
230 | 0 | else if (cleaned) |
231 | 0 | g_string_append_unichar (cleaned, ch); |
232 | 0 | } |
233 | |
|
234 | 0 | if (cleaned) |
235 | 0 | return g_string_free (cleaned, FALSE); |
236 | 0 | else |
237 | 0 | return NULL; |
238 | 0 | } |
239 | | |
240 | | static inline gboolean |
241 | | contains_uppercase_letters (const gchar *str, |
242 | | gssize len) |
243 | 0 | { |
244 | 0 | const gchar *p; |
245 | |
|
246 | 0 | for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p)) |
247 | 0 | { |
248 | 0 | if (g_unichar_isupper (g_utf8_get_char (p))) |
249 | 0 | return TRUE; |
250 | 0 | } |
251 | 0 | return FALSE; |
252 | 0 | } |
253 | | |
254 | | static inline gboolean |
255 | | contains_non_ascii (const gchar *str, |
256 | | gssize len) |
257 | 0 | { |
258 | 0 | const gchar *p; |
259 | |
|
260 | 0 | for (p = str; len == -1 ? *p : p < str + len; p++) |
261 | 0 | { |
262 | 0 | if (!IS_ASCII (*p)) |
263 | 0 | return TRUE; |
264 | 0 | } |
265 | 0 | return FALSE; |
266 | 0 | } |
267 | | |
268 | | /* RFC 3454, Appendix C. ish. */ |
269 | | static inline gboolean |
270 | | idna_is_prohibited (gunichar ch) |
271 | 0 | { |
272 | 0 | switch (g_unichar_type (ch)) |
273 | 0 | { |
274 | 0 | case G_UNICODE_CONTROL: |
275 | 0 | case G_UNICODE_FORMAT: |
276 | 0 | case G_UNICODE_UNASSIGNED: |
277 | 0 | case G_UNICODE_PRIVATE_USE: |
278 | 0 | case G_UNICODE_SURROGATE: |
279 | 0 | case G_UNICODE_LINE_SEPARATOR: |
280 | 0 | case G_UNICODE_PARAGRAPH_SEPARATOR: |
281 | 0 | case G_UNICODE_SPACE_SEPARATOR: |
282 | 0 | return TRUE; |
283 | | |
284 | 0 | case G_UNICODE_OTHER_SYMBOL: |
285 | 0 | if (ch == 0xFFFC || ch == 0xFFFD || |
286 | 0 | (ch >= 0x2FF0 && ch <= 0x2FFB)) |
287 | 0 | return TRUE; |
288 | 0 | return FALSE; |
289 | | |
290 | 0 | case G_UNICODE_NON_SPACING_MARK: |
291 | 0 | if (ch == 0x0340 || ch == 0x0341) |
292 | 0 | return TRUE; |
293 | 0 | return FALSE; |
294 | | |
295 | 0 | default: |
296 | 0 | return FALSE; |
297 | 0 | } |
298 | 0 | } |
299 | | |
300 | | /* RFC 3491 IDN cleanup algorithm. */ |
301 | | static gchar * |
302 | | nameprep (const gchar *hostname, |
303 | | gssize len, |
304 | | gboolean *is_unicode) |
305 | 0 | { |
306 | 0 | const char *name, *p; |
307 | 0 | char *name_owned = NULL, *name_normalized = NULL; |
308 | | |
309 | | /* It would be nice if we could do this without repeatedly |
310 | | * allocating strings and converting back and forth between |
311 | | * gunichars and UTF-8... The code does at least avoid doing most of |
312 | | * the sub-operations when they would just be equivalent to a |
313 | | * g_strdup(). |
314 | | */ |
315 | | |
316 | | /* Remove presentation-only characters */ |
317 | 0 | name = name_owned = remove_junk (hostname, len); |
318 | 0 | if (name) |
319 | 0 | len = -1; |
320 | 0 | else |
321 | 0 | name = hostname; |
322 | | |
323 | | /* Convert to lowercase */ |
324 | 0 | if (contains_uppercase_letters (name, len)) |
325 | 0 | { |
326 | 0 | char *name_owned_lower = NULL; |
327 | |
|
328 | 0 | name = name_owned_lower = g_utf8_strdown (name, len); |
329 | 0 | g_free (name_owned); |
330 | 0 | name_owned = g_steal_pointer (&name_owned_lower); |
331 | 0 | len = -1; |
332 | 0 | } |
333 | | |
334 | | /* If there are no UTF8 characters, we're done. */ |
335 | 0 | if (!contains_non_ascii (name, len)) |
336 | 0 | { |
337 | 0 | *is_unicode = FALSE; |
338 | 0 | if (name == hostname) |
339 | 0 | return len == -1 ? g_strdup (hostname) : g_strndup (hostname, len); |
340 | 0 | else |
341 | 0 | return g_steal_pointer (&name_owned); |
342 | 0 | } |
343 | | |
344 | 0 | *is_unicode = TRUE; |
345 | | |
346 | | /* Normalize */ |
347 | 0 | name = name_normalized = g_utf8_normalize (name, len, G_NORMALIZE_NFKC); |
348 | 0 | g_free (name_owned); |
349 | 0 | name_owned = g_steal_pointer (&name_normalized); |
350 | 0 | len = -1; |
351 | |
|
352 | 0 | if (!name) |
353 | 0 | return NULL; |
354 | | |
355 | | /* KC normalization may have created more capital letters (eg, |
356 | | * angstrom -> capital A with ring). So we have to lowercasify a |
357 | | * second time. (This is more-or-less how the nameprep algorithm |
358 | | * does it. If tolower(nfkc(tolower(X))) is guaranteed to be the |
359 | | * same as tolower(nfkc(X)), then we could skip the first tolower, |
360 | | * but I'm not sure it is.) |
361 | | */ |
362 | 0 | if (contains_uppercase_letters (name, len)) |
363 | 0 | { |
364 | 0 | char *name_owned_lower = NULL; |
365 | |
|
366 | 0 | name = name_owned_lower = g_utf8_strdown (name, len); |
367 | 0 | g_free (name_owned); |
368 | 0 | name_owned = g_steal_pointer (&name_owned_lower); |
369 | 0 | len = -1; |
370 | 0 | } |
371 | | |
372 | | /* Check for prohibited characters */ |
373 | 0 | for (p = name; *p; p = g_utf8_next_char (p)) |
374 | 0 | { |
375 | 0 | if (idna_is_prohibited (g_utf8_get_char (p))) |
376 | 0 | { |
377 | 0 | name = NULL; |
378 | 0 | g_clear_pointer (&name_owned, g_free); |
379 | 0 | len = -1; |
380 | 0 | goto done; |
381 | 0 | } |
382 | 0 | } |
383 | | |
384 | | /* FIXME: We're supposed to verify certain constraints on bidi |
385 | | * characters, but glib does not appear to have that information. |
386 | | */ |
387 | | |
388 | 0 | done: |
389 | 0 | return g_steal_pointer (&name_owned); |
390 | 0 | } |
391 | | |
392 | | /* RFC 3490, section 3.1 says '.', 0x3002, 0xFF0E, and 0xFF61 count as |
393 | | * label-separating dots. @str must be '\0'-terminated. |
394 | | */ |
395 | 0 | #define idna_is_dot(str) ( \ |
396 | 0 | ((guchar)(str)[0] == '.') || \ |
397 | 0 | ((guchar)(str)[0] == 0xE3 && (guchar)(str)[1] == 0x80 && (guchar)(str)[2] == 0x82) || \ |
398 | 0 | ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBC && (guchar)(str)[2] == 0x8E) || \ |
399 | 0 | ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBD && (guchar)(str)[2] == 0xA1) ) |
400 | | |
401 | | static const gchar * |
402 | | idna_end_of_label (const gchar *str) |
403 | 0 | { |
404 | 0 | for (; *str; str = g_utf8_next_char (str)) |
405 | 0 | { |
406 | 0 | if (idna_is_dot (str)) |
407 | 0 | return str; |
408 | 0 | } |
409 | 0 | return str; |
410 | 0 | } |
411 | | |
412 | | static gsize |
413 | | get_hostname_max_length_bytes (void) |
414 | 0 | { |
415 | | #if defined(G_OS_WIN32) |
416 | | wchar_t tmp[MAX_COMPUTERNAME_LENGTH]; |
417 | | return sizeof (tmp) / sizeof (tmp[0]); |
418 | | #elif defined(_SC_HOST_NAME_MAX) |
419 | 0 | glong max = sysconf (_SC_HOST_NAME_MAX); |
420 | 0 | if (max > 0) |
421 | 0 | return (gsize) max; |
422 | | |
423 | 0 | #ifdef HOST_NAME_MAX |
424 | 0 | return HOST_NAME_MAX; |
425 | | #else |
426 | | return _POSIX_HOST_NAME_MAX; |
427 | | #endif /* HOST_NAME_MAX */ |
428 | | #else |
429 | | /* Fallback to some reasonable value |
430 | | * See https://stackoverflow.com/questions/8724954/what-is-the-maximum-number-of-characters-for-a-host-name-in-unix/28918017#28918017 */ |
431 | | return 255; |
432 | | #endif |
433 | 0 | } |
434 | | |
435 | | /* Returns %TRUE if `strlen (str) > comparison_length`, but without actually |
436 | | * running `strlen(str)`, as that would take a very long time for long |
437 | | * (untrusted) input strings. */ |
438 | | static gboolean |
439 | | strlen_greater_than (const gchar *str, |
440 | | gsize comparison_length) |
441 | 0 | { |
442 | 0 | gsize i; |
443 | |
|
444 | 0 | for (i = 0; str[i] != '\0'; i++) |
445 | 0 | if (i > comparison_length) |
446 | 0 | return TRUE; |
447 | | |
448 | 0 | return FALSE; |
449 | 0 | } |
450 | | |
451 | | /** |
452 | | * g_hostname_to_ascii: |
453 | | * @hostname: a valid UTF-8 or ASCII hostname |
454 | | * |
455 | | * Converts @hostname to its canonical ASCII form; an ASCII-only |
456 | | * string containing no uppercase letters and not ending with a |
457 | | * trailing dot. |
458 | | * |
459 | | * Returns: (nullable) (transfer full): an ASCII hostname, which must be freed, |
460 | | * or %NULL if @hostname is in some way invalid. |
461 | | * |
462 | | * Since: 2.22 |
463 | | **/ |
464 | | gchar * |
465 | | g_hostname_to_ascii (const gchar *hostname) |
466 | 0 | { |
467 | 0 | gchar *name, *label, *p; |
468 | 0 | GString *out; |
469 | 0 | gssize llen, oldlen; |
470 | 0 | gboolean unicode; |
471 | 0 | gsize hostname_max_length_bytes = get_hostname_max_length_bytes (); |
472 | | |
473 | | /* Do an initial check on the hostname length, as overlong hostnames take a |
474 | | * long time in the IDN cleanup algorithm in nameprep(). The ultimate |
475 | | * restriction is that the IDN-decoded (i.e. pure ASCII) hostname cannot be |
476 | | * longer than 255 bytes. That’s the least restrictive limit on hostname |
477 | | * length of all the ways hostnames can be interpreted. Typically, the |
478 | | * hostname will be an FQDN, which is limited to 253 bytes long. POSIX |
479 | | * hostnames are limited to `get_hostname_max_length_bytes()` (typically 255 |
480 | | * bytes). |
481 | | * |
482 | | * See https://stackoverflow.com/a/28918017/2931197 |
483 | | * |
484 | | * It’s possible for a hostname to be %-encoded, in which case its decoded |
485 | | * length will be as much as 3× shorter. |
486 | | * |
487 | | * It’s also possible for a hostname to use overlong UTF-8 encodings, in which |
488 | | * case its decoded length will be as much as 4× shorter. |
489 | | * |
490 | | * Note: This check is not intended as an absolute guarantee that a hostname |
491 | | * is the right length and will be accepted by other systems. It’s intended to |
492 | | * stop wildly-invalid hostnames from taking forever in nameprep(). |
493 | | */ |
494 | 0 | if (hostname_max_length_bytes <= G_MAXSIZE / 4 && |
495 | 0 | strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes))) |
496 | 0 | return NULL; |
497 | | |
498 | 0 | label = name = nameprep (hostname, -1, &unicode); |
499 | 0 | if (!name || !unicode) |
500 | 0 | return name; |
501 | | |
502 | 0 | out = g_string_new (NULL); |
503 | |
|
504 | 0 | do |
505 | 0 | { |
506 | 0 | unicode = FALSE; |
507 | 0 | for (p = label; *p && !idna_is_dot (p); p++) |
508 | 0 | { |
509 | 0 | if (!IS_ASCII (*p)) |
510 | 0 | unicode = TRUE; |
511 | 0 | } |
512 | |
|
513 | 0 | oldlen = out->len; |
514 | 0 | llen = p - label; |
515 | 0 | if (unicode) |
516 | 0 | { |
517 | 0 | if (!strncmp (label, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
518 | 0 | goto fail; |
519 | | |
520 | 0 | g_string_append (out, IDNA_ACE_PREFIX); |
521 | 0 | if (!punycode_encode (label, llen, out)) |
522 | 0 | goto fail; |
523 | 0 | } |
524 | 0 | else |
525 | 0 | g_string_append_len (out, label, llen); |
526 | | |
527 | 0 | if (out->len - oldlen > 63) |
528 | 0 | goto fail; |
529 | | |
530 | 0 | label += llen; |
531 | 0 | if (*label) |
532 | 0 | label = g_utf8_next_char (label); |
533 | 0 | if (*label) |
534 | 0 | g_string_append_c (out, '.'); |
535 | 0 | } |
536 | 0 | while (*label); |
537 | | |
538 | 0 | g_free (name); |
539 | 0 | return g_string_free (out, FALSE); |
540 | | |
541 | 0 | fail: |
542 | 0 | g_free (name); |
543 | 0 | g_string_free (out, TRUE); |
544 | 0 | return NULL; |
545 | 0 | } |
546 | | |
547 | | /** |
548 | | * g_hostname_is_non_ascii: |
549 | | * @hostname: a hostname |
550 | | * |
551 | | * Tests if @hostname contains Unicode characters. If this returns |
552 | | * %TRUE, you need to encode the hostname with g_hostname_to_ascii() |
553 | | * before using it in non-IDN-aware contexts. |
554 | | * |
555 | | * Note that a hostname might contain a mix of encoded and unencoded |
556 | | * segments, and so it is possible for g_hostname_is_non_ascii() and |
557 | | * g_hostname_is_ascii_encoded() to both return %TRUE for a name. |
558 | | * |
559 | | * Returns: %TRUE if @hostname contains any non-ASCII characters |
560 | | * |
561 | | * Since: 2.22 |
562 | | **/ |
563 | | gboolean |
564 | | g_hostname_is_non_ascii (const gchar *hostname) |
565 | 0 | { |
566 | 0 | return contains_non_ascii (hostname, -1); |
567 | 0 | } |
568 | | |
569 | | /* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(), |
570 | | * read the RFC if you want to understand what this is actually doing. |
571 | | */ |
572 | | static gboolean |
573 | | punycode_decode (const gchar *input, |
574 | | gsize input_length, |
575 | | GString *output) |
576 | 0 | { |
577 | 0 | GArray *output_chars; |
578 | 0 | gunichar n; |
579 | 0 | guint i, bias; |
580 | 0 | guint oldi, w, k, digit, t; |
581 | 0 | const gchar *split; |
582 | |
|
583 | 0 | n = PUNYCODE_INITIAL_N; |
584 | 0 | i = 0; |
585 | 0 | bias = PUNYCODE_INITIAL_BIAS; |
586 | |
|
587 | 0 | split = input + input_length - 1; |
588 | 0 | while (split > input && *split != '-') |
589 | 0 | split--; |
590 | 0 | if (split > input) |
591 | 0 | { |
592 | 0 | g_assert ((guint) (split - input) <= G_MAXUINT); |
593 | | |
594 | 0 | output_chars = g_array_sized_new (FALSE, FALSE, sizeof (gunichar), |
595 | 0 | (guint) (split - input)); |
596 | 0 | input_length -= (split - input) + 1; |
597 | 0 | while (input < split) |
598 | 0 | { |
599 | 0 | gunichar ch = (gunichar)*input++; |
600 | 0 | if (!PUNYCODE_IS_BASIC (ch)) |
601 | 0 | goto fail; |
602 | 0 | g_array_append_val (output_chars, ch); |
603 | 0 | } |
604 | 0 | input++; |
605 | 0 | } |
606 | 0 | else |
607 | 0 | output_chars = g_array_new (FALSE, FALSE, sizeof (gunichar)); |
608 | | |
609 | 0 | while (input_length) |
610 | 0 | { |
611 | 0 | oldi = i; |
612 | 0 | w = 1; |
613 | 0 | for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE) |
614 | 0 | { |
615 | 0 | if (!input_length--) |
616 | 0 | goto fail; |
617 | 0 | digit = decode_digit (*input++); |
618 | 0 | if (digit >= PUNYCODE_BASE) |
619 | 0 | goto fail; |
620 | 0 | if (digit > (G_MAXUINT - i) / w) |
621 | 0 | goto fail; |
622 | 0 | i += digit * w; |
623 | 0 | if (k <= bias) |
624 | 0 | t = PUNYCODE_TMIN; |
625 | 0 | else if (k >= bias + PUNYCODE_TMAX) |
626 | 0 | t = PUNYCODE_TMAX; |
627 | 0 | else |
628 | 0 | t = k - bias; |
629 | 0 | if (digit < t) |
630 | 0 | break; |
631 | 0 | if (w > G_MAXUINT / (PUNYCODE_BASE - t)) |
632 | 0 | goto fail; |
633 | 0 | w *= (PUNYCODE_BASE - t); |
634 | 0 | } |
635 | | |
636 | 0 | bias = adapt (i - oldi, output_chars->len + 1, oldi == 0); |
637 | |
|
638 | 0 | if (i / (output_chars->len + 1) > G_MAXUINT - n) |
639 | 0 | goto fail; |
640 | 0 | n += i / (output_chars->len + 1); |
641 | 0 | i %= (output_chars->len + 1); |
642 | |
|
643 | 0 | g_array_insert_val (output_chars, i++, n); |
644 | 0 | } |
645 | | |
646 | 0 | for (i = 0; i < output_chars->len; i++) |
647 | 0 | g_string_append_unichar (output, g_array_index (output_chars, gunichar, i)); |
648 | 0 | g_array_free (output_chars, TRUE); |
649 | 0 | return TRUE; |
650 | | |
651 | 0 | fail: |
652 | 0 | g_array_free (output_chars, TRUE); |
653 | 0 | return FALSE; |
654 | 0 | } |
655 | | |
656 | | /** |
657 | | * g_hostname_to_unicode: |
658 | | * @hostname: a valid UTF-8 or ASCII hostname |
659 | | * |
660 | | * Converts @hostname to its canonical presentation form; a UTF-8 |
661 | | * string in Unicode normalization form C, containing no uppercase |
662 | | * letters, no forbidden characters, and no ASCII-encoded segments, |
663 | | * and not ending with a trailing dot. |
664 | | * |
665 | | * Of course if @hostname is not an internationalized hostname, then |
666 | | * the canonical presentation form will be entirely ASCII. |
667 | | * |
668 | | * Returns: (nullable) (transfer full): a UTF-8 hostname, which must be freed, |
669 | | * or %NULL if @hostname is in some way invalid. |
670 | | * |
671 | | * Since: 2.22 |
672 | | **/ |
673 | | gchar * |
674 | | g_hostname_to_unicode (const gchar *hostname) |
675 | 0 | { |
676 | 0 | GString *out; |
677 | 0 | gssize llen; |
678 | 0 | gsize hostname_max_length_bytes = get_hostname_max_length_bytes (); |
679 | |
|
680 | 0 | g_return_val_if_fail (hostname != NULL, NULL); |
681 | | |
682 | | /* See the comment at the top of g_hostname_to_ascii(). */ |
683 | 0 | if (hostname_max_length_bytes <= G_MAXSIZE / 4 && |
684 | 0 | strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes))) |
685 | 0 | return NULL; |
686 | | |
687 | 0 | out = g_string_new (NULL); |
688 | |
|
689 | 0 | do |
690 | 0 | { |
691 | 0 | llen = idna_end_of_label (hostname) - hostname; |
692 | 0 | if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
693 | 0 | { |
694 | 0 | hostname += IDNA_ACE_PREFIX_LEN; |
695 | 0 | llen -= IDNA_ACE_PREFIX_LEN; |
696 | 0 | if (!punycode_decode (hostname, llen, out)) |
697 | 0 | { |
698 | 0 | g_string_free (out, TRUE); |
699 | 0 | return NULL; |
700 | 0 | } |
701 | 0 | } |
702 | 0 | else |
703 | 0 | { |
704 | 0 | gboolean unicode; |
705 | 0 | gchar *canonicalized = nameprep (hostname, llen, &unicode); |
706 | |
|
707 | 0 | if (!canonicalized) |
708 | 0 | { |
709 | 0 | g_string_free (out, TRUE); |
710 | 0 | return NULL; |
711 | 0 | } |
712 | 0 | g_string_append (out, canonicalized); |
713 | 0 | g_free (canonicalized); |
714 | 0 | } |
715 | | |
716 | 0 | hostname += llen; |
717 | 0 | if (*hostname) |
718 | 0 | hostname = g_utf8_next_char (hostname); |
719 | 0 | if (*hostname) |
720 | 0 | g_string_append_c (out, '.'); |
721 | 0 | } |
722 | 0 | while (*hostname); |
723 | | |
724 | 0 | return g_string_free (out, FALSE); |
725 | 0 | } |
726 | | |
727 | | /** |
728 | | * g_hostname_is_ascii_encoded: |
729 | | * @hostname: a hostname |
730 | | * |
731 | | * Tests if @hostname contains segments with an ASCII-compatible |
732 | | * encoding of an Internationalized Domain Name. If this returns |
733 | | * %TRUE, you should decode the hostname with g_hostname_to_unicode() |
734 | | * before displaying it to the user. |
735 | | * |
736 | | * Note that a hostname might contain a mix of encoded and unencoded |
737 | | * segments, and so it is possible for g_hostname_is_non_ascii() and |
738 | | * g_hostname_is_ascii_encoded() to both return %TRUE for a name. |
739 | | * |
740 | | * Returns: %TRUE if @hostname contains any ASCII-encoded |
741 | | * segments. |
742 | | * |
743 | | * Since: 2.22 |
744 | | **/ |
745 | | gboolean |
746 | | g_hostname_is_ascii_encoded (const gchar *hostname) |
747 | 0 | { |
748 | 0 | while (1) |
749 | 0 | { |
750 | 0 | if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
751 | 0 | return TRUE; |
752 | 0 | hostname = idna_end_of_label (hostname); |
753 | 0 | if (*hostname) |
754 | 0 | hostname = g_utf8_next_char (hostname); |
755 | 0 | if (!*hostname) |
756 | 0 | return FALSE; |
757 | 0 | } |
758 | 0 | } |
759 | | |
760 | | /** |
761 | | * g_hostname_is_ip_address: |
762 | | * @hostname: a hostname (or IP address in string form) |
763 | | * |
764 | | * Tests if @hostname is the string form of an IPv4 or IPv6 address. |
765 | | * (Eg, "192.168.0.1".) |
766 | | * |
767 | | * Since 2.66, IPv6 addresses with a zone-id are accepted (RFC6874). |
768 | | * |
769 | | * Returns: %TRUE if @hostname is an IP address |
770 | | * |
771 | | * Since: 2.22 |
772 | | **/ |
773 | | gboolean |
774 | | g_hostname_is_ip_address (const gchar *hostname) |
775 | 0 | { |
776 | 0 | gchar *p, *end; |
777 | 0 | gint nsegments, octet; |
778 | | |
779 | | /* On Linux we could implement this using inet_pton, but the Windows |
780 | | * equivalent of that requires linking against winsock, so we just |
781 | | * figure this out ourselves. Tested by tests/hostutils.c. |
782 | | */ |
783 | |
|
784 | 0 | p = (char *)hostname; |
785 | |
|
786 | 0 | if (strchr (p, ':')) |
787 | 0 | { |
788 | 0 | gboolean skipped; |
789 | | |
790 | | /* If it contains a ':', it's an IPv6 address (assuming it's an |
791 | | * IP address at all). This consists of eight ':'-separated |
792 | | * segments, each containing a 1-4 digit hex number, except that |
793 | | * optionally: (a) the last two segments can be replaced by an |
794 | | * IPv4 address, and (b) a single span of 1 to 8 "0000" segments |
795 | | * can be replaced with just "::". |
796 | | */ |
797 | |
|
798 | 0 | nsegments = 0; |
799 | 0 | skipped = FALSE; |
800 | 0 | while (*p && *p != '%' && nsegments < 8) |
801 | 0 | { |
802 | | /* Each segment after the first must be preceded by a ':'. |
803 | | * (We also handle half of the "string starts with ::" case |
804 | | * here.) |
805 | | */ |
806 | 0 | if (p != (char *)hostname || (p[0] == ':' && p[1] == ':')) |
807 | 0 | { |
808 | 0 | if (*p != ':') |
809 | 0 | return FALSE; |
810 | 0 | p++; |
811 | 0 | } |
812 | | |
813 | | /* If there's another ':', it means we're skipping some segments */ |
814 | 0 | if (*p == ':' && !skipped) |
815 | 0 | { |
816 | 0 | skipped = TRUE; |
817 | 0 | nsegments++; |
818 | | |
819 | | /* Handle the "string ends with ::" case */ |
820 | 0 | if (!p[1]) |
821 | 0 | p++; |
822 | |
|
823 | 0 | continue; |
824 | 0 | } |
825 | | |
826 | | /* Read the segment, make sure it's valid. */ |
827 | 0 | for (end = p; g_ascii_isxdigit (*end); end++) |
828 | 0 | ; |
829 | 0 | if (end == p || end > p + 4) |
830 | 0 | return FALSE; |
831 | | |
832 | 0 | if (*end == '.') |
833 | 0 | { |
834 | 0 | if ((nsegments == 6 && !skipped) || (nsegments <= 6 && skipped)) |
835 | 0 | goto parse_ipv4; |
836 | 0 | else |
837 | 0 | return FALSE; |
838 | 0 | } |
839 | | |
840 | 0 | nsegments++; |
841 | 0 | p = end; |
842 | 0 | } |
843 | | |
844 | 0 | return (!*p || (p[0] == '%' && p[1])) && (nsegments == 8 || skipped); |
845 | 0 | } |
846 | | |
847 | 0 | parse_ipv4: |
848 | | |
849 | | /* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */ |
850 | 0 | for (nsegments = 0; nsegments < 4; nsegments++) |
851 | 0 | { |
852 | 0 | if (nsegments != 0) |
853 | 0 | { |
854 | 0 | if (*p != '.') |
855 | 0 | return FALSE; |
856 | 0 | p++; |
857 | 0 | } |
858 | | |
859 | | /* Check the segment; a little tricker than the IPv6 case since |
860 | | * we can't allow extra leading 0s, and we can't assume that all |
861 | | * strings of valid length are within range. |
862 | | */ |
863 | 0 | octet = 0; |
864 | 0 | if (*p == '0') |
865 | 0 | end = p + 1; |
866 | 0 | else |
867 | 0 | { |
868 | 0 | for (end = p; g_ascii_isdigit (*end); end++) |
869 | 0 | { |
870 | 0 | octet = 10 * octet + (*end - '0'); |
871 | |
|
872 | 0 | if (octet > 255) |
873 | 0 | break; |
874 | 0 | } |
875 | 0 | } |
876 | 0 | if (end == p || end > p + 3 || octet > 255) |
877 | 0 | return FALSE; |
878 | | |
879 | 0 | p = end; |
880 | 0 | } |
881 | | |
882 | | /* If there's nothing left to parse, then it's ok. */ |
883 | 0 | return !*p; |
884 | 0 | } |