/src/glib/glib/ghostutils.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */ |
2 | | |
3 | | /* GLIB - Library of useful routines for C programming |
4 | | * Copyright (C) 2008 Red Hat, Inc. |
5 | | * |
6 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
7 | | * |
8 | | * This library is free software; you can redistribute it and/or |
9 | | * modify it under the terms of the GNU Lesser General Public |
10 | | * License as published by the Free Software Foundation; either |
11 | | * version 2.1 of the License, or (at your option) any later version. |
12 | | * |
13 | | * This library is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 | | * Lesser General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU Lesser General |
19 | | * Public License along with this library; if not, see <http://www.gnu.org/licenses/>. |
20 | | */ |
21 | | |
22 | | #include "config.h" |
23 | | #include "glibconfig.h" |
24 | | |
25 | | #include <string.h> |
26 | | |
27 | | #ifdef G_OS_UNIX |
28 | | #include <unistd.h> |
29 | | #endif |
30 | | |
31 | | #include "ghostutils.h" |
32 | | |
33 | | #include "garray.h" |
34 | | #include "gmem.h" |
35 | | #include "gstring.h" |
36 | | #include "gstrfuncs.h" |
37 | | #include "glibintl.h" |
38 | | |
39 | | #ifdef G_PLATFORM_WIN32 |
40 | | #include <windows.h> |
41 | | #endif |
42 | | |
43 | | |
44 | | /** |
45 | | * SECTION:ghostutils |
46 | | * @short_description: Internet hostname utilities |
47 | | * |
48 | | * Functions for manipulating internet hostnames; in particular, for |
49 | | * converting between Unicode and ASCII-encoded forms of |
50 | | * Internationalized Domain Names (IDNs). |
51 | | * |
52 | | * The |
53 | | * [Internationalized Domain Names for Applications (IDNA)](http://www.ietf.org/rfc/rfc3490.txt) |
54 | | * standards allow for the use |
55 | | * of Unicode domain names in applications, while providing |
56 | | * backward-compatibility with the old ASCII-only DNS, by defining an |
57 | | * ASCII-Compatible Encoding of any given Unicode name, which can be |
58 | | * used with non-IDN-aware applications and protocols. (For example, |
59 | | * "Παν語.org" maps to "xn--4wa8awb4637h.org".) |
60 | | **/ |
61 | | |
/* Prefix that marks a label as ASCII-Compatible Encoding, and the
 * number of bytes in that prefix.
 */
#define IDNA_ACE_PREFIX "xn--"
#define IDNA_ACE_PREFIX_LEN 4

/* Parameter values for the Punycode algorithm, as given in RFC 3492. */

#define PUNYCODE_BASE 36
#define PUNYCODE_TMIN 1
#define PUNYCODE_TMAX 26
#define PUNYCODE_SKEW 38
#define PUNYCODE_DAMP 700
#define PUNYCODE_INITIAL_BIAS 72
#define PUNYCODE_INITIAL_N 0x80

/* Non-zero when @cp is below 0x80, i.e. fits in 7-bit ASCII. */
#define PUNYCODE_IS_BASIC(cp) ((guint)(cp) < 0x80)
76 | | |
77 | | /* Encode/decode a single base-36 digit */ |
78 | | static inline gchar |
79 | | encode_digit (guint dig) |
80 | 0 | { |
81 | 0 | if (dig < 26) |
82 | 0 | return dig + 'a'; |
83 | 0 | else |
84 | 0 | return dig - 26 + '0'; |
85 | 0 | } |
86 | | |
87 | | static inline guint |
88 | | decode_digit (gchar dig) |
89 | 0 | { |
90 | 0 | if (dig >= 'A' && dig <= 'Z') |
91 | 0 | return dig - 'A'; |
92 | 0 | else if (dig >= 'a' && dig <= 'z') |
93 | 0 | return dig - 'a'; |
94 | 0 | else if (dig >= '0' && dig <= '9') |
95 | 0 | return dig - '0' + 26; |
96 | 0 | else |
97 | 0 | return G_MAXUINT; |
98 | 0 | } |
99 | | |
100 | | /* Punycode bias adaptation algorithm, RFC 3492 section 6.1 */ |
101 | | static guint |
102 | | adapt (guint delta, |
103 | | guint numpoints, |
104 | | gboolean firsttime) |
105 | 0 | { |
106 | 0 | guint k; |
107 | |
|
108 | 0 | delta = firsttime ? delta / PUNYCODE_DAMP : delta / 2; |
109 | 0 | delta += delta / numpoints; |
110 | |
|
111 | 0 | k = 0; |
112 | 0 | while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2) |
113 | 0 | { |
114 | 0 | delta /= PUNYCODE_BASE - PUNYCODE_TMIN; |
115 | 0 | k += PUNYCODE_BASE; |
116 | 0 | } |
117 | |
|
118 | 0 | return k + ((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta / |
119 | 0 | (delta + PUNYCODE_SKEW)); |
120 | 0 | } |
121 | | |
122 | | /* Punycode encoder, RFC 3492 section 6.3. The algorithm is |
123 | | * sufficiently bizarre that it's not really worth trying to explain |
124 | | * here. |
125 | | */ |
126 | | static gboolean |
127 | | punycode_encode (const gchar *input_utf8, |
128 | | gsize input_utf8_length, |
129 | | GString *output) |
130 | 0 | { |
131 | 0 | guint delta, handled_chars, num_basic_chars, bias, j, q, k, t, digit; |
132 | 0 | gunichar n, m, *input; |
133 | 0 | glong written_chars; |
134 | 0 | gsize input_length; |
135 | 0 | gboolean success = FALSE; |
136 | | |
137 | | /* Convert from UTF-8 to Unicode code points */ |
138 | 0 | input = g_utf8_to_ucs4 (input_utf8, input_utf8_length, NULL, |
139 | 0 | &written_chars, NULL); |
140 | 0 | if (!input) |
141 | 0 | return FALSE; |
142 | | |
143 | 0 | input_length = (gsize) (written_chars > 0 ? written_chars : 0); |
144 | | |
145 | | /* Copy basic chars */ |
146 | 0 | for (j = num_basic_chars = 0; j < input_length; j++) |
147 | 0 | { |
148 | 0 | if (PUNYCODE_IS_BASIC (input[j])) |
149 | 0 | { |
150 | 0 | g_string_append_c (output, g_ascii_tolower (input[j])); |
151 | 0 | num_basic_chars++; |
152 | 0 | } |
153 | 0 | } |
154 | 0 | if (num_basic_chars) |
155 | 0 | g_string_append_c (output, '-'); |
156 | |
|
157 | 0 | handled_chars = num_basic_chars; |
158 | | |
159 | | /* Encode non-basic chars */ |
160 | 0 | delta = 0; |
161 | 0 | bias = PUNYCODE_INITIAL_BIAS; |
162 | 0 | n = PUNYCODE_INITIAL_N; |
163 | 0 | while (handled_chars < input_length) |
164 | 0 | { |
165 | | /* let m = the minimum {non-basic} code point >= n in the input */ |
166 | 0 | for (m = G_MAXUINT, j = 0; j < input_length; j++) |
167 | 0 | { |
168 | 0 | if (input[j] >= n && input[j] < m) |
169 | 0 | m = input[j]; |
170 | 0 | } |
171 | |
|
172 | 0 | if (m - n > (G_MAXUINT - delta) / (handled_chars + 1)) |
173 | 0 | goto fail; |
174 | 0 | delta += (m - n) * (handled_chars + 1); |
175 | 0 | n = m; |
176 | |
|
177 | 0 | for (j = 0; j < input_length; j++) |
178 | 0 | { |
179 | 0 | if (input[j] < n) |
180 | 0 | { |
181 | 0 | if (++delta == 0) |
182 | 0 | goto fail; |
183 | 0 | } |
184 | 0 | else if (input[j] == n) |
185 | 0 | { |
186 | 0 | q = delta; |
187 | 0 | for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE) |
188 | 0 | { |
189 | 0 | if (k <= bias) |
190 | 0 | t = PUNYCODE_TMIN; |
191 | 0 | else if (k >= bias + PUNYCODE_TMAX) |
192 | 0 | t = PUNYCODE_TMAX; |
193 | 0 | else |
194 | 0 | t = k - bias; |
195 | 0 | if (q < t) |
196 | 0 | break; |
197 | 0 | digit = t + (q - t) % (PUNYCODE_BASE - t); |
198 | 0 | g_string_append_c (output, encode_digit (digit)); |
199 | 0 | q = (q - t) / (PUNYCODE_BASE - t); |
200 | 0 | } |
201 | |
|
202 | 0 | g_string_append_c (output, encode_digit (q)); |
203 | 0 | bias = adapt (delta, handled_chars + 1, handled_chars == num_basic_chars); |
204 | 0 | delta = 0; |
205 | 0 | handled_chars++; |
206 | 0 | } |
207 | 0 | } |
208 | | |
209 | 0 | delta++; |
210 | 0 | n++; |
211 | 0 | } |
212 | | |
213 | 0 | success = TRUE; |
214 | |
|
215 | 0 | fail: |
216 | 0 | g_free (input); |
217 | 0 | return success; |
218 | 0 | } |
219 | | |
/* Code points taken from RFC 3454, Table B.1. Note that @ch is
 * evaluated more than once.
 */
#define idna_is_junk(ch) ( \
  (ch) == 0x00AD || (ch) == 0x1806 || (ch) == 0x200B || \
  (ch) == 0x2060 || (ch) == 0xFEFF || (ch) == 0x034F || \
  (ch) == 0x180B || (ch) == 0x180C || (ch) == 0x180D || \
  (ch) == 0x200C || (ch) == 0x200D || \
  ((ch) >= 0xFE00 && (ch) <= 0xFE0F) )
222 | | |
223 | | /* Scan @str for "junk" and return a cleaned-up string if any junk |
224 | | * is found. Else return %NULL. |
225 | | */ |
226 | | static gchar * |
227 | | remove_junk (const gchar *str, |
228 | | gint len) |
229 | 0 | { |
230 | 0 | GString *cleaned = NULL; |
231 | 0 | const gchar *p; |
232 | 0 | gunichar ch; |
233 | |
|
234 | 0 | for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p)) |
235 | 0 | { |
236 | 0 | ch = g_utf8_get_char (p); |
237 | 0 | if (idna_is_junk (ch)) |
238 | 0 | { |
239 | 0 | if (!cleaned) |
240 | 0 | { |
241 | 0 | cleaned = g_string_new (NULL); |
242 | 0 | g_string_append_len (cleaned, str, p - str); |
243 | 0 | } |
244 | 0 | } |
245 | 0 | else if (cleaned) |
246 | 0 | g_string_append_unichar (cleaned, ch); |
247 | 0 | } |
248 | |
|
249 | 0 | if (cleaned) |
250 | 0 | return g_string_free (cleaned, FALSE); |
251 | 0 | else |
252 | 0 | return NULL; |
253 | 0 | } |
254 | | |
255 | | static inline gboolean |
256 | | contains_uppercase_letters (const gchar *str, |
257 | | gint len) |
258 | 0 | { |
259 | 0 | const gchar *p; |
260 | |
|
261 | 0 | for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p)) |
262 | 0 | { |
263 | 0 | if (g_unichar_isupper (g_utf8_get_char (p))) |
264 | 0 | return TRUE; |
265 | 0 | } |
266 | 0 | return FALSE; |
267 | 0 | } |
268 | | |
269 | | static inline gboolean |
270 | | contains_non_ascii (const gchar *str, |
271 | | gint len) |
272 | 0 | { |
273 | 0 | const gchar *p; |
274 | |
|
275 | 0 | for (p = str; len == -1 ? *p : p < str + len; p++) |
276 | 0 | { |
277 | 0 | if ((guchar)*p > 0x80) |
278 | 0 | return TRUE; |
279 | 0 | } |
280 | 0 | return FALSE; |
281 | 0 | } |
282 | | |
283 | | /* RFC 3454, Appendix C. ish. */ |
284 | | static inline gboolean |
285 | | idna_is_prohibited (gunichar ch) |
286 | 0 | { |
287 | 0 | switch (g_unichar_type (ch)) |
288 | 0 | { |
289 | 0 | case G_UNICODE_CONTROL: |
290 | 0 | case G_UNICODE_FORMAT: |
291 | 0 | case G_UNICODE_UNASSIGNED: |
292 | 0 | case G_UNICODE_PRIVATE_USE: |
293 | 0 | case G_UNICODE_SURROGATE: |
294 | 0 | case G_UNICODE_LINE_SEPARATOR: |
295 | 0 | case G_UNICODE_PARAGRAPH_SEPARATOR: |
296 | 0 | case G_UNICODE_SPACE_SEPARATOR: |
297 | 0 | return TRUE; |
298 | | |
299 | 0 | case G_UNICODE_OTHER_SYMBOL: |
300 | 0 | if (ch == 0xFFFC || ch == 0xFFFD || |
301 | 0 | (ch >= 0x2FF0 && ch <= 0x2FFB)) |
302 | 0 | return TRUE; |
303 | 0 | return FALSE; |
304 | | |
305 | 0 | case G_UNICODE_NON_SPACING_MARK: |
306 | 0 | if (ch == 0x0340 || ch == 0x0341) |
307 | 0 | return TRUE; |
308 | 0 | return FALSE; |
309 | | |
310 | 0 | default: |
311 | 0 | return FALSE; |
312 | 0 | } |
313 | 0 | } |
314 | | |
315 | | /* RFC 3491 IDN cleanup algorithm. */ |
316 | | static gchar * |
317 | | nameprep (const gchar *hostname, |
318 | | gint len, |
319 | | gboolean *is_unicode) |
320 | 0 | { |
321 | 0 | gchar *name, *tmp = NULL, *p; |
322 | | |
323 | | /* It would be nice if we could do this without repeatedly |
324 | | * allocating strings and converting back and forth between |
325 | | * gunichars and UTF-8... The code does at least avoid doing most of |
326 | | * the sub-operations when they would just be equivalent to a |
327 | | * g_strdup(). |
328 | | */ |
329 | | |
330 | | /* Remove presentation-only characters */ |
331 | 0 | name = remove_junk (hostname, len); |
332 | 0 | if (name) |
333 | 0 | { |
334 | 0 | tmp = name; |
335 | 0 | len = -1; |
336 | 0 | } |
337 | 0 | else |
338 | 0 | name = (gchar *)hostname; |
339 | | |
340 | | /* Convert to lowercase */ |
341 | 0 | if (contains_uppercase_letters (name, len)) |
342 | 0 | { |
343 | 0 | name = g_utf8_strdown (name, len); |
344 | 0 | g_free (tmp); |
345 | 0 | tmp = name; |
346 | 0 | len = -1; |
347 | 0 | } |
348 | | |
349 | | /* If there are no UTF8 characters, we're done. */ |
350 | 0 | if (!contains_non_ascii (name, len)) |
351 | 0 | { |
352 | 0 | *is_unicode = FALSE; |
353 | 0 | if (name == (gchar *)hostname) |
354 | 0 | return len == -1 ? g_strdup (hostname) : g_strndup (hostname, len); |
355 | 0 | else |
356 | 0 | return name; |
357 | 0 | } |
358 | | |
359 | 0 | *is_unicode = TRUE; |
360 | | |
361 | | /* Normalize */ |
362 | 0 | name = g_utf8_normalize (name, len, G_NORMALIZE_NFKC); |
363 | 0 | g_free (tmp); |
364 | 0 | tmp = name; |
365 | |
|
366 | 0 | if (!name) |
367 | 0 | return NULL; |
368 | | |
369 | | /* KC normalization may have created more capital letters (eg, |
370 | | * angstrom -> capital A with ring). So we have to lowercasify a |
371 | | * second time. (This is more-or-less how the nameprep algorithm |
372 | | * does it. If tolower(nfkc(tolower(X))) is guaranteed to be the |
373 | | * same as tolower(nfkc(X)), then we could skip the first tolower, |
374 | | * but I'm not sure it is.) |
375 | | */ |
376 | 0 | if (contains_uppercase_letters (name, -1)) |
377 | 0 | { |
378 | 0 | name = g_utf8_strdown (name, -1); |
379 | 0 | g_free (tmp); |
380 | 0 | tmp = name; |
381 | 0 | } |
382 | | |
383 | | /* Check for prohibited characters */ |
384 | 0 | for (p = name; *p; p = g_utf8_next_char (p)) |
385 | 0 | { |
386 | 0 | if (idna_is_prohibited (g_utf8_get_char (p))) |
387 | 0 | { |
388 | 0 | name = NULL; |
389 | 0 | g_free (tmp); |
390 | 0 | goto done; |
391 | 0 | } |
392 | 0 | } |
393 | | |
394 | | /* FIXME: We're supposed to verify certain constraints on bidi |
395 | | * characters, but glib does not appear to have that information. |
396 | | */ |
397 | | |
398 | 0 | done: |
399 | 0 | return name; |
400 | 0 | } |
401 | | |
/* Per RFC 3490 section 3.1, the label separators are '.', U+3002,
 * U+FF0E and U+FF61. The last three are matched here by their 3-byte
 * UTF-8 encodings (E3 80 82, EF BC 8E and EF BD A1 respectively).
 *
 * @str must be '\0'-terminated: up to three bytes are inspected, and
 * the short-circuit evaluation relies on the terminator to stop.
 */
#define idna_is_dot(str) ( \
  ((guchar)(str)[0] == '.') || \
  ((guchar)(str)[0] == 0xE3 && \
   (guchar)(str)[1] == 0x80 && \
   (guchar)(str)[2] == 0x82) || \
  ((guchar)(str)[0] == 0xEF && \
   (guchar)(str)[1] == 0xBC && \
   (guchar)(str)[2] == 0x8E) || \
  ((guchar)(str)[0] == 0xEF && \
   (guchar)(str)[1] == 0xBD && \
   (guchar)(str)[2] == 0xA1) )
410 | | |
411 | | static const gchar * |
412 | | idna_end_of_label (const gchar *str) |
413 | 0 | { |
414 | 0 | for (; *str; str = g_utf8_next_char (str)) |
415 | 0 | { |
416 | 0 | if (idna_is_dot (str)) |
417 | 0 | return str; |
418 | 0 | } |
419 | 0 | return str; |
420 | 0 | } |
421 | | |
422 | | static gsize |
423 | | get_hostname_max_length_bytes (void) |
424 | 0 | { |
425 | | #if defined(G_OS_WIN32) |
426 | | wchar_t tmp[MAX_COMPUTERNAME_LENGTH]; |
427 | | return sizeof (tmp) / sizeof (tmp[0]); |
428 | | #elif defined(_SC_HOST_NAME_MAX) |
429 | | glong max = sysconf (_SC_HOST_NAME_MAX); |
430 | 0 | if (max > 0) |
431 | 0 | return (gsize) max; |
432 | | |
433 | 0 | #ifdef HOST_NAME_MAX |
434 | 0 | return HOST_NAME_MAX; |
435 | | #else |
436 | | return _POSIX_HOST_NAME_MAX; |
437 | | #endif /* HOST_NAME_MAX */ |
438 | | #else |
439 | | /* Fallback to some reasonable value |
440 | | * See https://stackoverflow.com/questions/8724954/what-is-the-maximum-number-of-characters-for-a-host-name-in-unix/28918017#28918017 */ |
441 | | return 255; |
442 | | #endif |
443 | 0 | } |
444 | | |
445 | | /* Returns %TRUE if `strlen (str) > comparison_length`, but without actually |
446 | | * running `strlen(str)`, as that would take a very long time for long |
447 | | * (untrusted) input strings. */ |
448 | | static gboolean |
449 | | strlen_greater_than (const gchar *str, |
450 | | gsize comparison_length) |
451 | 0 | { |
452 | 0 | gsize i; |
453 | |
|
454 | 0 | for (i = 0; str[i] != '\0'; i++) |
455 | 0 | if (i > comparison_length) |
456 | 0 | return TRUE; |
457 | | |
458 | 0 | return FALSE; |
459 | 0 | } |
460 | | |
461 | | /** |
462 | | * g_hostname_to_ascii: |
463 | | * @hostname: a valid UTF-8 or ASCII hostname |
464 | | * |
465 | | * Converts @hostname to its canonical ASCII form; an ASCII-only |
466 | | * string containing no uppercase letters and not ending with a |
467 | | * trailing dot. |
468 | | * |
469 | | * Returns: (nullable) (transfer full): an ASCII hostname, which must be freed, |
470 | | * or %NULL if @hostname is in some way invalid. |
471 | | * |
472 | | * Since: 2.22 |
473 | | **/ |
474 | | gchar * |
475 | | g_hostname_to_ascii (const gchar *hostname) |
476 | 0 | { |
477 | 0 | gchar *name, *label, *p; |
478 | 0 | GString *out; |
479 | 0 | gssize llen, oldlen; |
480 | 0 | gboolean unicode; |
481 | 0 | gsize hostname_max_length_bytes = get_hostname_max_length_bytes (); |
482 | | |
483 | | /* Do an initial check on the hostname length, as overlong hostnames take a |
484 | | * long time in the IDN cleanup algorithm in nameprep(). The ultimate |
485 | | * restriction is that the IDN-decoded (i.e. pure ASCII) hostname cannot be |
486 | | * longer than 255 bytes. That’s the least restrictive limit on hostname |
487 | | * length of all the ways hostnames can be interpreted. Typically, the |
488 | | * hostname will be an FQDN, which is limited to 253 bytes long. POSIX |
489 | | * hostnames are limited to `get_hostname_max_length_bytes()` (typically 255 |
490 | | * bytes). |
491 | | * |
492 | | * See https://stackoverflow.com/a/28918017/2931197 |
493 | | * |
494 | | * It’s possible for a hostname to be %-encoded, in which case its decoded |
495 | | * length will be as much as 3× shorter. |
496 | | * |
497 | | * It’s also possible for a hostname to use overlong UTF-8 encodings, in which |
498 | | * case its decoded length will be as much as 4× shorter. |
499 | | * |
500 | | * Note: This check is not intended as an absolute guarantee that a hostname |
501 | | * is the right length and will be accepted by other systems. It’s intended to |
502 | | * stop wildly-invalid hostnames from taking forever in nameprep(). |
503 | | */ |
504 | 0 | if (hostname_max_length_bytes <= G_MAXSIZE / 4 && |
505 | 0 | strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes))) |
506 | 0 | return NULL; |
507 | | |
508 | 0 | label = name = nameprep (hostname, -1, &unicode); |
509 | 0 | if (!name || !unicode) |
510 | 0 | return name; |
511 | | |
512 | 0 | out = g_string_new (NULL); |
513 | |
|
514 | 0 | do |
515 | 0 | { |
516 | 0 | unicode = FALSE; |
517 | 0 | for (p = label; *p && !idna_is_dot (p); p++) |
518 | 0 | { |
519 | 0 | if ((guchar)*p > 0x80) |
520 | 0 | unicode = TRUE; |
521 | 0 | } |
522 | |
|
523 | 0 | oldlen = out->len; |
524 | 0 | llen = p - label; |
525 | 0 | if (unicode) |
526 | 0 | { |
527 | 0 | if (!strncmp (label, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
528 | 0 | goto fail; |
529 | | |
530 | 0 | g_string_append (out, IDNA_ACE_PREFIX); |
531 | 0 | if (!punycode_encode (label, llen, out)) |
532 | 0 | goto fail; |
533 | 0 | } |
534 | 0 | else |
535 | 0 | g_string_append_len (out, label, llen); |
536 | | |
537 | 0 | if (out->len - oldlen > 63) |
538 | 0 | goto fail; |
539 | | |
540 | 0 | label += llen; |
541 | 0 | if (*label) |
542 | 0 | label = g_utf8_next_char (label); |
543 | 0 | if (*label) |
544 | 0 | g_string_append_c (out, '.'); |
545 | 0 | } |
546 | 0 | while (*label); |
547 | | |
548 | 0 | g_free (name); |
549 | 0 | return g_string_free (out, FALSE); |
550 | | |
551 | 0 | fail: |
552 | 0 | g_free (name); |
553 | 0 | g_string_free (out, TRUE); |
554 | 0 | return NULL; |
555 | 0 | } |
556 | | |
557 | | /** |
558 | | * g_hostname_is_non_ascii: |
559 | | * @hostname: a hostname |
560 | | * |
561 | | * Tests if @hostname contains Unicode characters. If this returns |
562 | | * %TRUE, you need to encode the hostname with g_hostname_to_ascii() |
563 | | * before using it in non-IDN-aware contexts. |
564 | | * |
565 | | * Note that a hostname might contain a mix of encoded and unencoded |
566 | | * segments, and so it is possible for g_hostname_is_non_ascii() and |
567 | | * g_hostname_is_ascii_encoded() to both return %TRUE for a name. |
568 | | * |
569 | | * Returns: %TRUE if @hostname contains any non-ASCII characters |
570 | | * |
571 | | * Since: 2.22 |
572 | | **/ |
573 | | gboolean |
574 | | g_hostname_is_non_ascii (const gchar *hostname) |
575 | 0 | { |
576 | 0 | return contains_non_ascii (hostname, -1); |
577 | 0 | } |
578 | | |
579 | | /* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(), |
580 | | * read the RFC if you want to understand what this is actually doing. |
581 | | */ |
582 | | static gboolean |
583 | | punycode_decode (const gchar *input, |
584 | | gsize input_length, |
585 | | GString *output) |
586 | 0 | { |
587 | 0 | GArray *output_chars; |
588 | 0 | gunichar n; |
589 | 0 | guint i, bias; |
590 | 0 | guint oldi, w, k, digit, t; |
591 | 0 | const gchar *split; |
592 | |
|
593 | 0 | n = PUNYCODE_INITIAL_N; |
594 | 0 | i = 0; |
595 | 0 | bias = PUNYCODE_INITIAL_BIAS; |
596 | |
|
597 | 0 | split = input + input_length - 1; |
598 | 0 | while (split > input && *split != '-') |
599 | 0 | split--; |
600 | 0 | if (split > input) |
601 | 0 | { |
602 | 0 | output_chars = g_array_sized_new (FALSE, FALSE, sizeof (gunichar), |
603 | 0 | split - input); |
604 | 0 | input_length -= (split - input) + 1; |
605 | 0 | while (input < split) |
606 | 0 | { |
607 | 0 | gunichar ch = (gunichar)*input++; |
608 | 0 | if (!PUNYCODE_IS_BASIC (ch)) |
609 | 0 | goto fail; |
610 | 0 | g_array_append_val (output_chars, ch); |
611 | 0 | } |
612 | 0 | input++; |
613 | 0 | } |
614 | 0 | else |
615 | 0 | output_chars = g_array_new (FALSE, FALSE, sizeof (gunichar)); |
616 | | |
617 | 0 | while (input_length) |
618 | 0 | { |
619 | 0 | oldi = i; |
620 | 0 | w = 1; |
621 | 0 | for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE) |
622 | 0 | { |
623 | 0 | if (!input_length--) |
624 | 0 | goto fail; |
625 | 0 | digit = decode_digit (*input++); |
626 | 0 | if (digit >= PUNYCODE_BASE) |
627 | 0 | goto fail; |
628 | 0 | if (digit > (G_MAXUINT - i) / w) |
629 | 0 | goto fail; |
630 | 0 | i += digit * w; |
631 | 0 | if (k <= bias) |
632 | 0 | t = PUNYCODE_TMIN; |
633 | 0 | else if (k >= bias + PUNYCODE_TMAX) |
634 | 0 | t = PUNYCODE_TMAX; |
635 | 0 | else |
636 | 0 | t = k - bias; |
637 | 0 | if (digit < t) |
638 | 0 | break; |
639 | 0 | if (w > G_MAXUINT / (PUNYCODE_BASE - t)) |
640 | 0 | goto fail; |
641 | 0 | w *= (PUNYCODE_BASE - t); |
642 | 0 | } |
643 | | |
644 | 0 | bias = adapt (i - oldi, output_chars->len + 1, oldi == 0); |
645 | |
|
646 | 0 | if (i / (output_chars->len + 1) > G_MAXUINT - n) |
647 | 0 | goto fail; |
648 | 0 | n += i / (output_chars->len + 1); |
649 | 0 | i %= (output_chars->len + 1); |
650 | |
|
651 | 0 | g_array_insert_val (output_chars, i++, n); |
652 | 0 | } |
653 | | |
654 | 0 | for (i = 0; i < output_chars->len; i++) |
655 | 0 | g_string_append_unichar (output, g_array_index (output_chars, gunichar, i)); |
656 | 0 | g_array_free (output_chars, TRUE); |
657 | 0 | return TRUE; |
658 | | |
659 | 0 | fail: |
660 | 0 | g_array_free (output_chars, TRUE); |
661 | 0 | return FALSE; |
662 | 0 | } |
663 | | |
664 | | /** |
665 | | * g_hostname_to_unicode: |
666 | | * @hostname: a valid UTF-8 or ASCII hostname |
667 | | * |
668 | | * Converts @hostname to its canonical presentation form; a UTF-8 |
669 | | * string in Unicode normalization form C, containing no uppercase |
670 | | * letters, no forbidden characters, and no ASCII-encoded segments, |
671 | | * and not ending with a trailing dot. |
672 | | * |
673 | | * Of course if @hostname is not an internationalized hostname, then |
674 | | * the canonical presentation form will be entirely ASCII. |
675 | | * |
676 | | * Returns: (nullable) (transfer full): a UTF-8 hostname, which must be freed, |
677 | | * or %NULL if @hostname is in some way invalid. |
678 | | * |
679 | | * Since: 2.22 |
680 | | **/ |
681 | | gchar * |
682 | | g_hostname_to_unicode (const gchar *hostname) |
683 | 0 | { |
684 | 0 | GString *out; |
685 | 0 | gssize llen; |
686 | 0 | gsize hostname_max_length_bytes = get_hostname_max_length_bytes (); |
687 | | |
688 | | /* See the comment at the top of g_hostname_to_ascii(). */ |
689 | 0 | if (hostname_max_length_bytes <= G_MAXSIZE / 4 && |
690 | 0 | strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes))) |
691 | 0 | return NULL; |
692 | | |
693 | 0 | out = g_string_new (NULL); |
694 | |
|
695 | 0 | do |
696 | 0 | { |
697 | 0 | llen = idna_end_of_label (hostname) - hostname; |
698 | 0 | if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
699 | 0 | { |
700 | 0 | hostname += IDNA_ACE_PREFIX_LEN; |
701 | 0 | llen -= IDNA_ACE_PREFIX_LEN; |
702 | 0 | if (!punycode_decode (hostname, llen, out)) |
703 | 0 | { |
704 | 0 | g_string_free (out, TRUE); |
705 | 0 | return NULL; |
706 | 0 | } |
707 | 0 | } |
708 | 0 | else |
709 | 0 | { |
710 | 0 | gboolean unicode; |
711 | 0 | gchar *canonicalized = nameprep (hostname, llen, &unicode); |
712 | |
|
713 | 0 | if (!canonicalized) |
714 | 0 | { |
715 | 0 | g_string_free (out, TRUE); |
716 | 0 | return NULL; |
717 | 0 | } |
718 | 0 | g_string_append (out, canonicalized); |
719 | 0 | g_free (canonicalized); |
720 | 0 | } |
721 | | |
722 | 0 | hostname += llen; |
723 | 0 | if (*hostname) |
724 | 0 | hostname = g_utf8_next_char (hostname); |
725 | 0 | if (*hostname) |
726 | 0 | g_string_append_c (out, '.'); |
727 | 0 | } |
728 | 0 | while (*hostname); |
729 | | |
730 | 0 | return g_string_free (out, FALSE); |
731 | 0 | } |
732 | | |
733 | | /** |
734 | | * g_hostname_is_ascii_encoded: |
735 | | * @hostname: a hostname |
736 | | * |
737 | | * Tests if @hostname contains segments with an ASCII-compatible |
738 | | * encoding of an Internationalized Domain Name. If this returns |
739 | | * %TRUE, you should decode the hostname with g_hostname_to_unicode() |
740 | | * before displaying it to the user. |
741 | | * |
742 | | * Note that a hostname might contain a mix of encoded and unencoded |
743 | | * segments, and so it is possible for g_hostname_is_non_ascii() and |
744 | | * g_hostname_is_ascii_encoded() to both return %TRUE for a name. |
745 | | * |
746 | | * Returns: %TRUE if @hostname contains any ASCII-encoded |
747 | | * segments. |
748 | | * |
749 | | * Since: 2.22 |
750 | | **/ |
751 | | gboolean |
752 | | g_hostname_is_ascii_encoded (const gchar *hostname) |
753 | 0 | { |
754 | 0 | while (1) |
755 | 0 | { |
756 | 0 | if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
757 | 0 | return TRUE; |
758 | 0 | hostname = idna_end_of_label (hostname); |
759 | 0 | if (*hostname) |
760 | 0 | hostname = g_utf8_next_char (hostname); |
761 | 0 | if (!*hostname) |
762 | 0 | return FALSE; |
763 | 0 | } |
764 | 0 | } |
765 | | |
766 | | /** |
767 | | * g_hostname_is_ip_address: |
768 | | * @hostname: a hostname (or IP address in string form) |
769 | | * |
770 | | * Tests if @hostname is the string form of an IPv4 or IPv6 address. |
771 | | * (Eg, "192.168.0.1".) |
772 | | * |
773 | | * Since 2.66, IPv6 addresses with a zone-id are accepted (RFC6874). |
774 | | * |
775 | | * Returns: %TRUE if @hostname is an IP address |
776 | | * |
777 | | * Since: 2.22 |
778 | | **/ |
779 | | gboolean |
780 | | g_hostname_is_ip_address (const gchar *hostname) |
781 | 0 | { |
782 | 0 | gchar *p, *end; |
783 | 0 | gint nsegments, octet; |
784 | | |
785 | | /* On Linux we could implement this using inet_pton, but the Windows |
786 | | * equivalent of that requires linking against winsock, so we just |
787 | | * figure this out ourselves. Tested by tests/hostutils.c. |
788 | | */ |
789 | |
|
790 | 0 | p = (char *)hostname; |
791 | |
|
792 | 0 | if (strchr (p, ':')) |
793 | 0 | { |
794 | 0 | gboolean skipped; |
795 | | |
796 | | /* If it contains a ':', it's an IPv6 address (assuming it's an |
797 | | * IP address at all). This consists of eight ':'-separated |
798 | | * segments, each containing a 1-4 digit hex number, except that |
799 | | * optionally: (a) the last two segments can be replaced by an |
800 | | * IPv4 address, and (b) a single span of 1 to 8 "0000" segments |
801 | | * can be replaced with just "::". |
802 | | */ |
803 | |
|
804 | 0 | nsegments = 0; |
805 | 0 | skipped = FALSE; |
806 | 0 | while (*p && *p != '%' && nsegments < 8) |
807 | 0 | { |
808 | | /* Each segment after the first must be preceded by a ':'. |
809 | | * (We also handle half of the "string starts with ::" case |
810 | | * here.) |
811 | | */ |
812 | 0 | if (p != (char *)hostname || (p[0] == ':' && p[1] == ':')) |
813 | 0 | { |
814 | 0 | if (*p != ':') |
815 | 0 | return FALSE; |
816 | 0 | p++; |
817 | 0 | } |
818 | | |
819 | | /* If there's another ':', it means we're skipping some segments */ |
820 | 0 | if (*p == ':' && !skipped) |
821 | 0 | { |
822 | 0 | skipped = TRUE; |
823 | 0 | nsegments++; |
824 | | |
825 | | /* Handle the "string ends with ::" case */ |
826 | 0 | if (!p[1]) |
827 | 0 | p++; |
828 | |
|
829 | 0 | continue; |
830 | 0 | } |
831 | | |
832 | | /* Read the segment, make sure it's valid. */ |
833 | 0 | for (end = p; g_ascii_isxdigit (*end); end++) |
834 | 0 | ; |
835 | 0 | if (end == p || end > p + 4) |
836 | 0 | return FALSE; |
837 | | |
838 | 0 | if (*end == '.') |
839 | 0 | { |
840 | 0 | if ((nsegments == 6 && !skipped) || (nsegments <= 6 && skipped)) |
841 | 0 | goto parse_ipv4; |
842 | 0 | else |
843 | 0 | return FALSE; |
844 | 0 | } |
845 | | |
846 | 0 | nsegments++; |
847 | 0 | p = end; |
848 | 0 | } |
849 | | |
850 | 0 | return (!*p || (p[0] == '%' && p[1])) && (nsegments == 8 || skipped); |
851 | 0 | } |
852 | | |
853 | 0 | parse_ipv4: |
854 | | |
855 | | /* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */ |
856 | 0 | for (nsegments = 0; nsegments < 4; nsegments++) |
857 | 0 | { |
858 | 0 | if (nsegments != 0) |
859 | 0 | { |
860 | 0 | if (*p != '.') |
861 | 0 | return FALSE; |
862 | 0 | p++; |
863 | 0 | } |
864 | | |
865 | | /* Check the segment; a little tricker than the IPv6 case since |
866 | | * we can't allow extra leading 0s, and we can't assume that all |
867 | | * strings of valid length are within range. |
868 | | */ |
869 | 0 | octet = 0; |
870 | 0 | if (*p == '0') |
871 | 0 | end = p + 1; |
872 | 0 | else |
873 | 0 | { |
874 | 0 | for (end = p; g_ascii_isdigit (*end); end++) |
875 | 0 | { |
876 | 0 | octet = 10 * octet + (*end - '0'); |
877 | |
|
878 | 0 | if (octet > 255) |
879 | 0 | break; |
880 | 0 | } |
881 | 0 | } |
882 | 0 | if (end == p || end > p + 3 || octet > 255) |
883 | 0 | return FALSE; |
884 | | |
885 | 0 | p = end; |
886 | 0 | } |
887 | | |
888 | | /* If there's nothing left to parse, then it's ok. */ |
889 | 0 | return !*p; |
890 | 0 | } |