/src/glib/glib/ghostutils.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */ |
2 | | |
3 | | /* GLIB - Library of useful routines for C programming |
4 | | * Copyright (C) 2008 Red Hat, Inc. |
5 | | * |
6 | | * This library is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * This library is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General |
17 | | * Public License along with this library; if not, see <http://www.gnu.org/licenses/>. |
18 | | */ |
19 | | |
20 | | #include "config.h" |
21 | | #include "glibconfig.h" |
22 | | |
23 | | #include <string.h> |
24 | | |
25 | | #ifdef G_OS_UNIX |
26 | | #include <unistd.h> |
27 | | #endif |
28 | | |
29 | | #include "ghostutils.h" |
30 | | |
31 | | #include "garray.h" |
32 | | #include "gmem.h" |
33 | | #include "gstring.h" |
34 | | #include "gstrfuncs.h" |
35 | | #include "glibintl.h" |
36 | | |
37 | | #ifdef G_PLATFORM_WIN32 |
38 | | #include <windows.h> |
39 | | #endif |
40 | | |
41 | | |
42 | | /** |
43 | | * SECTION:ghostutils |
44 | | * @short_description: Internet hostname utilities |
45 | | * |
46 | | * Functions for manipulating internet hostnames; in particular, for |
47 | | * converting between Unicode and ASCII-encoded forms of |
48 | | * Internationalized Domain Names (IDNs). |
49 | | * |
50 | | * The |
51 | | * [Internationalized Domain Names for Applications (IDNA)](http://www.ietf.org/rfc/rfc3490.txt) |
52 | | * standards allow for the use |
53 | | * of Unicode domain names in applications, while providing |
54 | | * backward-compatibility with the old ASCII-only DNS, by defining an |
55 | | * ASCII-Compatible Encoding of any given Unicode name, which can be |
56 | | * used with non-IDN-aware applications and protocols. (For example, |
57 | | * "Παν語.org" maps to "xn--4wa8awb4637h.org".) |
58 | | **/ |
59 | | |
/* Prefix that marks a label as ASCII-Compatible Encoded, and its length
 * in bytes (not counting the terminating NUL).
 */
#define IDNA_ACE_PREFIX      "xn--"
#define IDNA_ACE_PREFIX_LEN  4

/* Punycode parameter values, taken from RFC 3492. */

#define PUNYCODE_BASE          36
#define PUNYCODE_TMIN          1
#define PUNYCODE_TMAX          26
#define PUNYCODE_SKEW          38
#define PUNYCODE_DAMP          700
#define PUNYCODE_INITIAL_BIAS  72
#define PUNYCODE_INITIAL_N     0x80

/* A "basic" code point is one below 0x80. */
#define PUNYCODE_IS_BASIC(cp)  ((guint)(cp) < 0x80)
74 | | |
75 | | /* Encode/decode a single base-36 digit */ |
76 | | static inline gchar |
77 | | encode_digit (guint dig) |
78 | 0 | { |
79 | 0 | if (dig < 26) |
80 | 0 | return dig + 'a'; |
81 | 0 | else |
82 | 0 | return dig - 26 + '0'; |
83 | 0 | } |
84 | | |
85 | | static inline guint |
86 | | decode_digit (gchar dig) |
87 | 0 | { |
88 | 0 | if (dig >= 'A' && dig <= 'Z') |
89 | 0 | return dig - 'A'; |
90 | 0 | else if (dig >= 'a' && dig <= 'z') |
91 | 0 | return dig - 'a'; |
92 | 0 | else if (dig >= '0' && dig <= '9') |
93 | 0 | return dig - '0' + 26; |
94 | 0 | else |
95 | 0 | return G_MAXUINT; |
96 | 0 | } |
97 | | |
98 | | /* Punycode bias adaptation algorithm, RFC 3492 section 6.1 */ |
99 | | static guint |
100 | | adapt (guint delta, |
101 | | guint numpoints, |
102 | | gboolean firsttime) |
103 | 0 | { |
104 | 0 | guint k; |
105 | |
|
106 | 0 | delta = firsttime ? delta / PUNYCODE_DAMP : delta / 2; |
107 | 0 | delta += delta / numpoints; |
108 | |
|
109 | 0 | k = 0; |
110 | 0 | while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2) |
111 | 0 | { |
112 | 0 | delta /= PUNYCODE_BASE - PUNYCODE_TMIN; |
113 | 0 | k += PUNYCODE_BASE; |
114 | 0 | } |
115 | |
|
116 | 0 | return k + ((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta / |
117 | 0 | (delta + PUNYCODE_SKEW)); |
118 | 0 | } |
119 | | |
120 | | /* Punycode encoder, RFC 3492 section 6.3. The algorithm is |
121 | | * sufficiently bizarre that it's not really worth trying to explain |
122 | | * here. |
123 | | */ |
124 | | static gboolean |
125 | | punycode_encode (const gchar *input_utf8, |
126 | | gsize input_utf8_length, |
127 | | GString *output) |
128 | 0 | { |
129 | 0 | guint delta, handled_chars, num_basic_chars, bias, j, q, k, t, digit; |
130 | 0 | gunichar n, m, *input; |
131 | 0 | glong input_length; |
132 | 0 | gboolean success = FALSE; |
133 | | |
134 | | /* Convert from UTF-8 to Unicode code points */ |
135 | 0 | input = g_utf8_to_ucs4 (input_utf8, input_utf8_length, NULL, |
136 | 0 | &input_length, NULL); |
137 | 0 | if (!input) |
138 | 0 | return FALSE; |
139 | | |
140 | | /* Copy basic chars */ |
141 | 0 | for (j = num_basic_chars = 0; j < input_length; j++) |
142 | 0 | { |
143 | 0 | if (PUNYCODE_IS_BASIC (input[j])) |
144 | 0 | { |
145 | 0 | g_string_append_c (output, g_ascii_tolower (input[j])); |
146 | 0 | num_basic_chars++; |
147 | 0 | } |
148 | 0 | } |
149 | 0 | if (num_basic_chars) |
150 | 0 | g_string_append_c (output, '-'); |
151 | |
|
152 | 0 | handled_chars = num_basic_chars; |
153 | | |
154 | | /* Encode non-basic chars */ |
155 | 0 | delta = 0; |
156 | 0 | bias = PUNYCODE_INITIAL_BIAS; |
157 | 0 | n = PUNYCODE_INITIAL_N; |
158 | 0 | while (handled_chars < input_length) |
159 | 0 | { |
160 | | /* let m = the minimum {non-basic} code point >= n in the input */ |
161 | 0 | for (m = G_MAXUINT, j = 0; j < input_length; j++) |
162 | 0 | { |
163 | 0 | if (input[j] >= n && input[j] < m) |
164 | 0 | m = input[j]; |
165 | 0 | } |
166 | |
|
167 | 0 | if (m - n > (G_MAXUINT - delta) / (handled_chars + 1)) |
168 | 0 | goto fail; |
169 | 0 | delta += (m - n) * (handled_chars + 1); |
170 | 0 | n = m; |
171 | |
|
172 | 0 | for (j = 0; j < input_length; j++) |
173 | 0 | { |
174 | 0 | if (input[j] < n) |
175 | 0 | { |
176 | 0 | if (++delta == 0) |
177 | 0 | goto fail; |
178 | 0 | } |
179 | 0 | else if (input[j] == n) |
180 | 0 | { |
181 | 0 | q = delta; |
182 | 0 | for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE) |
183 | 0 | { |
184 | 0 | if (k <= bias) |
185 | 0 | t = PUNYCODE_TMIN; |
186 | 0 | else if (k >= bias + PUNYCODE_TMAX) |
187 | 0 | t = PUNYCODE_TMAX; |
188 | 0 | else |
189 | 0 | t = k - bias; |
190 | 0 | if (q < t) |
191 | 0 | break; |
192 | 0 | digit = t + (q - t) % (PUNYCODE_BASE - t); |
193 | 0 | g_string_append_c (output, encode_digit (digit)); |
194 | 0 | q = (q - t) / (PUNYCODE_BASE - t); |
195 | 0 | } |
196 | |
|
197 | 0 | g_string_append_c (output, encode_digit (q)); |
198 | 0 | bias = adapt (delta, handled_chars + 1, handled_chars == num_basic_chars); |
199 | 0 | delta = 0; |
200 | 0 | handled_chars++; |
201 | 0 | } |
202 | 0 | } |
203 | | |
204 | 0 | delta++; |
205 | 0 | n++; |
206 | 0 | } |
207 | | |
208 | 0 | success = TRUE; |
209 | |
|
210 | 0 | fail: |
211 | 0 | g_free (input); |
212 | 0 | return success; |
213 | 0 | } |
214 | | |
215 | | /* From RFC 3454, Table B.1 */ |
216 | 0 | #define idna_is_junk(ch) ((ch) == 0x00AD || (ch) == 0x1806 || (ch) == 0x200B || (ch) == 0x2060 || (ch) == 0xFEFF || (ch) == 0x034F || (ch) == 0x180B || (ch) == 0x180C || (ch) == 0x180D || (ch) == 0x200C || (ch) == 0x200D || ((ch) >= 0xFE00 && (ch) <= 0xFE0F)) |
217 | | |
218 | | /* Scan @str for "junk" and return a cleaned-up string if any junk |
219 | | * is found. Else return %NULL. |
220 | | */ |
221 | | static gchar * |
222 | | remove_junk (const gchar *str, |
223 | | gint len) |
224 | 0 | { |
225 | 0 | GString *cleaned = NULL; |
226 | 0 | const gchar *p; |
227 | 0 | gunichar ch; |
228 | |
|
229 | 0 | for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p)) |
230 | 0 | { |
231 | 0 | ch = g_utf8_get_char (p); |
232 | 0 | if (idna_is_junk (ch)) |
233 | 0 | { |
234 | 0 | if (!cleaned) |
235 | 0 | { |
236 | 0 | cleaned = g_string_new (NULL); |
237 | 0 | g_string_append_len (cleaned, str, p - str); |
238 | 0 | } |
239 | 0 | } |
240 | 0 | else if (cleaned) |
241 | 0 | g_string_append_unichar (cleaned, ch); |
242 | 0 | } |
243 | |
|
244 | 0 | if (cleaned) |
245 | 0 | return g_string_free (cleaned, FALSE); |
246 | 0 | else |
247 | 0 | return NULL; |
248 | 0 | } |
249 | | |
250 | | static inline gboolean |
251 | | contains_uppercase_letters (const gchar *str, |
252 | | gint len) |
253 | 0 | { |
254 | 0 | const gchar *p; |
255 | |
|
256 | 0 | for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p)) |
257 | 0 | { |
258 | 0 | if (g_unichar_isupper (g_utf8_get_char (p))) |
259 | 0 | return TRUE; |
260 | 0 | } |
261 | 0 | return FALSE; |
262 | 0 | } |
263 | | |
264 | | static inline gboolean |
265 | | contains_non_ascii (const gchar *str, |
266 | | gint len) |
267 | 0 | { |
268 | 0 | const gchar *p; |
269 | |
|
270 | 0 | for (p = str; len == -1 ? *p : p < str + len; p++) |
271 | 0 | { |
272 | 0 | if ((guchar)*p > 0x80) |
273 | 0 | return TRUE; |
274 | 0 | } |
275 | 0 | return FALSE; |
276 | 0 | } |
277 | | |
278 | | /* RFC 3454, Appendix C. ish. */ |
279 | | static inline gboolean |
280 | | idna_is_prohibited (gunichar ch) |
281 | 0 | { |
282 | 0 | switch (g_unichar_type (ch)) |
283 | 0 | { |
284 | 0 | case G_UNICODE_CONTROL: |
285 | 0 | case G_UNICODE_FORMAT: |
286 | 0 | case G_UNICODE_UNASSIGNED: |
287 | 0 | case G_UNICODE_PRIVATE_USE: |
288 | 0 | case G_UNICODE_SURROGATE: |
289 | 0 | case G_UNICODE_LINE_SEPARATOR: |
290 | 0 | case G_UNICODE_PARAGRAPH_SEPARATOR: |
291 | 0 | case G_UNICODE_SPACE_SEPARATOR: |
292 | 0 | return TRUE; |
293 | | |
294 | 0 | case G_UNICODE_OTHER_SYMBOL: |
295 | 0 | if (ch == 0xFFFC || ch == 0xFFFD || |
296 | 0 | (ch >= 0x2FF0 && ch <= 0x2FFB)) |
297 | 0 | return TRUE; |
298 | 0 | return FALSE; |
299 | | |
300 | 0 | case G_UNICODE_NON_SPACING_MARK: |
301 | 0 | if (ch == 0x0340 || ch == 0x0341) |
302 | 0 | return TRUE; |
303 | 0 | return FALSE; |
304 | | |
305 | 0 | default: |
306 | 0 | return FALSE; |
307 | 0 | } |
308 | 0 | } |
309 | | |
310 | | /* RFC 3491 IDN cleanup algorithm. */ |
311 | | static gchar * |
312 | | nameprep (const gchar *hostname, |
313 | | gint len, |
314 | | gboolean *is_unicode) |
315 | 0 | { |
316 | 0 | gchar *name, *tmp = NULL, *p; |
317 | | |
318 | | /* It would be nice if we could do this without repeatedly |
319 | | * allocating strings and converting back and forth between |
320 | | * gunichars and UTF-8... The code does at least avoid doing most of |
321 | | * the sub-operations when they would just be equivalent to a |
322 | | * g_strdup(). |
323 | | */ |
324 | | |
325 | | /* Remove presentation-only characters */ |
326 | 0 | name = remove_junk (hostname, len); |
327 | 0 | if (name) |
328 | 0 | { |
329 | 0 | tmp = name; |
330 | 0 | len = -1; |
331 | 0 | } |
332 | 0 | else |
333 | 0 | name = (gchar *)hostname; |
334 | | |
335 | | /* Convert to lowercase */ |
336 | 0 | if (contains_uppercase_letters (name, len)) |
337 | 0 | { |
338 | 0 | name = g_utf8_strdown (name, len); |
339 | 0 | g_free (tmp); |
340 | 0 | tmp = name; |
341 | 0 | len = -1; |
342 | 0 | } |
343 | | |
344 | | /* If there are no UTF8 characters, we're done. */ |
345 | 0 | if (!contains_non_ascii (name, len)) |
346 | 0 | { |
347 | 0 | *is_unicode = FALSE; |
348 | 0 | if (name == (gchar *)hostname) |
349 | 0 | return len == -1 ? g_strdup (hostname) : g_strndup (hostname, len); |
350 | 0 | else |
351 | 0 | return name; |
352 | 0 | } |
353 | | |
354 | 0 | *is_unicode = TRUE; |
355 | | |
356 | | /* Normalize */ |
357 | 0 | name = g_utf8_normalize (name, len, G_NORMALIZE_NFKC); |
358 | 0 | g_free (tmp); |
359 | 0 | tmp = name; |
360 | |
|
361 | 0 | if (!name) |
362 | 0 | return NULL; |
363 | | |
364 | | /* KC normalization may have created more capital letters (eg, |
365 | | * angstrom -> capital A with ring). So we have to lowercasify a |
366 | | * second time. (This is more-or-less how the nameprep algorithm |
367 | | * does it. If tolower(nfkc(tolower(X))) is guaranteed to be the |
368 | | * same as tolower(nfkc(X)), then we could skip the first tolower, |
369 | | * but I'm not sure it is.) |
370 | | */ |
371 | 0 | if (contains_uppercase_letters (name, -1)) |
372 | 0 | { |
373 | 0 | name = g_utf8_strdown (name, -1); |
374 | 0 | g_free (tmp); |
375 | 0 | tmp = name; |
376 | 0 | } |
377 | | |
378 | | /* Check for prohibited characters */ |
379 | 0 | for (p = name; *p; p = g_utf8_next_char (p)) |
380 | 0 | { |
381 | 0 | if (idna_is_prohibited (g_utf8_get_char (p))) |
382 | 0 | { |
383 | 0 | name = NULL; |
384 | 0 | g_free (tmp); |
385 | 0 | goto done; |
386 | 0 | } |
387 | 0 | } |
388 | | |
389 | | /* FIXME: We're supposed to verify certain constraints on bidi |
390 | | * characters, but glib does not appear to have that information. |
391 | | */ |
392 | | |
393 | 0 | done: |
394 | 0 | return name; |
395 | 0 | } |
396 | | |
397 | | /* RFC 3490, section 3.1 says '.', 0x3002, 0xFF0E, and 0xFF61 count as |
398 | | * label-separating dots. @str must be '\0'-terminated. |
399 | | */ |
400 | 0 | #define idna_is_dot(str) ( \ |
401 | 0 | ((guchar)(str)[0] == '.') || \ |
402 | 0 | ((guchar)(str)[0] == 0xE3 && (guchar)(str)[1] == 0x80 && (guchar)(str)[2] == 0x82) || \ |
403 | 0 | ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBC && (guchar)(str)[2] == 0x8E) || \ |
404 | 0 | ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBD && (guchar)(str)[2] == 0xA1) ) |
405 | | |
406 | | static const gchar * |
407 | | idna_end_of_label (const gchar *str) |
408 | 0 | { |
409 | 0 | for (; *str; str = g_utf8_next_char (str)) |
410 | 0 | { |
411 | 0 | if (idna_is_dot (str)) |
412 | 0 | return str; |
413 | 0 | } |
414 | 0 | return str; |
415 | 0 | } |
416 | | |
417 | | static gsize |
418 | | get_hostname_max_length_bytes (void) |
419 | 0 | { |
420 | | #if defined(G_OS_WIN32) |
421 | | wchar_t tmp[MAX_COMPUTERNAME_LENGTH]; |
422 | | return sizeof (tmp) / sizeof (tmp[0]); |
423 | | #elif defined(_SC_HOST_NAME_MAX) |
424 | | glong max = sysconf (_SC_HOST_NAME_MAX); |
425 | 0 | if (max > 0) |
426 | 0 | return (gsize) max; |
427 | | |
428 | 0 | #ifdef HOST_NAME_MAX |
429 | 0 | return HOST_NAME_MAX; |
430 | | #else |
431 | | return _POSIX_HOST_NAME_MAX; |
432 | | #endif /* HOST_NAME_MAX */ |
433 | | #else |
434 | | /* Fallback to some reasonable value |
435 | | * See https://stackoverflow.com/questions/8724954/what-is-the-maximum-number-of-characters-for-a-host-name-in-unix/28918017#28918017 */ |
436 | | return 255; |
437 | | #endif |
438 | 0 | } |
439 | | |
440 | | /* Returns %TRUE if `strlen (str) > comparison_length`, but without actually |
441 | | * running `strlen(str)`, as that would take a very long time for long |
442 | | * (untrusted) input strings. */ |
443 | | static gboolean |
444 | | strlen_greater_than (const gchar *str, |
445 | | gsize comparison_length) |
446 | 0 | { |
447 | 0 | gsize i; |
448 | |
|
449 | 0 | for (i = 0; str[i] != '\0'; i++) |
450 | 0 | if (i > comparison_length) |
451 | 0 | return TRUE; |
452 | | |
453 | 0 | return FALSE; |
454 | 0 | } |
455 | | |
456 | | /** |
457 | | * g_hostname_to_ascii: |
458 | | * @hostname: a valid UTF-8 or ASCII hostname |
459 | | * |
460 | | * Converts @hostname to its canonical ASCII form; an ASCII-only |
461 | | * string containing no uppercase letters and not ending with a |
462 | | * trailing dot. |
463 | | * |
464 | | * Returns: (nullable) (transfer full): an ASCII hostname, which must be freed, |
465 | | * or %NULL if @hostname is in some way invalid. |
466 | | * |
467 | | * Since: 2.22 |
468 | | **/ |
469 | | gchar * |
470 | | g_hostname_to_ascii (const gchar *hostname) |
471 | 0 | { |
472 | 0 | gchar *name, *label, *p; |
473 | 0 | GString *out; |
474 | 0 | gssize llen, oldlen; |
475 | 0 | gboolean unicode; |
476 | 0 | gsize hostname_max_length_bytes = get_hostname_max_length_bytes (); |
477 | | |
478 | | /* Do an initial check on the hostname length, as overlong hostnames take a |
479 | | * long time in the IDN cleanup algorithm in nameprep(). The ultimate |
480 | | * restriction is that the IDN-decoded (i.e. pure ASCII) hostname cannot be |
481 | | * longer than 255 bytes. That’s the least restrictive limit on hostname |
482 | | * length of all the ways hostnames can be interpreted. Typically, the |
483 | | * hostname will be an FQDN, which is limited to 253 bytes long. POSIX |
484 | | * hostnames are limited to `get_hostname_max_length_bytes()` (typically 255 |
485 | | * bytes). |
486 | | * |
487 | | * See https://stackoverflow.com/a/28918017/2931197 |
488 | | * |
489 | | * It’s possible for a hostname to be %-encoded, in which case its decoded |
490 | | * length will be as much as 3× shorter. |
491 | | * |
492 | | * It’s also possible for a hostname to use overlong UTF-8 encodings, in which |
493 | | * case its decoded length will be as much as 4× shorter. |
494 | | * |
495 | | * Note: This check is not intended as an absolute guarantee that a hostname |
496 | | * is the right length and will be accepted by other systems. It’s intended to |
497 | | * stop wildly-invalid hostnames from taking forever in nameprep(). |
498 | | */ |
499 | 0 | if (hostname_max_length_bytes <= G_MAXSIZE / 4 && |
500 | 0 | strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes))) |
501 | 0 | return NULL; |
502 | | |
503 | 0 | label = name = nameprep (hostname, -1, &unicode); |
504 | 0 | if (!name || !unicode) |
505 | 0 | return name; |
506 | | |
507 | 0 | out = g_string_new (NULL); |
508 | |
|
509 | 0 | do |
510 | 0 | { |
511 | 0 | unicode = FALSE; |
512 | 0 | for (p = label; *p && !idna_is_dot (p); p++) |
513 | 0 | { |
514 | 0 | if ((guchar)*p > 0x80) |
515 | 0 | unicode = TRUE; |
516 | 0 | } |
517 | |
|
518 | 0 | oldlen = out->len; |
519 | 0 | llen = p - label; |
520 | 0 | if (unicode) |
521 | 0 | { |
522 | 0 | if (!strncmp (label, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
523 | 0 | goto fail; |
524 | | |
525 | 0 | g_string_append (out, IDNA_ACE_PREFIX); |
526 | 0 | if (!punycode_encode (label, llen, out)) |
527 | 0 | goto fail; |
528 | 0 | } |
529 | 0 | else |
530 | 0 | g_string_append_len (out, label, llen); |
531 | | |
532 | 0 | if (out->len - oldlen > 63) |
533 | 0 | goto fail; |
534 | | |
535 | 0 | label += llen; |
536 | 0 | if (*label) |
537 | 0 | label = g_utf8_next_char (label); |
538 | 0 | if (*label) |
539 | 0 | g_string_append_c (out, '.'); |
540 | 0 | } |
541 | 0 | while (*label); |
542 | | |
543 | 0 | g_free (name); |
544 | 0 | return g_string_free (out, FALSE); |
545 | | |
546 | 0 | fail: |
547 | 0 | g_free (name); |
548 | 0 | g_string_free (out, TRUE); |
549 | 0 | return NULL; |
550 | 0 | } |
551 | | |
552 | | /** |
553 | | * g_hostname_is_non_ascii: |
554 | | * @hostname: a hostname |
555 | | * |
556 | | * Tests if @hostname contains Unicode characters. If this returns |
557 | | * %TRUE, you need to encode the hostname with g_hostname_to_ascii() |
558 | | * before using it in non-IDN-aware contexts. |
559 | | * |
560 | | * Note that a hostname might contain a mix of encoded and unencoded |
561 | | * segments, and so it is possible for g_hostname_is_non_ascii() and |
562 | | * g_hostname_is_ascii_encoded() to both return %TRUE for a name. |
563 | | * |
564 | | * Returns: %TRUE if @hostname contains any non-ASCII characters |
565 | | * |
566 | | * Since: 2.22 |
567 | | **/ |
568 | | gboolean |
569 | | g_hostname_is_non_ascii (const gchar *hostname) |
570 | 0 | { |
571 | 0 | return contains_non_ascii (hostname, -1); |
572 | 0 | } |
573 | | |
574 | | /* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(), |
575 | | * read the RFC if you want to understand what this is actually doing. |
576 | | */ |
577 | | static gboolean |
578 | | punycode_decode (const gchar *input, |
579 | | gsize input_length, |
580 | | GString *output) |
581 | 0 | { |
582 | 0 | GArray *output_chars; |
583 | 0 | gunichar n; |
584 | 0 | guint i, bias; |
585 | 0 | guint oldi, w, k, digit, t; |
586 | 0 | const gchar *split; |
587 | |
|
588 | 0 | n = PUNYCODE_INITIAL_N; |
589 | 0 | i = 0; |
590 | 0 | bias = PUNYCODE_INITIAL_BIAS; |
591 | |
|
592 | 0 | split = input + input_length - 1; |
593 | 0 | while (split > input && *split != '-') |
594 | 0 | split--; |
595 | 0 | if (split > input) |
596 | 0 | { |
597 | 0 | output_chars = g_array_sized_new (FALSE, FALSE, sizeof (gunichar), |
598 | 0 | split - input); |
599 | 0 | input_length -= (split - input) + 1; |
600 | 0 | while (input < split) |
601 | 0 | { |
602 | 0 | gunichar ch = (gunichar)*input++; |
603 | 0 | if (!PUNYCODE_IS_BASIC (ch)) |
604 | 0 | goto fail; |
605 | 0 | g_array_append_val (output_chars, ch); |
606 | 0 | } |
607 | 0 | input++; |
608 | 0 | } |
609 | 0 | else |
610 | 0 | output_chars = g_array_new (FALSE, FALSE, sizeof (gunichar)); |
611 | | |
612 | 0 | while (input_length) |
613 | 0 | { |
614 | 0 | oldi = i; |
615 | 0 | w = 1; |
616 | 0 | for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE) |
617 | 0 | { |
618 | 0 | if (!input_length--) |
619 | 0 | goto fail; |
620 | 0 | digit = decode_digit (*input++); |
621 | 0 | if (digit >= PUNYCODE_BASE) |
622 | 0 | goto fail; |
623 | 0 | if (digit > (G_MAXUINT - i) / w) |
624 | 0 | goto fail; |
625 | 0 | i += digit * w; |
626 | 0 | if (k <= bias) |
627 | 0 | t = PUNYCODE_TMIN; |
628 | 0 | else if (k >= bias + PUNYCODE_TMAX) |
629 | 0 | t = PUNYCODE_TMAX; |
630 | 0 | else |
631 | 0 | t = k - bias; |
632 | 0 | if (digit < t) |
633 | 0 | break; |
634 | 0 | if (w > G_MAXUINT / (PUNYCODE_BASE - t)) |
635 | 0 | goto fail; |
636 | 0 | w *= (PUNYCODE_BASE - t); |
637 | 0 | } |
638 | | |
639 | 0 | bias = adapt (i - oldi, output_chars->len + 1, oldi == 0); |
640 | |
|
641 | 0 | if (i / (output_chars->len + 1) > G_MAXUINT - n) |
642 | 0 | goto fail; |
643 | 0 | n += i / (output_chars->len + 1); |
644 | 0 | i %= (output_chars->len + 1); |
645 | |
|
646 | 0 | g_array_insert_val (output_chars, i++, n); |
647 | 0 | } |
648 | | |
649 | 0 | for (i = 0; i < output_chars->len; i++) |
650 | 0 | g_string_append_unichar (output, g_array_index (output_chars, gunichar, i)); |
651 | 0 | g_array_free (output_chars, TRUE); |
652 | 0 | return TRUE; |
653 | | |
654 | 0 | fail: |
655 | 0 | g_array_free (output_chars, TRUE); |
656 | 0 | return FALSE; |
657 | 0 | } |
658 | | |
659 | | /** |
660 | | * g_hostname_to_unicode: |
661 | | * @hostname: a valid UTF-8 or ASCII hostname |
662 | | * |
663 | | * Converts @hostname to its canonical presentation form; a UTF-8 |
664 | | * string in Unicode normalization form C, containing no uppercase |
665 | | * letters, no forbidden characters, and no ASCII-encoded segments, |
666 | | * and not ending with a trailing dot. |
667 | | * |
668 | | * Of course if @hostname is not an internationalized hostname, then |
669 | | * the canonical presentation form will be entirely ASCII. |
670 | | * |
671 | | * Returns: (nullable) (transfer full): a UTF-8 hostname, which must be freed, |
672 | | * or %NULL if @hostname is in some way invalid. |
673 | | * |
674 | | * Since: 2.22 |
675 | | **/ |
676 | | gchar * |
677 | | g_hostname_to_unicode (const gchar *hostname) |
678 | 0 | { |
679 | 0 | GString *out; |
680 | 0 | gssize llen; |
681 | 0 | gsize hostname_max_length_bytes = get_hostname_max_length_bytes (); |
682 | | |
683 | | /* See the comment at the top of g_hostname_to_ascii(). */ |
684 | 0 | if (hostname_max_length_bytes <= G_MAXSIZE / 4 && |
685 | 0 | strlen_greater_than (hostname, 4 * MAX (255, hostname_max_length_bytes))) |
686 | 0 | return NULL; |
687 | | |
688 | 0 | out = g_string_new (NULL); |
689 | |
|
690 | 0 | do |
691 | 0 | { |
692 | 0 | llen = idna_end_of_label (hostname) - hostname; |
693 | 0 | if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
694 | 0 | { |
695 | 0 | hostname += IDNA_ACE_PREFIX_LEN; |
696 | 0 | llen -= IDNA_ACE_PREFIX_LEN; |
697 | 0 | if (!punycode_decode (hostname, llen, out)) |
698 | 0 | { |
699 | 0 | g_string_free (out, TRUE); |
700 | 0 | return NULL; |
701 | 0 | } |
702 | 0 | } |
703 | 0 | else |
704 | 0 | { |
705 | 0 | gboolean unicode; |
706 | 0 | gchar *canonicalized = nameprep (hostname, llen, &unicode); |
707 | |
|
708 | 0 | if (!canonicalized) |
709 | 0 | { |
710 | 0 | g_string_free (out, TRUE); |
711 | 0 | return NULL; |
712 | 0 | } |
713 | 0 | g_string_append (out, canonicalized); |
714 | 0 | g_free (canonicalized); |
715 | 0 | } |
716 | | |
717 | 0 | hostname += llen; |
718 | 0 | if (*hostname) |
719 | 0 | hostname = g_utf8_next_char (hostname); |
720 | 0 | if (*hostname) |
721 | 0 | g_string_append_c (out, '.'); |
722 | 0 | } |
723 | 0 | while (*hostname); |
724 | | |
725 | 0 | return g_string_free (out, FALSE); |
726 | 0 | } |
727 | | |
728 | | /** |
729 | | * g_hostname_is_ascii_encoded: |
730 | | * @hostname: a hostname |
731 | | * |
732 | | * Tests if @hostname contains segments with an ASCII-compatible |
733 | | * encoding of an Internationalized Domain Name. If this returns |
734 | | * %TRUE, you should decode the hostname with g_hostname_to_unicode() |
735 | | * before displaying it to the user. |
736 | | * |
737 | | * Note that a hostname might contain a mix of encoded and unencoded |
738 | | * segments, and so it is possible for g_hostname_is_non_ascii() and |
739 | | * g_hostname_is_ascii_encoded() to both return %TRUE for a name. |
740 | | * |
741 | | * Returns: %TRUE if @hostname contains any ASCII-encoded |
742 | | * segments. |
743 | | * |
744 | | * Since: 2.22 |
745 | | **/ |
746 | | gboolean |
747 | | g_hostname_is_ascii_encoded (const gchar *hostname) |
748 | 0 | { |
749 | 0 | while (1) |
750 | 0 | { |
751 | 0 | if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
752 | 0 | return TRUE; |
753 | 0 | hostname = idna_end_of_label (hostname); |
754 | 0 | if (*hostname) |
755 | 0 | hostname = g_utf8_next_char (hostname); |
756 | 0 | if (!*hostname) |
757 | 0 | return FALSE; |
758 | 0 | } |
759 | 0 | } |
760 | | |
761 | | /** |
762 | | * g_hostname_is_ip_address: |
763 | | * @hostname: a hostname (or IP address in string form) |
764 | | * |
765 | | * Tests if @hostname is the string form of an IPv4 or IPv6 address. |
766 | | * (Eg, "192.168.0.1".) |
767 | | * |
768 | | * Since 2.66, IPv6 addresses with a zone-id are accepted (RFC6874). |
769 | | * |
770 | | * Returns: %TRUE if @hostname is an IP address |
771 | | * |
772 | | * Since: 2.22 |
773 | | **/ |
774 | | gboolean |
775 | | g_hostname_is_ip_address (const gchar *hostname) |
776 | 0 | { |
777 | 0 | gchar *p, *end; |
778 | 0 | gint nsegments, octet; |
779 | | |
780 | | /* On Linux we could implement this using inet_pton, but the Windows |
781 | | * equivalent of that requires linking against winsock, so we just |
782 | | * figure this out ourselves. Tested by tests/hostutils.c. |
783 | | */ |
784 | |
|
785 | 0 | p = (char *)hostname; |
786 | |
|
787 | 0 | if (strchr (p, ':')) |
788 | 0 | { |
789 | 0 | gboolean skipped; |
790 | | |
791 | | /* If it contains a ':', it's an IPv6 address (assuming it's an |
792 | | * IP address at all). This consists of eight ':'-separated |
793 | | * segments, each containing a 1-4 digit hex number, except that |
794 | | * optionally: (a) the last two segments can be replaced by an |
795 | | * IPv4 address, and (b) a single span of 1 to 8 "0000" segments |
796 | | * can be replaced with just "::". |
797 | | */ |
798 | |
|
799 | 0 | nsegments = 0; |
800 | 0 | skipped = FALSE; |
801 | 0 | while (*p && *p != '%' && nsegments < 8) |
802 | 0 | { |
803 | | /* Each segment after the first must be preceded by a ':'. |
804 | | * (We also handle half of the "string starts with ::" case |
805 | | * here.) |
806 | | */ |
807 | 0 | if (p != (char *)hostname || (p[0] == ':' && p[1] == ':')) |
808 | 0 | { |
809 | 0 | if (*p != ':') |
810 | 0 | return FALSE; |
811 | 0 | p++; |
812 | 0 | } |
813 | | |
814 | | /* If there's another ':', it means we're skipping some segments */ |
815 | 0 | if (*p == ':' && !skipped) |
816 | 0 | { |
817 | 0 | skipped = TRUE; |
818 | 0 | nsegments++; |
819 | | |
820 | | /* Handle the "string ends with ::" case */ |
821 | 0 | if (!p[1]) |
822 | 0 | p++; |
823 | |
|
824 | 0 | continue; |
825 | 0 | } |
826 | | |
827 | | /* Read the segment, make sure it's valid. */ |
828 | 0 | for (end = p; g_ascii_isxdigit (*end); end++) |
829 | 0 | ; |
830 | 0 | if (end == p || end > p + 4) |
831 | 0 | return FALSE; |
832 | | |
833 | 0 | if (*end == '.') |
834 | 0 | { |
835 | 0 | if ((nsegments == 6 && !skipped) || (nsegments <= 6 && skipped)) |
836 | 0 | goto parse_ipv4; |
837 | 0 | else |
838 | 0 | return FALSE; |
839 | 0 | } |
840 | | |
841 | 0 | nsegments++; |
842 | 0 | p = end; |
843 | 0 | } |
844 | | |
845 | 0 | return (!*p || (p[0] == '%' && p[1])) && (nsegments == 8 || skipped); |
846 | 0 | } |
847 | | |
848 | 0 | parse_ipv4: |
849 | | |
850 | | /* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */ |
851 | 0 | for (nsegments = 0; nsegments < 4; nsegments++) |
852 | 0 | { |
853 | 0 | if (nsegments != 0) |
854 | 0 | { |
855 | 0 | if (*p != '.') |
856 | 0 | return FALSE; |
857 | 0 | p++; |
858 | 0 | } |
859 | | |
860 | | /* Check the segment; a little tricker than the IPv6 case since |
861 | | * we can't allow extra leading 0s, and we can't assume that all |
862 | | * strings of valid length are within range. |
863 | | */ |
864 | 0 | octet = 0; |
865 | 0 | if (*p == '0') |
866 | 0 | end = p + 1; |
867 | 0 | else |
868 | 0 | { |
869 | 0 | for (end = p; g_ascii_isdigit (*end); end++) |
870 | 0 | { |
871 | 0 | octet = 10 * octet + (*end - '0'); |
872 | |
|
873 | 0 | if (octet > 255) |
874 | 0 | break; |
875 | 0 | } |
876 | 0 | } |
877 | 0 | if (end == p || end > p + 3 || octet > 255) |
878 | 0 | return FALSE; |
879 | | |
880 | 0 | p = end; |
881 | 0 | } |
882 | | |
883 | | /* If there's nothing left to parse, then it's ok. */ |
884 | 0 | return !*p; |
885 | 0 | } |