/src/libidn2/lib/lookup.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* lookup.c - implementation of IDNA2008 lookup functions |
2 | | Copyright (C) 2011-2025 Simon Josefsson |
3 | | Copyright (C) 2017-2025 Tim Ruehsen |
4 | | |
5 | | Libidn2 is free software: you can redistribute it and/or modify it |
6 | | under the terms of either: |
7 | | |
8 | | * the GNU Lesser General Public License as published by the Free |
9 | | Software Foundation; either version 3 of the License, or (at |
10 | | your option) any later version. |
11 | | |
12 | | or |
13 | | |
14 | | * the GNU General Public License as published by the Free |
15 | | Software Foundation; either version 2 of the License, or (at |
16 | | your option) any later version. |
17 | | |
18 | | or both in parallel, as here. |
19 | | |
20 | | This program is distributed in the hope that it will be useful, |
21 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 | | GNU General Public License for more details. |
24 | | |
25 | | You should have received copies of the GNU General Public License and |
26 | | the GNU Lesser General Public License along with this program. If |
27 | | not, see <http://www.gnu.org/licenses/>. |
28 | | */ |
29 | | |
30 | | #include <config.h> |
31 | | |
32 | | #include "idn2.h" |
33 | | |
34 | | #include <errno.h> /* errno */ |
35 | | #include <stdlib.h> /* malloc, free */ |
36 | | |
37 | | #include <unitypes.h> |
38 | | #include <uniconv.h> /* u8_strconv_from_locale */ |
39 | | #include <uninorm.h> /* u32_normalize */ |
40 | | #include <unistr.h> /* u8_to_u32 */ |
41 | | |
42 | | #include "idna.h" /* _idn2_label_test */ |
43 | | #include "tr46map.h" /* definition for tr46map.c */ |
44 | | |
45 | | #ifdef HAVE_LIBUNISTRING |
46 | | /* copied from gnulib */ |
47 | | # include <limits.h> |
/* Expands to the 26 case labels 'a' .. 'z', each shifted by N.  The
   expansion deliberately lacks the final ':' so that the use site reads
   like an ordinary case label.  */
# define _C_CTYPE_LOWER_N(N) \
   case 'a' + (N): case 'b' + (N): case 'c' + (N): \
   case 'd' + (N): case 'e' + (N): case 'f' + (N): \
   case 'g' + (N): case 'h' + (N): case 'i' + (N): \
   case 'j' + (N): case 'k' + (N): case 'l' + (N): \
   case 'm' + (N): case 'n' + (N): case 'o' + (N): \
   case 'p' + (N): case 'q' + (N): case 'r' + (N): \
   case 's' + (N): case 't' + (N): case 'u' + (N): \
   case 'v' + (N): case 'w' + (N): case 'x' + (N): \
   case 'y' + (N): case 'z' + (N)
/* Case labels for the 26 uppercase letters 'A' .. 'Z'.  */
# define _C_CTYPE_UPPER _C_CTYPE_LOWER_N ('A' - 'a')

/* Locale-independent tolower(): map 'A'..'Z' to 'a'..'z' and return
   every other value unchanged.  */
static inline int
c_tolower (int c)
{
  int lowered = c;

  switch (c)
    {
    _C_CTYPE_UPPER:
      lowered = c + ('a' - 'A');
      break;
    default:
      break;
    }

  return lowered;
}
68 | | |
/* Compare at most N bytes of S1 and S2, folding 'A'..'Z' to lowercase
   with c_tolower() (no locale involved).  Comparison stops at the first
   differing byte, at a NUL in S1, or after N bytes.  Returns a value
   less than, equal to, or greater than zero, like strncmp().  */
static int
c_strncasecmp (const char *s1, const char *s2, size_t n)
{
  const unsigned char *a = (const unsigned char *) s1;
  const unsigned char *b = (const unsigned char *) s2;
  unsigned char ca = 0, cb = 0;
  size_t i;

  /* Identical pointers or an empty range always compare equal.  */
  if (a == b || n == 0)
    return 0;

  for (i = 0; i < n; i++)
    {
      ca = c_tolower (a[i]);
      cb = c_tolower (b[i]);

      if (ca == '\0' || ca != cb)
	break;
    }

  /* On machines where 'char' and 'int' are types of the same size, the
     difference of two 'unsigned char' values - including the sign bit -
     doesn't fit in an 'int', so fall back to a three-way comparison.  */
  if (UCHAR_MAX > INT_MAX)
    return (ca > cb) - (ca < cb);

  return ca - cb;
}
100 | | #else |
101 | | # include <c-strcase.h> |
102 | | #endif |
103 | | |
104 | | static int |
105 | | set_default_flags (int *flags) |
106 | 118k | { |
107 | 118k | if (((*flags) & IDN2_TRANSITIONAL) && ((*flags) & IDN2_NONTRANSITIONAL)) |
108 | 0 | return IDN2_INVALID_FLAGS; |
109 | | |
110 | 118k | if (((*flags) & (IDN2_TRANSITIONAL | IDN2_NONTRANSITIONAL)) |
111 | 118k | && ((*flags) & IDN2_NO_TR46)) |
112 | 0 | return IDN2_INVALID_FLAGS; |
113 | | |
114 | 118k | if (((*flags) & IDN2_ALABEL_ROUNDTRIP) |
115 | 118k | && ((*flags) & IDN2_NO_ALABEL_ROUNDTRIP)) |
116 | 0 | return IDN2_INVALID_FLAGS; |
117 | | |
118 | 118k | if (!((*flags) & (IDN2_NO_TR46 | IDN2_TRANSITIONAL))) |
119 | 63.8k | *flags |= IDN2_NONTRANSITIONAL; |
120 | | |
121 | 118k | return IDN2_OK; |
122 | 118k | } |
123 | | |
124 | | static int |
125 | | label (const uint8_t *src, size_t srclen, uint8_t *dst, size_t *dstlen, |
126 | | int flags) |
127 | 74.1k | { |
128 | 74.1k | size_t plen; |
129 | 74.1k | uint32_t *p = NULL; |
130 | 74.1k | const uint8_t *src_org = NULL; |
131 | 74.1k | uint8_t *src_allocated = NULL; |
132 | 74.1k | int rc, check_roundtrip = 0; |
133 | 74.1k | size_t tmpl, srclen_org = 0; |
134 | 74.1k | uint32_t label_u32[IDN2_LABEL_MAX_LENGTH]; |
135 | 74.1k | size_t label32_len = IDN2_LABEL_MAX_LENGTH; |
136 | | |
137 | 74.1k | if (_idn2_ascii_p (src, srclen)) |
138 | 20.4k | { |
139 | 20.4k | if (!(flags & IDN2_NO_ALABEL_ROUNDTRIP) && srclen >= 4 |
140 | 20.4k | && memcmp (src, "xn--", 4) == 0) |
141 | 11.7k | { |
142 | | /* |
143 | | If the input to this procedure appears to be an A-label |
144 | | (i.e., it starts in "xn--", interpreted |
145 | | case-insensitively), the lookup application MAY attempt to |
146 | | convert it to a U-label, first ensuring that the A-label is |
147 | | entirely in lowercase (converting it to lowercase if |
148 | | necessary), and apply the tests of Section 5.4 and the |
149 | | conversion of Section 5.5 to that form. */ |
150 | 11.7k | rc = idn2_punycode_decode ((char *) src + 4, srclen - 4, |
151 | 11.7k | label_u32, &label32_len); |
152 | 11.7k | if (rc) |
153 | 0 | return rc; |
154 | | |
155 | 11.7k | check_roundtrip = 1; |
156 | 11.7k | src_org = src; |
157 | 11.7k | srclen_org = srclen; |
158 | | |
159 | 11.7k | srclen = IDN2_LABEL_MAX_LENGTH; |
160 | 11.7k | src = src_allocated = |
161 | 11.7k | u32_to_u8 (label_u32, label32_len, NULL, &srclen); |
162 | 11.7k | if (!src) |
163 | 0 | { |
164 | 0 | if (errno == ENOMEM) |
165 | 0 | return IDN2_MALLOC; |
166 | 0 | return IDN2_ENCODING_ERROR; |
167 | 0 | } |
168 | 11.7k | } |
169 | 8.72k | else |
170 | 8.72k | { |
171 | 8.72k | if (srclen > IDN2_LABEL_MAX_LENGTH) |
172 | 1.27k | return IDN2_TOO_BIG_LABEL; |
173 | 7.45k | if (srclen > *dstlen) |
174 | 0 | return IDN2_TOO_BIG_DOMAIN; |
175 | | |
176 | 7.45k | memcpy (dst, src, srclen); |
177 | 7.45k | *dstlen = srclen; |
178 | 7.45k | return IDN2_OK; |
179 | 7.45k | } |
180 | 20.4k | } |
181 | | |
182 | 65.4k | rc = _idn2_u8_to_u32_nfc (src, srclen, &p, &plen, flags & IDN2_NFC_INPUT); |
183 | 65.4k | if (rc != IDN2_OK) |
184 | 0 | goto out; |
185 | | |
186 | 65.4k | if (!(flags & IDN2_TRANSITIONAL)) |
187 | 35.8k | { |
188 | 35.8k | rc = _idn2_label_test (TEST_NFC | |
189 | 35.8k | TEST_2HYPHEN | |
190 | 35.8k | TEST_LEADING_COMBINING | |
191 | 35.8k | TEST_DISALLOWED | |
192 | 35.8k | TEST_CONTEXTJ_RULE | |
193 | 35.8k | TEST_CONTEXTO_WITH_RULE | |
194 | 35.8k | TEST_UNASSIGNED | TEST_BIDI | |
195 | 35.8k | ((flags & IDN2_NONTRANSITIONAL) ? |
196 | 35.8k | TEST_NONTRANSITIONAL : 0) | ((flags & |
197 | 35.8k | IDN2_USE_STD3_ASCII_RULES) |
198 | 35.8k | ? 0 : |
199 | 35.8k | TEST_ALLOW_STD3_DISALLOWED), |
200 | 35.8k | p, plen); |
201 | | |
202 | 35.8k | if (rc != IDN2_OK) |
203 | 16.4k | goto out; |
204 | 35.8k | } |
205 | | |
206 | 49.0k | dst[0] = 'x'; |
207 | 49.0k | dst[1] = 'n'; |
208 | 49.0k | dst[2] = '-'; |
209 | 49.0k | dst[3] = '-'; |
210 | | |
211 | 49.0k | tmpl = *dstlen - 4; |
212 | 49.0k | rc = idn2_punycode_encode (p, plen, (char *) dst + 4, &tmpl); |
213 | 49.0k | if (rc != IDN2_OK) |
214 | 2.59k | goto out; |
215 | | |
216 | | |
217 | 46.4k | *dstlen = 4 + tmpl; |
218 | | |
219 | 46.4k | if (check_roundtrip) |
220 | 7.77k | { |
221 | 7.77k | if (srclen_org != *dstlen |
222 | 7.77k | || c_strncasecmp ((char *) src_org, (char *) dst, srclen_org)) |
223 | 0 | { |
224 | 0 | rc = IDN2_ALABEL_ROUNDTRIP_FAILED; |
225 | 0 | goto out; |
226 | 0 | } |
227 | 7.77k | } |
228 | 38.6k | else if (!(flags & IDN2_NO_ALABEL_ROUNDTRIP)) |
229 | 38.6k | { |
230 | 38.6k | rc = idn2_punycode_decode ((char *) dst + 4, *dstlen - 4, |
231 | 38.6k | label_u32, &label32_len); |
232 | 38.6k | if (rc) |
233 | 0 | { |
234 | 0 | rc = IDN2_ALABEL_ROUNDTRIP_FAILED; |
235 | 0 | goto out; |
236 | 0 | } |
237 | | |
238 | 38.6k | if (plen != label32_len || u32_cmp (p, label_u32, label32_len)) |
239 | 0 | { |
240 | 0 | rc = IDN2_ALABEL_ROUNDTRIP_FAILED; |
241 | 0 | goto out; |
242 | 0 | } |
243 | 38.6k | } |
244 | | |
245 | 46.4k | rc = IDN2_OK; |
246 | | |
247 | 65.4k | out: |
248 | 65.4k | free (p); |
249 | 65.4k | free (src_allocated); |
250 | 65.4k | return rc; |
251 | 46.4k | } |
252 | | |
253 | | #define TR46_TRANSITIONAL_CHECK \ |
254 | 32.2k | (TEST_NFC | TEST_2HYPHEN | TEST_HYPHEN_STARTEND | TEST_LEADING_COMBINING | TEST_TRANSITIONAL) |
255 | | #define TR46_NONTRANSITIONAL_CHECK \ |
256 | 134k | (TEST_NFC | TEST_2HYPHEN | TEST_HYPHEN_STARTEND | TEST_LEADING_COMBINING | TEST_NONTRANSITIONAL) |
257 | | |
258 | | static int |
259 | | _tr46 (const uint8_t *domain_u8, uint8_t **out, int flags) |
260 | 118k | { |
261 | 118k | size_t len, it; |
262 | 118k | uint32_t *domain_u32; |
263 | 118k | int err = IDN2_OK, rc; |
264 | 118k | int transitional = 0; |
265 | 118k | int test_flags; |
266 | | |
267 | 118k | if (flags & IDN2_TRANSITIONAL) |
268 | 54.7k | transitional = 1; |
269 | | |
270 | | /* convert UTF-8 to UTF-32 */ |
271 | 118k | if (!(domain_u32 = |
272 | 118k | u8_to_u32 (domain_u8, u8_strlen (domain_u8) + 1, NULL, &len))) |
273 | 19.1k | { |
274 | 19.1k | if (errno == ENOMEM) |
275 | 0 | return IDN2_MALLOC; |
276 | 19.1k | return IDN2_ENCODING_ERROR; |
277 | 19.1k | } |
278 | | |
279 | 99.4k | size_t len2 = 0; |
280 | 1.18M | for (it = 0; it < len - 1; it++) |
281 | 1.11M | { |
282 | 1.11M | IDNAMap map; |
283 | | |
284 | 1.11M | get_idna_map (domain_u32[it], &map); |
285 | | |
286 | 1.11M | if (map_is (&map, TR46_FLG_DISALLOWED)) |
287 | 33.8k | { |
288 | 33.8k | if (domain_u32[it]) |
289 | 33.8k | { |
290 | 33.8k | free (domain_u32); |
291 | 33.8k | return IDN2_DISALLOWED; |
292 | 33.8k | } |
293 | 0 | len2++; |
294 | 0 | } |
295 | 1.08M | else if (map_is (&map, TR46_FLG_MAPPED)) |
296 | 352k | { |
297 | 352k | len2 += map.nmappings; |
298 | 352k | } |
299 | 731k | else if (map_is (&map, TR46_FLG_VALID)) |
300 | 642k | { |
301 | 642k | len2++; |
302 | 642k | } |
303 | 88.8k | else if (map_is (&map, TR46_FLG_IGNORED)) |
304 | 4.61k | { |
305 | 4.61k | continue; |
306 | 4.61k | } |
307 | 84.2k | else if (map_is (&map, TR46_FLG_DEVIATION)) |
308 | 41.2k | { |
309 | 41.2k | if (transitional) |
310 | 19.9k | { |
311 | 19.9k | len2 += map.nmappings; |
312 | 19.9k | } |
313 | 21.3k | else |
314 | 21.3k | len2++; |
315 | 41.2k | } |
316 | 43.0k | else if (!(flags & IDN2_USE_STD3_ASCII_RULES)) |
317 | 0 | { |
318 | 0 | if (map_is (&map, TR46_FLG_DISALLOWED_STD3_VALID)) |
319 | 0 | { |
320 | | /* valid because UseSTD3ASCIIRules=false, see #TR46 5 */ |
321 | 0 | len2++; |
322 | 0 | } |
323 | 0 | else if (map_is (&map, TR46_FLG_DISALLOWED_STD3_MAPPED)) |
324 | 0 | { |
325 | | /* mapped because UseSTD3ASCIIRules=false, see #TR46 5 */ |
326 | 0 | len2 += map.nmappings; |
327 | 0 | } |
328 | 0 | } |
329 | 1.11M | } |
330 | | |
331 | | /* Exit early if result is too long. |
332 | | * This avoids excessive CPU usage in punycode encoding, which is O(N^2). */ |
333 | 65.6k | if (len2 >= IDN2_DOMAIN_MAX_LENGTH) |
334 | 608 | { |
335 | 608 | free (domain_u32); |
336 | 608 | return IDN2_TOO_BIG_DOMAIN; |
337 | 608 | } |
338 | | |
339 | 65.0k | uint32_t *tmp = (uint32_t *) malloc ((len2 + 1) * sizeof (uint32_t)); |
340 | 65.0k | if (!tmp) |
341 | 0 | { |
342 | 0 | free (domain_u32); |
343 | 0 | return IDN2_MALLOC; |
344 | 0 | } |
345 | | |
346 | 65.0k | len2 = 0; |
347 | 864k | for (it = 0; it < len - 1; it++) |
348 | 799k | { |
349 | 799k | uint32_t c = domain_u32[it]; |
350 | 799k | IDNAMap map; |
351 | | |
352 | 799k | get_idna_map (c, &map); |
353 | | |
354 | 799k | if (map_is (&map, TR46_FLG_DISALLOWED)) |
355 | 0 | { |
356 | 0 | tmp[len2++] = c; |
357 | 0 | } |
358 | 799k | else if (map_is (&map, TR46_FLG_MAPPED)) |
359 | 213k | { |
360 | 213k | len2 += get_map_data (tmp + len2, &map); |
361 | 213k | } |
362 | 586k | else if (map_is (&map, TR46_FLG_VALID)) |
363 | 526k | { |
364 | 526k | tmp[len2++] = c; |
365 | 526k | } |
366 | 60.4k | else if (map_is (&map, TR46_FLG_IGNORED)) |
367 | 4.50k | { |
368 | 4.50k | continue; |
369 | 4.50k | } |
370 | 55.9k | else if (map_is (&map, TR46_FLG_DEVIATION)) |
371 | 40.3k | { |
372 | 40.3k | if (transitional) |
373 | 19.3k | { |
374 | 19.3k | len2 += get_map_data (tmp + len2, &map); |
375 | 19.3k | } |
376 | 21.0k | else |
377 | 21.0k | tmp[len2++] = c; |
378 | 40.3k | } |
379 | 15.5k | else if (!(flags & IDN2_USE_STD3_ASCII_RULES)) |
380 | 0 | { |
381 | 0 | if (map_is (&map, TR46_FLG_DISALLOWED_STD3_VALID)) |
382 | 0 | { |
383 | 0 | tmp[len2++] = c; |
384 | 0 | } |
385 | 0 | else if (map_is (&map, TR46_FLG_DISALLOWED_STD3_MAPPED)) |
386 | 0 | { |
387 | 0 | len2 += get_map_data (tmp + len2, &map); |
388 | 0 | } |
389 | 0 | } |
390 | 799k | } |
391 | 65.0k | free (domain_u32); |
392 | | |
393 | | /* Normalize to NFC */ |
394 | 65.0k | tmp[len2] = 0; |
395 | 65.0k | domain_u32 = u32_normalize (UNINORM_NFC, tmp, len2 + 1, NULL, &len); |
396 | 65.0k | free (tmp); |
397 | 65.0k | tmp = NULL; |
398 | | |
399 | 65.0k | if (!domain_u32) |
400 | 0 | { |
401 | 0 | if (errno == ENOMEM) |
402 | 0 | return IDN2_MALLOC; |
403 | 0 | return IDN2_ENCODING_ERROR; |
404 | 0 | } |
405 | | |
406 | | /* split into labels and check */ |
407 | 65.0k | uint32_t *e, *s; |
408 | 158k | for (e = s = domain_u32; *e; s = e) |
409 | 101k | { |
410 | 1.06M | while (*e && *e != '.') |
411 | 968k | e++; |
412 | | |
413 | 101k | if (e - s >= 4 && s[0] == 'x' && s[1] == 'n' && s[2] == '-' |
414 | 101k | && s[3] == '-') |
415 | 27.5k | { |
416 | | /* decode punycode and check result non-transitional */ |
417 | 27.5k | size_t ace_len; |
418 | 27.5k | uint32_t name_u32[IDN2_LABEL_MAX_LENGTH]; |
419 | 27.5k | size_t name_len = IDN2_LABEL_MAX_LENGTH; |
420 | 27.5k | uint8_t *ace; |
421 | | |
422 | 27.5k | ace = u32_to_u8 (s + 4, e - s - 4, NULL, &ace_len); |
423 | 27.5k | if (!ace) |
424 | 0 | { |
425 | 0 | free (domain_u32); |
426 | 0 | if (errno == ENOMEM) |
427 | 0 | return IDN2_MALLOC; |
428 | 0 | return IDN2_ENCODING_ERROR; |
429 | 0 | } |
430 | | |
431 | 27.5k | rc = idn2_punycode_decode ((char *) ace, ace_len, |
432 | 27.5k | name_u32, &name_len); |
433 | | |
434 | 27.5k | free (ace); |
435 | | |
436 | 27.5k | if (rc) |
437 | 8.17k | { |
438 | 8.17k | free (domain_u32); |
439 | 8.17k | return rc; |
440 | 8.17k | } |
441 | | |
442 | 19.3k | test_flags = TR46_NONTRANSITIONAL_CHECK; |
443 | | |
444 | 19.3k | if (!(flags & IDN2_USE_STD3_ASCII_RULES)) |
445 | 0 | test_flags |= TEST_ALLOW_STD3_DISALLOWED; |
446 | | |
447 | 19.3k | if ((rc = _idn2_label_test (test_flags, name_u32, name_len))) |
448 | 7.50k | err = rc; |
449 | 19.3k | } |
450 | 73.7k | else |
451 | 73.7k | { |
452 | 73.7k | test_flags = |
453 | 73.7k | transitional ? TR46_TRANSITIONAL_CHECK : |
454 | 73.7k | TR46_NONTRANSITIONAL_CHECK; |
455 | | |
456 | 73.7k | if (!(flags & IDN2_USE_STD3_ASCII_RULES)) |
457 | 0 | test_flags |= TEST_ALLOW_STD3_DISALLOWED; |
458 | | |
459 | 73.7k | if ((rc = _idn2_label_test (test_flags, s, e - s))) |
460 | 6.03k | err = rc; |
461 | 73.7k | } |
462 | | |
463 | 93.1k | if (*e) |
464 | 38.2k | e++; |
465 | 93.1k | } |
466 | | |
467 | 56.8k | if (err == IDN2_OK && out) |
468 | 46.0k | { |
469 | 46.0k | uint8_t *_out = u32_to_u8 (domain_u32, len, NULL, &len); |
470 | 46.0k | free (domain_u32); |
471 | | |
472 | 46.0k | if (!_out) |
473 | 0 | { |
474 | 0 | if (errno == ENOMEM) |
475 | 0 | return IDN2_MALLOC; |
476 | 0 | return IDN2_ENCODING_ERROR; |
477 | 0 | } |
478 | | |
479 | 46.0k | *out = _out; |
480 | 46.0k | } |
481 | 10.8k | else |
482 | 10.8k | free (domain_u32); |
483 | | |
484 | 56.8k | return err; |
485 | 56.8k | } |
486 | | |
487 | | /** |
488 | | * idn2_lookup_u8: |
489 | | * @src: input zero-terminated UTF-8 string in Unicode NFC normalized form. |
490 | | * @lookupname: newly allocated output variable with name to lookup in DNS. |
491 | | * @flags: optional #idn2_flags to modify behaviour. |
492 | | * |
493 | | * Perform IDNA2008 lookup string conversion on domain name @src, as |
494 | | * described in section 5 of RFC 5891. Note that the input string |
495 | | * must be encoded in UTF-8 and be in Unicode NFC form. |
496 | | * |
497 | | * Pass %IDN2_NFC_INPUT in @flags to convert input to NFC form before |
498 | | * further processing. %IDN2_TRANSITIONAL and %IDN2_NONTRANSITIONAL |
499 | | * do already imply %IDN2_NFC_INPUT. |
500 | | * |
501 | | * Pass %IDN2_ALABEL_ROUNDTRIP in @flags to |
502 | | * convert any input A-labels to U-labels and perform additional |
503 | | * testing. This is default since version 2.2. |
504 | | * To switch this behavior off, pass IDN2_NO_ALABEL_ROUNDTRIP |
505 | | * |
506 | | * Pass %IDN2_TRANSITIONAL to enable Unicode TR46 |
507 | | * transitional processing, and %IDN2_NONTRANSITIONAL to enable |
508 | | * Unicode TR46 non-transitional processing. |
509 | | * |
510 | | * Multiple flags may be specified by binary or:ing them together. |
511 | | * |
512 | | * After version 2.0.3: %IDN2_USE_STD3_ASCII_RULES disabled by default. |
513 | | * Previously we were eliminating non-STD3 characters from domain strings |
514 | | * such as _443._tcp.example.com, or IPs 1.2.3.4/24 provided to libidn2 |
515 | | * functions. That was an unexpected regression for applications switching |
516 | | * from libidn and thus it is no longer applied by default. |
517 | | * Use %IDN2_USE_STD3_ASCII_RULES to enable that behavior again. |
518 | | * |
519 | | * After version 0.11: @lookupname may be NULL to test lookup of @src |
520 | | * without allocating memory. |
521 | | * |
522 | | * Returns: On successful conversion %IDN2_OK is returned, if the |
523 | | * output domain or any label would have been too long |
524 | | * %IDN2_TOO_BIG_DOMAIN or %IDN2_TOO_BIG_LABEL is returned, or |
525 | | * another error code is returned. |
526 | | * |
527 | | * Since: 0.1 |
528 | | **/ |
529 | | int |
530 | | idn2_lookup_u8 (const uint8_t *src, uint8_t **lookupname, int flags) |
531 | 118k | { |
532 | 118k | size_t lookupnamelen = 0; |
533 | 118k | uint8_t _lookupname[IDN2_DOMAIN_MAX_LENGTH + 1]; |
534 | 118k | uint8_t *src_allocated = NULL; |
535 | 118k | int rc; |
536 | | |
537 | 118k | if (src == NULL) |
538 | 0 | { |
539 | 0 | if (lookupname) |
540 | 0 | *lookupname = NULL; |
541 | 0 | return IDN2_OK; |
542 | 0 | } |
543 | | |
544 | 118k | rc = set_default_flags (&flags); |
545 | 118k | if (rc != IDN2_OK) |
546 | 0 | return rc; |
547 | | |
548 | 118k | if (!(flags & IDN2_NO_TR46)) |
549 | 118k | { |
550 | 118k | uint8_t *out = NULL; |
551 | | |
552 | 118k | rc = _tr46 (src, &out, flags); |
553 | 118k | if (rc != IDN2_OK) |
554 | 72.5k | return rc; |
555 | | |
556 | 46.0k | src = src_allocated = out; |
557 | 46.0k | } |
558 | | |
559 | 46.0k | do |
560 | 74.1k | { |
561 | 74.1k | const uint8_t *end = (uint8_t *) strchrnul ((const char *) src, '.'); |
562 | | /* XXX Do we care about non-U+002E dots such as U+3002, U+FF0E |
563 | | and U+FF61 here? Perhaps when IDN2_NFC_INPUT? */ |
564 | 74.1k | size_t labellen = end - src; |
565 | 74.1k | uint8_t tmp[IDN2_LABEL_MAX_LENGTH]; |
566 | 74.1k | size_t tmplen = IDN2_LABEL_MAX_LENGTH; |
567 | | |
568 | 74.1k | rc = label (src, labellen, tmp, &tmplen, flags); |
569 | 74.1k | if (rc != IDN2_OK) |
570 | 20.3k | { |
571 | 20.3k | free (src_allocated); |
572 | 20.3k | return rc; |
573 | 20.3k | } |
574 | | |
575 | 53.8k | if (lookupnamelen + tmplen |
576 | 53.8k | > IDN2_DOMAIN_MAX_LENGTH - (tmplen == 0 && *end == '\0' ? 1 : 2)) |
577 | 1.02k | { |
578 | 1.02k | free (src_allocated); |
579 | 1.02k | return IDN2_TOO_BIG_DOMAIN; |
580 | 1.02k | } |
581 | | |
582 | 52.8k | memcpy (_lookupname + lookupnamelen, tmp, tmplen); |
583 | 52.8k | lookupnamelen += tmplen; |
584 | | |
585 | 52.8k | if (*end == '.') |
586 | 28.1k | { |
587 | 28.1k | if (lookupnamelen + 1 > IDN2_DOMAIN_MAX_LENGTH) |
588 | 0 | { |
589 | 0 | free (src_allocated); |
590 | 0 | return IDN2_TOO_BIG_DOMAIN; |
591 | 0 | } |
592 | | |
593 | 28.1k | _lookupname[lookupnamelen] = '.'; |
594 | 28.1k | lookupnamelen++; |
595 | 28.1k | } |
596 | 52.8k | _lookupname[lookupnamelen] = '\0'; |
597 | | |
598 | 52.8k | src = end; |
599 | 52.8k | } |
600 | 52.8k | while (*src++); |
601 | | |
602 | 24.6k | free (src_allocated); |
603 | | |
604 | 24.6k | if (lookupname) |
605 | 24.6k | { |
606 | 24.6k | uint8_t *tmp = (uint8_t *) malloc (lookupnamelen + 1); |
607 | | |
608 | 24.6k | if (tmp == NULL) |
609 | 0 | return IDN2_MALLOC; |
610 | | |
611 | 24.6k | memcpy (tmp, _lookupname, lookupnamelen + 1); |
612 | 24.6k | *lookupname = tmp; |
613 | 24.6k | } |
614 | | |
615 | 24.6k | return IDN2_OK; |
616 | 24.6k | } |
617 | | |
618 | | /** |
619 | | * idn2_lookup_ul: |
620 | | * @src: input zero-terminated locale encoded string. |
621 | | * @lookupname: newly allocated output variable with name to lookup in DNS. |
622 | | * @flags: optional #idn2_flags to modify behaviour. |
623 | | * |
624 | | * Perform IDNA2008 lookup string conversion on domain name @src, as |
625 | | * described in section 5 of RFC 5891. Note that the input is assumed |
626 | | * to be encoded in the locale's default coding system, and will be |
627 | | * transcoded to UTF-8 and NFC normalized by this function. |
628 | | * |
629 | | * Pass %IDN2_ALABEL_ROUNDTRIP in @flags to |
630 | | * convert any input A-labels to U-labels and perform additional |
631 | | * testing. This is default since version 2.2. |
632 | | * To switch this behavior off, pass IDN2_NO_ALABEL_ROUNDTRIP |
633 | | * |
634 | | * Pass %IDN2_TRANSITIONAL to enable Unicode TR46 transitional processing, |
635 | | * and %IDN2_NONTRANSITIONAL to enable Unicode TR46 non-transitional |
636 | | * processing. |
637 | | * |
638 | | * Multiple flags may be specified by binary or:ing them together, for |
639 | | * example %IDN2_ALABEL_ROUNDTRIP | %IDN2_NONTRANSITIONAL. |
640 | | * |
641 | | * The %IDN2_NFC_INPUT in @flags is always enabled in this function. |
642 | | * |
643 | | * After version 0.11: @lookupname may be NULL to test lookup of @src |
644 | | * without allocating memory. |
645 | | * |
646 | | * Returns: On successful conversion %IDN2_OK is returned, if |
647 | | * conversion from locale to UTF-8 fails then %IDN2_ICONV_FAIL is |
648 | | * returned, if the output domain or any label would have been too |
649 | | * long %IDN2_TOO_BIG_DOMAIN or %IDN2_TOO_BIG_LABEL is returned, or |
650 | | * another error code is returned. |
651 | | * |
652 | | * Since: 0.1 |
653 | | **/ |
654 | | int |
655 | | idn2_lookup_ul (const char *src, char **lookupname, int flags) |
656 | 0 | { |
657 | 0 | uint8_t *utf8src = NULL; |
658 | 0 | int rc; |
659 | |
|
660 | 0 | if (src) |
661 | 0 | { |
662 | 0 | const char *encoding = locale_charset (); |
663 | |
|
664 | 0 | utf8src = u8_strconv_from_encoding (src, encoding, iconveh_error); |
665 | |
|
666 | 0 | if (!utf8src) |
667 | 0 | { |
668 | 0 | if (errno == ENOMEM) |
669 | 0 | return IDN2_MALLOC; |
670 | 0 | return IDN2_ICONV_FAIL; |
671 | 0 | } |
672 | 0 | } |
673 | | |
674 | 0 | rc = idn2_lookup_u8 (utf8src, (uint8_t **) lookupname, |
675 | 0 | flags | IDN2_NFC_INPUT); |
676 | |
|
677 | 0 | free (utf8src); |
678 | |
|
679 | 0 | return rc; |
680 | 0 | } |
681 | | |
682 | | /** |
683 | | * idn2_to_ascii_4i: |
684 | | * @input: zero terminated input Unicode (UCS-4) string. |
685 | | * @inlen: number of elements in @input. |
686 | | * @output: output zero terminated string that must have room for at |
687 | | * least 63 characters plus the terminating zero. |
688 | | * @flags: optional #idn2_flags to modify behaviour. |
689 | | * |
690 | | * The ToASCII operation takes a sequence of Unicode code points that make |
691 | | * up one domain label and transforms it into a sequence of code points in |
692 | | * the ASCII range (0..7F). If ToASCII succeeds, the original sequence and |
693 | | * the resulting sequence are equivalent labels. |
694 | | * |
695 | | * It is important to note that the ToASCII operation can fail. |
696 | | * ToASCII fails if any step of it fails. If any step of the |
697 | | * ToASCII operation fails on any label in a domain name, that domain |
698 | | * name MUST NOT be used as an internationalized domain name. |
699 | | * The method for dealing with this failure is application-specific. |
700 | | * |
701 | | * The inputs to ToASCII are a sequence of code points. |
702 | | * |
703 | | * ToASCII never alters a sequence of code points that are all in the ASCII |
704 | | * range to begin with (although it could fail). Applying the ToASCII operation multiple |
705 | | * effect as applying it just once. |
706 | | * |
707 | | * The default behavior of this function (when flags are zero) is to apply |
708 | | * the IDNA2008 rules without the TR46 amendments. As the TR46 |
709 | | * non-transitional processing is nowadays ubiquitous, when unsure, it is |
710 | | * recommended to call this function with the %IDN2_NONTRANSITIONAL |
711 | | * and the %IDN2_NFC_INPUT flags for compatibility with other software. |
712 | | * |
713 | | * Warning: With version 2.1.1 until before version 2.3.5 this |
714 | | * function was deprecated in favor idn2_to_ascii_4i2(). We still |
715 | | * encourage you to use idn2_to_ascii_4i2() when appropriate. |
716 | | * |
717 | | * Returns: On successful conversion %IDN2_OK is returned; if the |
718 | | * output label would have been too long %IDN2_TOO_BIG_LABEL is |
719 | | * returned, or another error code is returned. |
720 | | * |
721 | | * Since: 2.0.0 |
722 | | **/ |
723 | | int |
724 | | idn2_to_ascii_4i (const uint32_t *input, size_t inlen, char *output, |
725 | | int flags) |
726 | 0 | { |
727 | 0 | char *out; |
728 | 0 | int rc; |
729 | |
|
730 | 0 | if (!input) |
731 | 0 | { |
732 | 0 | if (output) |
733 | 0 | *output = 0; |
734 | 0 | return IDN2_OK; |
735 | 0 | } |
736 | | |
737 | 0 | rc = idn2_to_ascii_4i2 (input, inlen, &out, flags); |
738 | 0 | if (rc == IDN2_OK) |
739 | 0 | { |
740 | 0 | size_t len = strlen (out); |
741 | |
|
742 | 0 | if (len > IDN2_LABEL_MAX_LENGTH) |
743 | 0 | rc = IDN2_TOO_BIG_LABEL; |
744 | 0 | else if (output) |
745 | 0 | strcpy (output, out); |
746 | |
|
747 | 0 | free (out); |
748 | 0 | } |
749 | |
|
750 | 0 | return rc; |
751 | 0 | } |
752 | | |
753 | | /** |
754 | | * idn2_to_ascii_4i2: |
755 | | * @input: zero terminated input Unicode (UCS-4) string. |
756 | | * @inlen: number of elements in @input. |
757 | | * @output: pointer to newly allocated zero-terminated output string. |
758 | | * @flags: optional #idn2_flags to modify behaviour. |
759 | | * |
760 | | * The ToASCII operation takes a sequence of Unicode code points that make |
761 | | * up one domain label and transforms it into a sequence of code points in |
762 | | * the ASCII range (0..7F). If ToASCII succeeds, the original sequence and |
763 | | * the resulting sequence are equivalent labels. |
764 | | * |
765 | | * It is important to note that the ToASCII operation can fail. |
766 | | * ToASCII fails if any step of it fails. If any step of the |
767 | | * ToASCII operation fails on any label in a domain name, that domain |
768 | | * name MUST NOT be used as an internationalized domain name. |
769 | | * The method for dealing with this failure is application-specific. |
770 | | * |
771 | | * The inputs to ToASCII are a sequence of code points. |
772 | | * |
773 | | * ToASCII never alters a sequence of code points that are all in the ASCII |
774 | | * range to begin with (although it could fail). Applying the ToASCII operation multiple |
775 | | * effect as applying it just once. |
776 | | * |
777 | | * The default behavior of this function (when flags are zero) is to apply |
778 | | * the IDNA2008 rules without the TR46 amendments. As the TR46 |
779 | | * non-transitional processing is nowadays ubiquitous, when unsure, it is |
780 | | * recommended to call this function with the %IDN2_NONTRANSITIONAL |
781 | | * and the %IDN2_NFC_INPUT flags for compatibility with other software. |
782 | | * |
783 | | * Returns: On successful conversion %IDN2_OK is returned; if the |
784 | | * output label would have been too long %IDN2_TOO_BIG_LABEL is |
785 | | * returned, or another error code is returned. |
786 | | * |
787 | | * Since: 2.1.1 |
788 | | **/ |
789 | | int |
790 | | idn2_to_ascii_4i2 (const uint32_t *input, size_t inlen, char **output, |
791 | | int flags) |
792 | 0 | { |
793 | 0 | uint32_t *input_u32; |
794 | 0 | uint8_t *input_u8, *output_u8; |
795 | 0 | size_t length; |
796 | 0 | int rc; |
797 | |
|
798 | 0 | if (!input) |
799 | 0 | { |
800 | 0 | if (output) |
801 | 0 | *output = NULL; |
802 | 0 | return IDN2_OK; |
803 | 0 | } |
804 | | |
805 | 0 | input_u32 = (uint32_t *) malloc ((inlen + 1) * sizeof (uint32_t)); |
806 | 0 | if (!input_u32) |
807 | 0 | return IDN2_MALLOC; |
808 | | |
809 | 0 | u32_cpy (input_u32, input, inlen); |
810 | 0 | input_u32[inlen] = 0; |
811 | |
|
812 | 0 | input_u8 = u32_to_u8 (input_u32, inlen + 1, NULL, &length); |
813 | 0 | free (input_u32); |
814 | 0 | if (!input_u8) |
815 | 0 | { |
816 | 0 | if (errno == ENOMEM) |
817 | 0 | return IDN2_MALLOC; |
818 | 0 | return IDN2_ENCODING_ERROR; |
819 | 0 | } |
820 | | |
821 | 0 | rc = idn2_lookup_u8 (input_u8, &output_u8, flags); |
822 | 0 | free (input_u8); |
823 | |
|
824 | 0 | if (rc == IDN2_OK) |
825 | 0 | { |
826 | 0 | if (output) |
827 | 0 | *output = (char *) output_u8; |
828 | 0 | else |
829 | 0 | free (output_u8); |
830 | 0 | } |
831 | |
|
832 | 0 | return rc; |
833 | 0 | } |
834 | | |
835 | | /** |
836 | | * idn2_to_ascii_4z: |
837 | | * @input: zero terminated input Unicode (UCS-4) string. |
838 | | * @output: pointer to newly allocated zero-terminated output string. |
839 | | * @flags: optional #idn2_flags to modify behaviour. |
840 | | * |
841 | | * Convert UCS-4 domain name to ASCII string using the IDNA2008 |
842 | | * rules. The domain name may contain several labels, separated by dots. |
843 | | * The output buffer must be deallocated by the caller. |
844 | | * |
845 | | * The default behavior of this function (when flags are zero) is to apply |
846 | | * the IDNA2008 rules without the TR46 amendments. As the TR46 |
847 | | * non-transitional processing is nowadays ubiquitous, when unsure, it is |
848 | | * recommended to call this function with the %IDN2_NONTRANSITIONAL |
849 | | * and the %IDN2_NFC_INPUT flags for compatibility with other software. |
850 | | * |
851 | | * Return value: Returns %IDN2_OK on success, or error code. |
852 | | * |
853 | | * Since: 2.0.0 |
854 | | **/ |
855 | | int |
856 | | idn2_to_ascii_4z (const uint32_t *input, char **output, int flags) |
857 | 0 | { |
858 | 0 | uint8_t *input_u8; |
859 | 0 | size_t length; |
860 | 0 | int rc; |
861 | |
|
862 | 0 | if (!input) |
863 | 0 | { |
864 | 0 | if (output) |
865 | 0 | *output = NULL; |
866 | 0 | return IDN2_OK; |
867 | 0 | } |
868 | | |
869 | 0 | input_u8 = u32_to_u8 (input, u32_strlen (input) + 1, NULL, &length); |
870 | 0 | if (!input_u8) |
871 | 0 | { |
872 | 0 | if (errno == ENOMEM) |
873 | 0 | return IDN2_MALLOC; |
874 | 0 | return IDN2_ENCODING_ERROR; |
875 | 0 | } |
876 | | |
877 | 0 | rc = idn2_lookup_u8 (input_u8, (uint8_t **) output, flags); |
878 | 0 | free (input_u8); |
879 | |
|
880 | 0 | return rc; |
881 | 0 | } |
882 | | |
/**
 * idn2_to_ascii_8z:
 * @input: zero terminated input UTF-8 string.
 * @output: pointer to newly allocated output string.
 * @flags: optional #idn2_flags to modify behaviour.
 *
 * Convert UTF-8 domain name to ASCII string using the IDNA2008
 * rules.  The domain name may contain several labels, separated by dots.
 * The output buffer must be deallocated by the caller.
 *
 * This is a thin wrapper around idn2_lookup_u8().
 *
 * When @flags contains neither %IDN2_NO_TR46 nor %IDN2_TRANSITIONAL
 * (for example when @flags is zero), TR46 non-transitional processing
 * is applied; pass %IDN2_NO_TR46 for IDNA2008 processing without the
 * TR46 amendments.  Passing %IDN2_NONTRANSITIONAL and %IDN2_NFC_INPUT
 * explicitly is recommended for compatibility with other software.
 *
 * Return value: Returns %IDN2_OK on success, or error code.
 *
 * Since: 2.0.0
 **/
int
idn2_to_ascii_8z (const char *input, char **output, int flags)
{
  const uint8_t *utf8 = (const uint8_t *) input;
  uint8_t **result = (uint8_t **) output;

  return idn2_lookup_u8 (utf8, result, flags);
}
908 | | |
/**
 * idn2_to_ascii_lz:
 * @input: zero terminated input string, encoded in the locale's charset.
 * @output: pointer to newly allocated output string.
 * @flags: optional #idn2_flags to modify behaviour.
 *
 * Convert a domain name in locale's encoding to ASCII string using the IDNA2008
 * rules.  The domain name may contain several labels, separated by dots.
 * The output buffer must be deallocated by the caller.
 *
 * This is a thin wrapper around idn2_lookup_ul(), which always enables
 * %IDN2_NFC_INPUT.
 *
 * When @flags contains neither %IDN2_NO_TR46 nor %IDN2_TRANSITIONAL
 * (for example when @flags is zero), TR46 non-transitional processing
 * is applied; pass %IDN2_NO_TR46 for IDNA2008 processing without the
 * TR46 amendments.  Passing %IDN2_NONTRANSITIONAL explicitly is
 * recommended for compatibility with other software.
 *
 * Returns: %IDN2_OK on success, or error code.
 * Same as described in idn2_lookup_ul() documentation.
 *
 * Since: 2.0.0
 **/
int
idn2_to_ascii_lz (const char *input, char **output, int flags)
{
  int rc = idn2_lookup_ul (input, output, flags);

  return rc;
}