Line | Count | Source (jump to first uncovered line) |
1 | | /* tld.c --- Declarations for TLD restriction checking. |
2 | | Copyright (C) 2004-2025 Simon Josefsson. |
3 | | Copyright (C) 2003-2025 Free Software Foundation, Inc. |
4 | | |
5 | | Author: Thomas Jacob, Internet24.de |
6 | | |
7 | | This file is part of GNU Libidn. |
8 | | |
9 | | GNU Libidn is free software: you can redistribute it and/or |
10 | | modify it under the terms of either: |
11 | | |
12 | | * the GNU Lesser General Public License as published by the Free |
13 | | Software Foundation; either version 3 of the License, or (at |
14 | | your option) any later version. |
15 | | |
16 | | or |
17 | | |
18 | | * the GNU General Public License as published by the Free |
19 | | Software Foundation; either version 2 of the License, or (at |
20 | | your option) any later version. |
21 | | |
22 | | or both in parallel, as here. |
23 | | |
24 | | GNU Libidn is distributed in the hope that it will be useful, |
25 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
26 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
27 | | General Public License for more details. |
28 | | |
29 | | You should have received copies of the GNU General Public License and |
30 | | the GNU Lesser General Public License along with this program. If |
31 | | not, see <https://www.gnu.org/licenses/>. */ |
32 | | |
33 | | #include <config.h> |
34 | | |
35 | | /* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */ |
36 | | #include <stringprep.h> |
37 | | |
38 | | /* Get strcmp(). */ |
39 | | #include <string.h> |
40 | | |
41 | | /* Get specifications. */ |
42 | | #include <tld.h> |
43 | | |
44 | | /* Array of built-in domain restriction structures. See tlds.c. */ |
45 | | extern const Tld_table *_tld_tables[]; |
46 | | |
47 | | /** |
48 | | * tld_get_table: |
49 | | * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string. |
50 | | * @tables: Zero terminated array of #Tld_table info-structures for |
51 | | * TLDs. |
52 | | * |
53 | | * Get the TLD table for a named TLD by searching through the given |
54 | | * TLD table array. |
55 | | * |
56 | | * Return value: Return structure corresponding to TLD @tld by going |
57 | | * thru @tables, or return %NULL if no such structure is found. |
58 | | */ |
59 | | const Tld_table * |
60 | | tld_get_table (const char *tld, const Tld_table **tables) |
61 | 2.93k | { |
62 | 2.93k | const Tld_table **tldtable = NULL; |
63 | | |
64 | 2.93k | if (!tld || !tables) |
65 | 0 | return NULL; |
66 | | |
67 | 3.41k | for (tldtable = tables; *tldtable; tldtable++) |
68 | 3.27k | if (!strcmp ((*tldtable)->name, tld)) |
69 | 2.79k | return *tldtable; |
70 | | |
71 | 140 | return NULL; |
72 | 2.93k | } |
73 | | |
74 | | /** |
75 | | * tld_default_table: |
76 | | * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string. |
77 | | * @overrides: Additional zero terminated array of #Tld_table |
78 | | * info-structures for TLDs, or %NULL to only use library default |
79 | | * tables. |
80 | | * |
81 | | * Get the TLD table for a named TLD, using the internal defaults, |
82 | | * possibly overridden by the (optional) supplied tables. |
83 | | * |
84 | | * Return value: Return structure corresponding to TLD @tld_str, first |
85 | | * looking through @overrides then thru built-in list, or %NULL if |
86 | | * no such structure found. |
87 | | */ |
88 | | const Tld_table * |
89 | | tld_default_table (const char *tld, const Tld_table **overrides) |
90 | 2.93k | { |
91 | 2.93k | const Tld_table *tldtable = NULL; |
92 | | |
93 | 2.93k | if (!tld) |
94 | 0 | return NULL; |
95 | | |
96 | 2.93k | if (overrides) |
97 | 0 | tldtable = tld_get_table (tld, overrides); |
98 | | |
99 | 2.93k | if (!tldtable) |
100 | 2.93k | tldtable = tld_get_table (tld, _tld_tables); |
101 | | |
102 | 2.93k | return tldtable; |
103 | 2.93k | } |
104 | | |
/* True if code point C is one of the four label separators handled
   here: U+002E (full stop), U+3002 (ideographic full stop), U+FF0E
   (fullwidth full stop) or U+FF61 (halfwidth ideographic full stop).
   C is evaluated more than once, so it must not have side effects. */
#define DOTP(c) \
  ((c) == 0x002E || (c) == 0x3002 || (c) == 0xFF0E || (c) == 0xFF61)
107 | | |
108 | | /** |
109 | | * tld_get_4: |
110 | | * @in: Array of unicode code points to process. Does not need to be |
111 | | * zero terminated. |
112 | | * @inlen: Number of unicode code points. |
113 | | * @out: Zero terminated ascii result string pointer. |
114 | | * |
115 | | * Isolate the top-level domain of @in and return it as an ASCII |
116 | | * string in @out. |
117 | | * |
118 | | * Return value: Return %TLD_SUCCESS on success, or the corresponding |
119 | | * #Tld_rc error code otherwise. |
120 | | */ |
121 | | int |
122 | | tld_get_4 (const uint32_t *in, size_t inlen, char **out) |
123 | 7.23k | { |
124 | 7.23k | const uint32_t *ipos; |
125 | 7.23k | size_t olen; |
126 | | |
127 | 7.23k | *out = NULL; |
128 | 7.23k | if (!in || inlen == 0) |
129 | 317 | return TLD_NODATA; |
130 | | |
131 | 6.91k | ipos = &in[inlen - 1]; |
132 | 6.91k | olen = 0; |
133 | | /* Scan backwards for non(latin)letters. */ |
134 | 30.9k | while (ipos >= in && ((*ipos >= 0x41 && *ipos <= 0x5A) || |
135 | 30.7k | (*ipos >= 0x61 && *ipos <= 0x7A))) |
136 | 24.0k | ipos--, olen++; |
137 | | |
138 | 6.91k | if (olen > 0 && ipos >= in && DOTP (*ipos)) |
139 | 786 | { |
140 | | /* Found something that appears a TLD. */ |
141 | 786 | char *out_s = malloc (sizeof (char) * (olen + 1)); |
142 | 786 | char *opos = out_s; |
143 | | |
144 | 786 | if (!opos) |
145 | 0 | return TLD_MALLOC_ERROR; |
146 | | |
147 | 786 | ipos++; |
148 | | /* Transcribe to lowercase ascii string. */ |
149 | 8.01k | for (; ipos < &in[inlen]; ipos++, opos++) |
150 | 7.22k | *opos = *ipos > 0x5A ? *ipos : *ipos + 0x20; |
151 | 786 | *opos = 0; |
152 | 786 | *out = out_s; |
153 | 786 | return TLD_SUCCESS; |
154 | 786 | } |
155 | | |
156 | 6.12k | return TLD_NO_TLD; |
157 | 6.91k | } |
158 | | |
159 | | /** |
160 | | * tld_get_4z: |
161 | | * @in: Zero terminated array of unicode code points to process. |
162 | | * @out: Zero terminated ascii result string pointer. |
163 | | * |
164 | | * Isolate the top-level domain of @in and return it as an ASCII |
165 | | * string in @out. |
166 | | * |
167 | | * Return value: Return %TLD_SUCCESS on success, or the corresponding |
168 | | * #Tld_rc error code otherwise. |
169 | | */ |
170 | | int |
171 | | tld_get_4z (const uint32_t *in, char **out) |
172 | 1.21k | { |
173 | 1.21k | const uint32_t *ipos = in; |
174 | | |
175 | 1.21k | if (!in) |
176 | 0 | return TLD_NODATA; |
177 | | |
178 | 11.3k | while (*ipos) |
179 | 10.1k | ipos++; |
180 | | |
181 | 1.21k | return tld_get_4 (in, ipos - in, out); |
182 | 1.21k | } |
183 | | |
184 | | /** |
185 | | * tld_get_z: |
186 | | * @in: Zero terminated character array to process. |
187 | | * @out: Zero terminated ascii result string pointer. |
188 | | * |
189 | | * Isolate the top-level domain of @in and return it as an ASCII |
190 | | * string in @out. The input string @in may be UTF-8, ISO-8859-1 or |
191 | | * any ASCII compatible character encoding. |
192 | | * |
193 | | * Return value: Return %TLD_SUCCESS on success, or the corresponding |
194 | | * #Tld_rc error code otherwise. |
195 | | */ |
196 | | int |
197 | | tld_get_z (const char *in, char **out) |
198 | 2.46k | { |
199 | 2.46k | uint32_t *iucs; |
200 | 2.46k | size_t i, ilen; |
201 | 2.46k | int rc; |
202 | | |
203 | 2.46k | ilen = strlen (in); |
204 | 2.46k | iucs = calloc (ilen, sizeof (*iucs)); |
205 | | |
206 | 2.46k | if (!iucs) |
207 | 0 | return TLD_MALLOC_ERROR; |
208 | | |
209 | 84.5k | for (i = 0; i < ilen; i++) |
210 | 82.0k | iucs[i] = in[i]; |
211 | | |
212 | 2.46k | rc = tld_get_4 (iucs, ilen, out); |
213 | | |
214 | 2.46k | free (iucs); |
215 | | |
216 | 2.46k | return rc; |
217 | 2.46k | } |
218 | | |
219 | | /* |
220 | | * tld_checkchar - verify that character is permitted |
221 | | * @ch: 32 bit unicode character to check. |
222 | | * @tld: A #Tld_table data structure to check @ch against. |
223 | | * |
224 | | * Verify if @ch is either in [a-z0-9-.] or mentioned as a valid |
225 | | * character in @tld. |
226 | | * |
227 | | * Return value: Return the #Tld_rc value %TLD_SUCCESS if @ch is a |
228 | | * valid character for the TLD @tld or if @tld is %NULL, |
229 | | * %TLD_INVALID if @ch is invalid as defined by @tld. |
230 | | */ |
231 | | static int |
232 | | _tld_checkchar (uint32_t ch, const Tld_table *tld) |
233 | 8.72k | { |
234 | 8.72k | const Tld_table_element *s, *e, *m; |
235 | | |
236 | 8.72k | if (!tld) |
237 | 0 | return TLD_SUCCESS; |
238 | | |
239 | | /* Check for [-a-z0-9.]. */ |
240 | 8.72k | if ((ch >= 0x61 && ch <= 0x7A) || |
241 | 8.72k | (ch >= 0x30 && ch <= 0x39) || ch == 0x2D || DOTP (ch)) |
242 | 6.02k | return TLD_SUCCESS; |
243 | | |
244 | 2.70k | s = tld->valid; |
245 | 2.70k | e = s + tld->nvalid; |
246 | 10.2k | while (s < e) |
247 | 8.85k | { |
248 | 8.85k | m = s + ((e - s) >> 1); |
249 | 8.85k | if (ch < m->start) |
250 | 1.93k | e = m; |
251 | 6.92k | else if (ch > m->end) |
252 | 5.60k | s = m + 1; |
253 | 1.31k | else |
254 | 1.31k | return TLD_SUCCESS; |
255 | 8.85k | } |
256 | | |
257 | 1.38k | return TLD_INVALID; |
258 | 2.70k | } |
259 | | |
260 | | /** |
261 | | * tld_check_4t: |
262 | | * @in: Array of unicode code points to process. Does not need to be |
263 | | * zero terminated. |
264 | | * @inlen: Number of unicode code points. |
265 | | * @errpos: Position of offending character is returned here. |
266 | | * @tld: A #Tld_table data structure representing the restrictions for |
267 | | * which the input should be tested. |
268 | | * |
269 | | * Test each of the code points in @in for whether or not |
270 | | * they are allowed by the data structure in @tld, return |
271 | | * the position of the first character for which this is not |
272 | | * the case in @errpos. |
273 | | * |
274 | | * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code |
275 | | * points are valid or when @tld is null, %TLD_INVALID if a |
276 | | * character is not allowed, or additional error codes on general |
277 | | * failure conditions. |
278 | | */ |
279 | | int |
280 | | tld_check_4t (const uint32_t *in, size_t inlen, size_t *errpos, |
281 | | const Tld_table *tld) |
282 | 1.68k | { |
283 | 1.68k | const uint32_t *ipos; |
284 | 1.68k | int rc; |
285 | | |
286 | 1.68k | if (!tld) /* No data for TLD so everything is valid. */ |
287 | 140 | return TLD_SUCCESS; |
288 | | |
289 | 1.54k | ipos = in; |
290 | 8.88k | while (ipos < &in[inlen]) |
291 | 8.72k | { |
292 | 8.72k | rc = _tld_checkchar (*ipos, tld); |
293 | 8.72k | if (rc != TLD_SUCCESS) |
294 | 1.38k | { |
295 | 1.38k | if (errpos) |
296 | 1.38k | *errpos = ipos - in; |
297 | 1.38k | return rc; |
298 | 1.38k | } |
299 | 7.33k | ipos++; |
300 | 7.33k | } |
301 | 156 | return TLD_SUCCESS; |
302 | 1.54k | } |
303 | | |
304 | | /** |
305 | | * tld_check_4tz: |
306 | | * @in: Zero terminated array of unicode code points to process. |
307 | | * @errpos: Position of offending character is returned here. |
308 | | * @tld: A #Tld_table data structure representing the restrictions for |
309 | | * which the input should be tested. |
310 | | * |
311 | | * Test each of the code points in @in for whether or not |
312 | | * they are allowed by the data structure in @tld, return |
313 | | * the position of the first character for which this is not |
314 | | * the case in @errpos. |
315 | | * |
316 | | * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code |
317 | | * points are valid or when @tld is null, %TLD_INVALID if a |
318 | | * character is not allowed, or additional error codes on general |
319 | | * failure conditions. |
320 | | */ |
321 | | int |
322 | | tld_check_4tz (const uint32_t *in, size_t *errpos, const Tld_table *tld) |
323 | 1.21k | { |
324 | 1.21k | const uint32_t *ipos = in; |
325 | | |
326 | 1.21k | if (!ipos) |
327 | 0 | return TLD_NODATA; |
328 | | |
329 | 11.3k | while (*ipos) |
330 | 10.1k | ipos++; |
331 | | |
332 | 1.21k | return tld_check_4t (in, ipos - in, errpos, tld); |
333 | 1.21k | } |
334 | | |
335 | | /** |
336 | | * tld_check_4: |
337 | | * @in: Array of unicode code points to process. Does not need to be |
338 | | * zero terminated. |
339 | | * @inlen: Number of unicode code points. |
340 | | * @errpos: Position of offending character is returned here. |
341 | | * @overrides: A #Tld_table array of additional domain restriction |
342 | | * structures that complement and supersede the built-in information. |
343 | | * |
344 | | * Test each of the code points in @in for whether or not they are |
345 | | * allowed by the information in @overrides or by the built-in TLD |
346 | | * restriction data. When data for the same TLD is available both |
347 | | * internally and in @overrides, the information in @overrides takes |
348 | | * precedence. If several entries for a specific TLD are found, the |
349 | | * first one is used. If @overrides is %NULL, only the built-in |
350 | | * information is used. The position of the first offending character |
351 | | * is returned in @errpos. |
352 | | * |
353 | | * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code |
354 | | * points are valid or when @tld is null, %TLD_INVALID if a |
355 | | * character is not allowed, or additional error codes on general |
356 | | * failure conditions. |
357 | | */ |
358 | | int |
359 | | tld_check_4 (const uint32_t *in, size_t inlen, size_t *errpos, |
360 | | const Tld_table **overrides) |
361 | 3.55k | { |
362 | 3.55k | const Tld_table *tld; |
363 | 3.55k | char *domain; |
364 | 3.55k | int rc; |
365 | | |
366 | 3.55k | if (errpos) |
367 | 3.55k | *errpos = 0; |
368 | | |
369 | | /* Get TLD name. */ |
370 | 3.55k | rc = tld_get_4 (in, inlen, &domain); |
371 | | |
372 | 3.55k | if (rc != TLD_SUCCESS) |
373 | 3.08k | { |
374 | 3.08k | if (rc == TLD_NO_TLD) /* No TLD, say OK */ |
375 | 2.87k | return TLD_SUCCESS; |
376 | 211 | else |
377 | 211 | return rc; |
378 | 3.08k | } |
379 | | |
380 | | /* Retrieve appropriate data structure. */ |
381 | 467 | tld = tld_default_table (domain, overrides); |
382 | 467 | free (domain); |
383 | | |
384 | 467 | return tld_check_4t (in, inlen, errpos, tld); |
385 | 3.55k | } |
386 | | |
387 | | /** |
388 | | * tld_check_4z: |
389 | | * @in: Zero-terminated array of unicode code points to process. |
390 | | * @errpos: Position of offending character is returned here. |
391 | | * @overrides: A #Tld_table array of additional domain restriction |
392 | | * structures that complement and supersede the built-in information. |
393 | | * |
394 | | * Test each of the code points in @in for whether or not they are |
395 | | * allowed by the information in @overrides or by the built-in TLD |
396 | | * restriction data. When data for the same TLD is available both |
397 | | * internally and in @overrides, the information in @overrides takes |
398 | | * precedence. If several entries for a specific TLD are found, the |
399 | | * first one is used. If @overrides is %NULL, only the built-in |
400 | | * information is used. The position of the first offending character |
401 | | * is returned in @errpos. |
402 | | * |
403 | | * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code |
404 | | * points are valid or when @tld is null, %TLD_INVALID if a |
405 | | * character is not allowed, or additional error codes on general |
406 | | * failure conditions. |
407 | | */ |
408 | | int |
409 | | tld_check_4z (const uint32_t *in, size_t *errpos, const Tld_table **overrides) |
410 | 1.21k | { |
411 | 1.21k | const uint32_t *ipos = in; |
412 | | |
413 | 1.21k | if (!ipos) |
414 | 0 | return TLD_NODATA; |
415 | | |
416 | 11.3k | while (*ipos) |
417 | 10.1k | ipos++; |
418 | | |
419 | 1.21k | return tld_check_4 (in, ipos - in, errpos, overrides); |
420 | 1.21k | } |
421 | | |
422 | | /** |
423 | | * tld_check_8z: |
424 | | * @in: Zero-terminated UTF8 string to process. |
425 | | * @errpos: Position of offending character is returned here. |
426 | | * @overrides: A #Tld_table array of additional domain restriction |
427 | | * structures that complement and supersede the built-in information. |
428 | | * |
429 | | * Test each of the characters in @in for whether or not they are |
430 | | * allowed by the information in @overrides or by the built-in TLD |
431 | | * restriction data. When data for the same TLD is available both |
432 | | * internally and in @overrides, the information in @overrides takes |
433 | | * precedence. If several entries for a specific TLD are found, the |
434 | | * first one is used. If @overrides is %NULL, only the built-in |
435 | | * information is used. The position of the first offending character |
436 | | * is returned in @errpos. Note that the error position refers to the |
437 | | * decoded character offset rather than the byte position in the |
438 | | * string. |
439 | | * |
440 | | * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all |
441 | | * characters are valid or when @tld is null, %TLD_INVALID if a |
442 | | * character is not allowed, or additional error codes on general |
443 | | * failure conditions. |
444 | | */ |
445 | | int |
446 | | tld_check_8z (const char *in, size_t *errpos, const Tld_table **overrides) |
447 | 2.99k | { |
448 | 2.99k | uint32_t *iucs; |
449 | 2.99k | size_t ilen; |
450 | 2.99k | int rc; |
451 | | |
452 | 2.99k | if (!in) |
453 | 0 | return TLD_NODATA; |
454 | | |
455 | 2.99k | iucs = stringprep_utf8_to_ucs4 (in, -1, &ilen); |
456 | | |
457 | 2.99k | if (!iucs) |
458 | 661 | return TLD_MALLOC_ERROR; |
459 | | |
460 | 2.33k | rc = tld_check_4 (iucs, ilen, errpos, overrides); |
461 | | |
462 | 2.33k | free (iucs); |
463 | | |
464 | 2.33k | return rc; |
465 | 2.99k | } |
466 | | |
467 | | /** |
468 | | * tld_check_lz: |
469 | | * @in: Zero-terminated string in the current locales encoding to process. |
470 | | * @errpos: Position of offending character is returned here. |
471 | | * @overrides: A #Tld_table array of additional domain restriction |
472 | | * structures that complement and supersede the built-in information. |
473 | | * |
474 | | * Test each of the characters in @in for whether or not they are |
475 | | * allowed by the information in @overrides or by the built-in TLD |
476 | | * restriction data. When data for the same TLD is available both |
477 | | * internally and in @overrides, the information in @overrides takes |
478 | | * precedence. If several entries for a specific TLD are found, the |
479 | | * first one is used. If @overrides is %NULL, only the built-in |
480 | | * information is used. The position of the first offending character |
481 | | * is returned in @errpos. Note that the error position refers to the |
482 | | * decoded character offset rather than the byte position in the |
483 | | * string. |
484 | | * |
485 | | * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all |
486 | | * characters are valid or when @tld is null, %TLD_INVALID if a |
487 | | * character is not allowed, or additional error codes on general |
488 | | * failure conditions. |
489 | | */ |
490 | | int |
491 | | tld_check_lz (const char *in, size_t *errpos, const Tld_table **overrides) |
492 | 2.46k | { |
493 | 2.46k | char *utf8; |
494 | 2.46k | int rc; |
495 | | |
496 | 2.46k | if (!in) |
497 | 0 | return TLD_NODATA; |
498 | | |
499 | 2.46k | utf8 = stringprep_locale_to_utf8 (in); |
500 | 2.46k | if (!utf8) |
501 | 1.93k | return TLD_ICONV_ERROR; |
502 | | |
503 | | |
504 | 529 | rc = tld_check_8z (utf8, errpos, overrides); |
505 | | |
506 | 529 | free (utf8); |
507 | | |
508 | 529 | return rc; |
509 | 2.46k | } |
510 | | |
511 | | /** |
512 | | * Tld_rc: |
513 | | * @TLD_SUCCESS: Successful operation. This value is guaranteed to |
514 | | * always be zero, the remaining ones are only guaranteed to hold |
515 | | * non-zero values, for logical comparison purposes. |
516 | | * @TLD_INVALID: Invalid character found. |
517 | | * @TLD_NODATA: No input data was provided. |
518 | | * @TLD_MALLOC_ERROR: Error during memory allocation. |
519 | | * @TLD_ICONV_ERROR: Character encoding conversion error. |
520 | | * @TLD_NO_TLD: No top-level domain found in domain string. |
521 | | * @TLD_NOTLD: Same as @TLD_NO_TLD, for compatibility |
522 | | * with typo in earlier versions. |
523 | | * |
524 | | * Enumerated return codes of the TLD checking functions. |
525 | | * The value 0 is guaranteed to always correspond to success. |
526 | | */ |