Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2012 Tim Ruehsen |
3 | | * Copyright (c) 2015-2024 Free Software Foundation, Inc. |
4 | | * |
5 | | * This file is part of libwget. |
6 | | * |
7 | | * Libwget is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as published by |
9 | | * the Free Software Foundation, either version 3 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * Libwget is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libwget. If not, see <https://www.gnu.org/licenses/>. |
19 | | * |
20 | | * |
21 | | * URI/IRI routines |
22 | | * about encoding see http://nikitathespider.com/articles/EncodingDivination.html |
23 | | * about GET encoding see https://stackoverflow.com/questions/1549213/whats-the-correct-encoding-of-http-get-request-strings |
24 | | * RFC 3986: URI generic syntax |
25 | | * |
26 | | * |
27 | | * Changelog |
28 | | * 25.04.2012 Tim Ruehsen created |
29 | | * |
30 | | */ |
31 | | |
32 | | #include <config.h> |
33 | | |
34 | | #include <string.h> |
35 | | #include <errno.h> |
36 | | #include "c-ctype.h" |
37 | | |
38 | | #include <wget.h> |
39 | | #include "private.h" |
40 | | |
41 | | static char *create_safe_uri(wget_iri *iri); |
42 | | |
43 | | /** |
44 | | * \file |
45 | | * \brief Functions to work with URIs and IRIs |
46 | | * \defgroup libwget-iri URIs/IRIs |
47 | | * |
48 | | * @{ |
49 | | * |
50 | | * URI/IRI parsing and manipulation functions. |
51 | | * |
52 | | * IRIs are processed according to [RFC 3987](https://datatracker.ietf.org/doc/rfc3987/). |
53 | | * Functions that escape certain characters (such as wget_iri_escape()) work according to |
54 | | * [RFC 3986](https://datatracker.ietf.org/doc/rfc3986/). |
55 | | * |
56 | | * The \ref wget_iri_st "wget_iri" structure represents an IRI. You generate one from a string with wget_iri_parse() or |
57 | | * wget_iri_parse_base(). You can use wget_iri_clone() to generate another identical \ref wget_iri_st "wget_iri". |
58 | | * |
59 | | * You can access each of the fields of a \ref wget_iri_st "wget_iri" (such as `path`) independently, and, for convenience, |
60 | | * you can use the getters here to escape each of those parts (e.g. wget_iri_get_escaped_host(), |
61 | | * wget_iri_get_escaped_resource(), etc.). |
62 | | * |
63 | | * URIs/IRIs are all internally treated in UTF-8. The parsing functions that generate a \ref wget_iri_st "wget_iri" structure |
64 | | * (wget_iri_parse() and wget_iri_parse_base()) thus convert the input string to UTF-8 before anything else. |
65 | | * These functions take an `encoding` parameter that tells which is the original encoding of that string. |
66 | | * |
67 | | * Conversely, the getters (for example, wget_iri_get_path()) can convert the output string from UTF-8 |
68 | | * to an encoding of choice. The desired encoding is also specified in the `encoding` parameter. |
69 | | * |
70 | | * The `encoding` parameter, in all functions that accept it, is a string with the name of a character set |
71 | | * supported by GNU libiconv. You can find such a list elsewhere, but popular examples are "utf-8", "utf-16" or "iso-8859-1". |
72 | | */ |
73 | | |
74 | | static const char |
75 | | *default_page = "index.html"; |
76 | | static size_t |
77 | | default_page_length = 10; |
78 | | |
79 | | static struct iri_scheme { |
80 | | uint16_t port; |
81 | | const char name[6]; |
82 | | } schemes[] = { |
83 | | [WGET_IRI_SCHEME_HTTP] = { 80, "http" }, |
84 | | [WGET_IRI_SCHEME_HTTPS] = { 443, "https" }, |
85 | | }; |
86 | | |
87 | | static size_t WGET_GCC_NONNULL_ALL normalize_path(char *path); |
88 | | |
89 | | /** |
90 | | * \param[in] scheme Scheme to get name for |
91 | | * \return Name of \p scheme (e.g. "http" or "https") or NULL is not supported |
92 | | * |
93 | | * Maps \p scheme to it's string representation. |
94 | | */ |
95 | | const char *wget_iri_scheme_get_name(wget_iri_scheme scheme) |
96 | 0 | { |
97 | 0 | if ((unsigned) scheme < countof(schemes)) |
98 | 0 | return schemes[scheme].name; |
99 | | |
100 | 0 | return NULL; |
101 | 0 | } |
102 | | |
103 | | /** |
104 | | * \param[in] iri An IRI |
105 | | * \return 1 if the scheme is supported, 0 if not |
106 | | * |
107 | | * Tells whether the IRI's scheme is supported or not. |
108 | | */ |
109 | | bool wget_iri_supported(const wget_iri *iri) |
110 | 7.86k | { |
111 | 7.86k | return (unsigned) iri->scheme < countof(schemes); |
112 | 7.86k | } |
113 | | |
114 | | |
/* \cond _hide_internal_symbols */
// Character class bits stored in the iri_ctype[] lookup table, each with a
// macro that tests the bit for a given character. The argument is cast to
// unsigned char so that a negative `char` can never index outside the table.
#define IRI_CTYPE_GENDELIM (1<<0)
#define iri_isgendelim(c) (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_GENDELIM)

#define IRI_CTYPE_SUBDELIM (1<<1)
#define iri_issubdelim(c) (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_SUBDELIM)

#define IRI_CTYPE_UNRESERVED (1<<2)
#define iri_isunreserved(c) (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_UNRESERVED)

// True if `c` may appear in a URI scheme: ALPHA / DIGIT / "+" / "-" / ".".
// Every use of the argument is parenthesized, so passing an expression
// (e.g. a conditional expression) cannot change the meaning of the test.
// NOTE: the argument is evaluated more than once - do not pass expressions with side effects.
#define iri_isscheme(c) (c_isalnum(c) || (c) == '+' || (c) == '-' || (c) == '.')
/* \endcond */
127 | | |
128 | | static const unsigned char |
129 | | iri_ctype[256] = { |
130 | | [':'] = IRI_CTYPE_GENDELIM, |
131 | | ['/'] = IRI_CTYPE_GENDELIM, |
132 | | ['?'] = IRI_CTYPE_GENDELIM, |
133 | | ['#'] = IRI_CTYPE_GENDELIM, |
134 | | ['['] = IRI_CTYPE_GENDELIM, |
135 | | [']'] = IRI_CTYPE_GENDELIM, |
136 | | ['@'] = IRI_CTYPE_GENDELIM, |
137 | | |
138 | | ['!'] = IRI_CTYPE_SUBDELIM, |
139 | | ['$'] = IRI_CTYPE_SUBDELIM, |
140 | | ['&'] = IRI_CTYPE_SUBDELIM, |
141 | | ['\''] = IRI_CTYPE_SUBDELIM, |
142 | | ['('] = IRI_CTYPE_SUBDELIM, |
143 | | [')'] = IRI_CTYPE_SUBDELIM, |
144 | | ['*'] = IRI_CTYPE_SUBDELIM, |
145 | | ['+'] = IRI_CTYPE_SUBDELIM, |
146 | | [','] = IRI_CTYPE_SUBDELIM, |
147 | | [';'] = IRI_CTYPE_SUBDELIM, |
148 | | ['='] = IRI_CTYPE_SUBDELIM, |
149 | | |
150 | | ['0'] = IRI_CTYPE_UNRESERVED, |
151 | | ['1'] = IRI_CTYPE_UNRESERVED, |
152 | | ['2'] = IRI_CTYPE_UNRESERVED, |
153 | | ['3'] = IRI_CTYPE_UNRESERVED, |
154 | | ['4'] = IRI_CTYPE_UNRESERVED, |
155 | | ['5'] = IRI_CTYPE_UNRESERVED, |
156 | | ['6'] = IRI_CTYPE_UNRESERVED, |
157 | | ['7'] = IRI_CTYPE_UNRESERVED, |
158 | | ['8'] = IRI_CTYPE_UNRESERVED, |
159 | | ['9'] = IRI_CTYPE_UNRESERVED, |
160 | | ['a'] = IRI_CTYPE_UNRESERVED, |
161 | | ['b'] = IRI_CTYPE_UNRESERVED, |
162 | | ['c'] = IRI_CTYPE_UNRESERVED, |
163 | | ['d'] = IRI_CTYPE_UNRESERVED, |
164 | | ['e'] = IRI_CTYPE_UNRESERVED, |
165 | | ['f'] = IRI_CTYPE_UNRESERVED, |
166 | | ['g'] = IRI_CTYPE_UNRESERVED, |
167 | | ['h'] = IRI_CTYPE_UNRESERVED, |
168 | | ['i'] = IRI_CTYPE_UNRESERVED, |
169 | | ['j'] = IRI_CTYPE_UNRESERVED, |
170 | | ['k'] = IRI_CTYPE_UNRESERVED, |
171 | | ['l'] = IRI_CTYPE_UNRESERVED, |
172 | | ['m'] = IRI_CTYPE_UNRESERVED, |
173 | | ['n'] = IRI_CTYPE_UNRESERVED, |
174 | | ['o'] = IRI_CTYPE_UNRESERVED, |
175 | | ['p'] = IRI_CTYPE_UNRESERVED, |
176 | | ['q'] = IRI_CTYPE_UNRESERVED, |
177 | | ['r'] = IRI_CTYPE_UNRESERVED, |
178 | | ['s'] = IRI_CTYPE_UNRESERVED, |
179 | | ['t'] = IRI_CTYPE_UNRESERVED, |
180 | | ['u'] = IRI_CTYPE_UNRESERVED, |
181 | | ['v'] = IRI_CTYPE_UNRESERVED, |
182 | | ['w'] = IRI_CTYPE_UNRESERVED, |
183 | | ['x'] = IRI_CTYPE_UNRESERVED, |
184 | | ['y'] = IRI_CTYPE_UNRESERVED, |
185 | | ['z'] = IRI_CTYPE_UNRESERVED, |
186 | | ['A'] = IRI_CTYPE_UNRESERVED, |
187 | | ['B'] = IRI_CTYPE_UNRESERVED, |
188 | | ['C'] = IRI_CTYPE_UNRESERVED, |
189 | | ['D'] = IRI_CTYPE_UNRESERVED, |
190 | | ['E'] = IRI_CTYPE_UNRESERVED, |
191 | | ['F'] = IRI_CTYPE_UNRESERVED, |
192 | | ['G'] = IRI_CTYPE_UNRESERVED, |
193 | | ['H'] = IRI_CTYPE_UNRESERVED, |
194 | | ['I'] = IRI_CTYPE_UNRESERVED, |
195 | | ['J'] = IRI_CTYPE_UNRESERVED, |
196 | | ['K'] = IRI_CTYPE_UNRESERVED, |
197 | | ['L'] = IRI_CTYPE_UNRESERVED, |
198 | | ['M'] = IRI_CTYPE_UNRESERVED, |
199 | | ['N'] = IRI_CTYPE_UNRESERVED, |
200 | | ['O'] = IRI_CTYPE_UNRESERVED, |
201 | | ['P'] = IRI_CTYPE_UNRESERVED, |
202 | | ['Q'] = IRI_CTYPE_UNRESERVED, |
203 | | ['R'] = IRI_CTYPE_UNRESERVED, |
204 | | ['S'] = IRI_CTYPE_UNRESERVED, |
205 | | ['T'] = IRI_CTYPE_UNRESERVED, |
206 | | ['U'] = IRI_CTYPE_UNRESERVED, |
207 | | ['V'] = IRI_CTYPE_UNRESERVED, |
208 | | ['W'] = IRI_CTYPE_UNRESERVED, |
209 | | ['X'] = IRI_CTYPE_UNRESERVED, |
210 | | ['Y'] = IRI_CTYPE_UNRESERVED, |
211 | | ['Z'] = IRI_CTYPE_UNRESERVED, |
212 | | ['-'] = IRI_CTYPE_UNRESERVED, |
213 | | ['.'] = IRI_CTYPE_UNRESERVED, |
214 | | ['_'] = IRI_CTYPE_UNRESERVED, |
215 | | ['~'] = IRI_CTYPE_UNRESERVED |
216 | | }; |
217 | | |
218 | | /** |
219 | | * \param[in] c A character |
220 | | * \return 1 if \p c is a generic delimiter, 0 if not |
221 | | * |
222 | | * Tests whether \p c is a generic delimiter (gen-delim), |
223 | | * according to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2). |
224 | | */ |
225 | | bool wget_iri_isgendelim(char c) |
226 | 4.62k | { |
227 | | // return strchr(":/?#[]@",c)!=NULL; |
228 | 4.62k | return iri_isgendelim(c); |
229 | 4.62k | } |
230 | | |
231 | | /** |
232 | | * \param[in] c A character |
233 | | * \return 1 if \p c is a subcomponent delimiter, 0 if not |
234 | | * |
235 | | * Tests whether \p c is a subcomponent delimiter (sub-delim) |
236 | | * according to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2). |
237 | | */ |
238 | | bool wget_iri_issubdelim(char c) |
239 | 4.62k | { |
240 | | // return strchr("!$&\'()*+,;=",c)!=NULL; |
241 | 4.62k | return iri_issubdelim(c); |
242 | 4.62k | } |
243 | | |
244 | | /** |
245 | | * \param[in] c A character |
246 | | * \return 1 if \p c is a reserved character, 0 if not |
247 | | * |
248 | | * Tests whether \p c is a reserved character. |
249 | | * |
250 | | * According to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2), |
251 | | * the set of reserved characters is formed |
252 | | * by the generic delimiters (gen-delims, wget_iri_isgendelim()) and the |
253 | | * subcomponent delimiters (sub-delims, wget_iri_is_subdelim()). |
254 | | * |
255 | | * This function is thus equivalent to: |
256 | | * |
257 | | * return wget_iri_isgendelim(c) || wget_iri_issubdelim(c); |
258 | | * |
259 | | */ |
260 | | bool wget_iri_isreserved(char c) |
261 | 4.62k | { |
262 | 4.62k | return wget_iri_isgendelim(c) || wget_iri_issubdelim(c); |
263 | 4.62k | } |
264 | | |
265 | | /** |
266 | | * \param[in] c A character |
267 | | * \return 1 if \p c is an unreserved character, 0 if not |
268 | | * |
269 | | * Tests whether \p c is an unreserved character. |
270 | | */ |
271 | | bool wget_iri_isunreserved(char c) |
272 | 0 | { |
273 | 0 | return iri_isunreserved(c); |
274 | 0 | } |
275 | | |
276 | | static unsigned char WGET_GCC_CONST unhex(unsigned char c) |
277 | 13.4k | { |
278 | 13.4k | return c <= '9' ? c - '0' : (c <= 'F' ? c - 'A' + 10 : c - 'a' + 10); |
279 | 13.4k | } |
280 | | |
281 | | static char *iri_unescape_inline(char *src, int ctype) |
282 | 61.5k | { |
283 | 61.5k | char *ret = NULL; |
284 | 61.5k | unsigned char *s = (unsigned char *)src; // just a helper to avoid casting a lot |
285 | 61.5k | unsigned char *d = s; |
286 | | |
287 | 2.00M | while (*s) { |
288 | 1.93M | if (*s == '%') { |
289 | 3.08k | if (c_isxdigit(s[1]) && c_isxdigit(s[2])) { |
290 | 1.38k | unsigned char c = (unsigned char) (unhex(s[1]) << 4) | unhex(s[2]); |
291 | 1.38k | if (!ctype || (!(iri_ctype[(unsigned char)(c)] & ctype) && c != '%')) { |
292 | 1.38k | *d++ = c; |
293 | 1.38k | s += 3; |
294 | 1.38k | ret = src; |
295 | 1.38k | continue; |
296 | 1.38k | } |
297 | 1.38k | } |
298 | 1.93M | } else if (*s == '#') { |
299 | 11.4k | uint32_t value = 0; |
300 | | |
301 | 11.4k | if (s[1] == 'x') { |
302 | 3.34k | unsigned char *p = s + 2; |
303 | 14.0k | while (c_isxdigit(*p)) { |
304 | 10.6k | value = ((value & 0x0FFFFFFF) << 4) | unhex(*p); |
305 | 10.6k | p++; |
306 | 10.6k | } |
307 | 3.34k | if (*p == ';') { |
308 | 2.03k | if (value > 0 && value < 128) { |
309 | 510 | *d++ = (unsigned char) value; |
310 | 510 | s = p + 1; |
311 | 510 | continue; |
312 | 510 | } |
313 | | // else: we have to convert the unicode value to whatever encoding the URL is in (likely UTF-8) |
314 | | // this cannot be done inline since the URL's length may increase |
315 | 2.03k | } |
316 | 8.07k | } else { |
317 | 8.07k | unsigned char *p = s + 1; |
318 | 12.7k | while (c_isdigit(*p) && value <= 0x10FFFF) { // max. Unicode value |
319 | 4.71k | value = value * 10 + (*p - '0'); |
320 | 4.71k | p++; |
321 | 4.71k | } |
322 | 8.07k | if (*p == ';') { |
323 | 1.83k | if (value > 0 && value < 128) { |
324 | 602 | *d++ = (unsigned char) value; |
325 | 602 | s = p + 1; |
326 | 602 | continue; |
327 | 602 | } |
328 | | // else: we have to convert the unicode value to whatever encoding the URL is in (likely UTF-8) |
329 | | // this cannot be done inline since the URL's length may increase |
330 | 1.83k | } |
331 | 8.07k | } |
332 | 1.92M | } else if (*s == '\r' || *s == '\n') { |
333 | | // Ignore / remove CR and LF from URLs. See https://gitlab.com/gnuwget/wget2/-/issues/522 |
334 | 930 | s++; |
335 | 930 | continue; |
336 | 930 | } |
337 | | |
338 | 1.93M | *d++ = *s++; |
339 | 1.93M | } |
340 | 61.5k | *d = 0; |
341 | | |
342 | 61.5k | return ret; |
343 | 61.5k | } |
344 | | |
/**
 * \param[in] src A string
 * \return \p src if at least one percent-encoded sequence has been decoded, NULL otherwise
 *
 * Unescape a string. All the valid percent-encoded characters (`%XX`) are converted
 * back to their original form.
 *
 * Additionally, numeric references of the form `#NNN;` or `#xHH;` with a value
 * in the range 1..127 are replaced by the corresponding character, and
 * CR and LF characters are removed.
 *
 * **The transformation is done inline**, so `src` may be modified after this function returns,
 * even if NULL is returned (NULL just says that no percent-encoded sequence was decoded).
 */
char *wget_iri_unescape_inline(char *src)
{
	// 0: no character class is protected, every valid %XX sequence gets decoded
	const int protected_ctype = 0;

	return iri_unescape_inline(src, protected_ctype);
}
359 | | |
360 | | /** |
361 | | * \param[in] src A string |
362 | | * \return A pointer to \p src, after the transformation is done |
363 | | * |
364 | | * Unescape a string except escaped generic delimiters (and escaped '%'. |
365 | | * The percent-encoded characters (`%XX`) are converted back to their original form. |
366 | | * |
367 | | * This variant of unescaping is helpful before an URL is being parsed, so that |
368 | | * the parser recognizes e.g. 'http%3A//' as relative URL (path) and not as a scheme. |
369 | | * |
370 | | * **The transformation is done inline**, so `src` will be modified after this function returns. |
371 | | * If no characters were unescaped, the string is left untouched. |
372 | | */ |
373 | | char *wget_iri_unescape_url_inline(char *src) |
374 | 0 | { |
375 | 0 | return iri_unescape_inline(src, IRI_CTYPE_GENDELIM); |
376 | 0 | } |
377 | | |
378 | | /** |
379 | | * \param[in] iri An IRI |
380 | | * |
381 | | * Free the heap-allocated content of the provided IRI, but leave the rest |
382 | | * of the fields. |
383 | | * |
384 | | * This function frees the following fields of \ref wget_iri_st "wget_iri": |
385 | | * |
386 | | * - `host` |
387 | | * - `path` |
388 | | * - `query` |
389 | | * - `fragment` |
390 | | * - `connection_part` |
391 | | */ |
392 | | void wget_iri_free_content(wget_iri *iri) |
393 | 43.5k | { |
394 | 43.5k | if (iri) { |
395 | 43.5k | if (iri->userinfo) |
396 | 332 | xfree(iri->safe_uri); |
397 | 43.2k | else |
398 | 43.2k | iri->safe_uri = NULL; |
399 | 43.5k | if (iri->uri_allocated) |
400 | 7.85k | xfree(iri->uri); |
401 | 43.5k | if (iri->host_allocated) |
402 | 20.1k | xfree(iri->host); |
403 | 43.5k | if (iri->path_allocated) |
404 | 5.79k | xfree(iri->path); |
405 | 43.5k | if (iri->query_allocated) |
406 | 1.04k | xfree(iri->query); |
407 | 43.5k | if (iri->fragment_allocated) |
408 | 112 | xfree(iri->fragment); |
409 | 43.5k | xfree(iri->connection_part); |
410 | 43.5k | } |
411 | 43.5k | } |
412 | | |
413 | | /** |
414 | | * \param[in] iri A pointer to a pointer to an IRI (a \ref wget_iri_st "wget_iri") |
415 | | * |
416 | | * Destroy a \ref wget_iri_st "wget_iri" structure. |
417 | | * |
418 | | * The provided pointer is set to NULL. |
419 | | */ |
420 | | void wget_iri_free(wget_iri **iri) |
421 | 49.8k | { |
422 | 49.8k | if (iri && *iri) { |
423 | 43.5k | wget_iri_free_content(*iri); |
424 | 43.5k | xfree(*iri); |
425 | 43.5k | } |
426 | 49.8k | } |
427 | | |
428 | | // URIs are assumed to be unescaped at this point |
429 | | |
/**
 * \param[in] url A URL/IRI
 * \param[in] encoding Original encoding of \p url
 * \return A libwget IRI (`wget_iri`), or NULL on error
 *
 * Parse \p url into its components (scheme, userinfo, password, host, port, path, query, fragment).
 *
 * The host, path, query and fragment parts will be converted to UTF-8 from
 * the encoding given in the parameter \p encoding. GNU libiconv is used
 * to perform the conversion, so this value should be the name of a valid character set
 * supported by that library, such as "utf-8" or "iso-8859-1".
 *
 * Leading whitespace of \p url is skipped. If \p url does not start with a scheme,
 * `http://` is prepended and the HTTP scheme with its default port is assumed.
 * A leading `name:` that is directly followed by a digit is not taken as a scheme
 * (so `host:8080` is handled as host and port).
 *
 * NULL is returned if
 *  - \p url is NULL, empty or consists of whitespace only
 *  - the memory allocation fails
 *  - the scheme is not one of the supported schemes
 *  - a numeric port is given that is not in the range 1..65535
 *  - the host part is missing
 *
 * The returned IRI has to be released with wget_iri_free().
 */
wget_iri *wget_iri_parse(const char *url, const char *encoding)
{
	wget_iri *iri;
	char *p, *s, *authority, c;
	size_t slen, extra;
	int have_scheme;

	if (!url)
		return NULL;

	/*
		URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
		hier-part   = "//" authority path-abempty / path-absolute / path-rootless / path-empty
		scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
	 */
	while (c_isspace(*url)) url++;
	if (!*url) return NULL;
/*
	// first unescape, then convert to UTF-8
	if (strchr(url, '%')) {
		char *unesc_url = wget_strdup(url);

		wget_percent_unescape(unesc_url);

		if (wget_str_needs_encoding(unesc_url)) {
			if ((url = wget_str_to_utf8(unesc_url, encoding)))
				xfree(unesc_url);
			else
				url = unesc_url; // on error, use what we have
		} else
			url = unesc_url;

		url_allocated = 1;
	} else {
		url_allocated = 0;

		if (wget_str_needs_encoding(url)) {
			if ((s = wget_str_to_utf8(url, encoding))) {
				url = s;
				url_allocated = 1;
			}
		}
	}
*/

	// scheme detection: a run of scheme characters starting with a letter, followed by ':'
	if (c_isalpha(*url)) {
		const char *x;
		have_scheme = 1;

		for (x = url; *x && iri_isscheme(*x); x++)
			;

		// a digit right after the ':' means "host:port", e.g. example.com:8080
		if (*x != ':' || c_isdigit(x[1]))
			have_scheme = 0; // not a scheme
	} else
		have_scheme = 0;

	// just use one block of memory for all parsed URI parts
	// layout: [wget_iri struct][copy of the URI (msize bytes)][working copy that is split into the parts (msize bytes)]
	slen = strlen(url);
	extra = have_scheme ? 0 : sizeof("http://") - 1; // extra space for http://

	iri = wget_malloc(sizeof(wget_iri) + (slen + extra + 1) * 2);
	if (!iri)
		return NULL;

	// only the struct is zeroed, the string area gets filled below
	memset(iri, 0, sizeof(wget_iri));

	if (have_scheme) {
		iri->msize = slen + 1;
		iri->uri = memcpy(iri + 1, url, iri->msize);
		p = s = memcpy((char *)iri->uri + iri->msize, url, iri->msize);
		s = strchr(s, ':'); // we know there is a :
		*s++ = 0; // terminate the scheme, s now points behind the ':'

		// p points to scheme
		wget_iri_unescape_inline(p); // percent unescape
		wget_strtolower(p); // convert to lowercase

		bool found = false; // assume the scheme is unsupported

		// find the scheme in our static list of supported schemes
		// for later comparisons we compare pointers (avoiding strcasecmp())
		for (unsigned it = 0; it < countof(schemes); it++) {
			if (!strcmp(schemes[it].name, p)) {
				iri->scheme = it;
				iri->port = schemes[it].port; // default port, may be overridden by an explicit port below
				found = true;
				break;
			}
		}

		if (!found) {
			debug_printf("Unsupported scheme in '%s'\n", url);
			wget_iri_free(&iri);
			return NULL;
		}
	} else {
		// add http:// scheme to url
		iri->uri = memcpy(iri + 1, "http://", extra);
		memcpy((char *)iri->uri + extra, url, slen + 1);
		iri->msize = extra + slen + 1;
		s = memcpy((char *)iri->uri + iri->msize, iri->uri, iri->msize);
		s[extra - 3] = 0; // overwrite the ':' of "http://" to terminate the scheme in the working copy
		s += extra; // continue parsing behind "http://"

		iri->scheme = WGET_IRI_SCHEME_HTTP;
		iri->port = schemes[WGET_IRI_SCHEME_HTTP].port;
	}

//	if (url_allocated)
//		xfree(url);

	// this is true for http, https, ftp, file (accept any number of /, like most browsers)
	while (*s == '/')
		s++;

	// authority
	// it ends at the first '/', '?' or '#'; that delimiter is remembered in 'c' and overwritten with 0
	authority = s;
	while (*s && *s != '/' && *s != '?' && *s != '#')
		s++;
	c = *s;
	if (c) *s++ = 0;
	wget_iri_unescape_inline(authority);

	// left over: [path][?query][#fragment]
	if (c == '/') {
		// the path is stored without its leading '/'
		iri->path = s;
		while (*s && *s != '?' && *s != '#')
			s++;
		c = *s;
		if (c) *s++ = 0;
		wget_iri_unescape_inline((char *)iri->path);
		normalize_path((char *)iri->path);
	}

	if (c == '?') {
		iri->query = s;
		while (*s && *s != '#') {
			// a '+' in the query is replaced by a space
			if (*s == '+')
				*s = ' ';
			s++;
		}
		c = *s;
		if (c) *s++ = 0;
		/* do not unescape query else we get ambiguity for chars like &, =, +, ... */
	}

	if (c == '#') {
		// the fragment is everything up to the end of the string
		iri->fragment = s;
		s += strlen(s);
		wget_iri_unescape_inline((char *)iri->fragment);
	}

	if (*s) {
		debug_printf("unparsed rest '%s'\n", s);
	}

	// split the authority into [userinfo[:password]@]host[:port]
	if (*authority) {
		s = authority;
		p = strchr(authority, '@');
		if (p) {
			// everything before the first '@' is userinfo, optionally followed by ":password"
			iri->userinfo = s;
			*p = 0;
			if ((s = strchr(s, ':'))) {
				*s = 0;
				iri->password = s + 1;
			}
			s = p + 1;
		}
		if (*s == '[') {
			// bracketed host (IPv6 literal): the host is the text between '[' and the last ']'
			p = strrchr(s, ']');
			if (p) {
				iri->host = s + 1;
				*p = 0;
				s = p + 1;
			} else {
				// something is broken
				iri->host = s + 1;
				s += strlen(s);
			}
		} else {
			iri->host = s;
			while (*s && *s != ':')
				s++;
		}
		if (*s == ':') {
			// a port is only taken into account if a digit follows the ':'
			if (c_isdigit(s[1])) {
				unsigned long port = strtoul(s + 1, NULL, 10);
				if (port == 0 || port > 65535) {
					error_printf(_("Port number must be in the range 1..65535\n"));
					wget_iri_free(&iri);
					return NULL;
				}
				iri->port = (uint16_t) port;
				iri->port_given = true;
			}
		}
		*s = 0; // terminate the host (cuts off ":port", if any)
	}

	// now unescape all components (not interested in display, userinfo, password right now)

	if (iri->host) {
		wget_strtolower((char *)iri->host);
		if (wget_str_needs_encoding(iri->host)) {
			// on conversion failure the host is kept as it is
			if ((s = wget_str_to_utf8(iri->host, encoding))) {
				iri->host = s;
				iri->host_allocated = true;
			}
		}
		// wget_str_to_ascii() returns a different pointer if it created a new string;
		// in that case the new string replaces the host (a previously allocated host is freed first)
		if ((p = (char *)wget_str_to_ascii(iri->host)) != iri->host) {
			if (iri->host_allocated)
				xfree(iri->host);
			iri->host = p;
			iri->host_allocated = true;
		}

		// Finally, if the host is a literal IPv4 or IPv6 address, mark it as so
		if (wget_ip_is_family(iri->host, WGET_NET_FAMILY_IPV4) || wget_ip_is_family(iri->host, WGET_NET_FAMILY_IPV6))
			iri->is_ip_address = true;
	}

	// host is only NULL here if the authority was empty
	if (!iri->host) {
		error_printf(_("Missing host/domain in URI '%s'\n"), iri->uri);
		wget_iri_free(&iri);
		return NULL;
	}

	// convert path, query and fragment to UTF-8 where needed;
	// on conversion failure the unconverted string (inside the IRI's memory block) is kept
	if (iri->path && wget_str_needs_encoding(iri->path)) {
		if ((s = wget_str_to_utf8(iri->path, encoding))) {
			iri->path = s;
			iri->path_allocated = true;
		}
	}

	if (iri->query && wget_str_needs_encoding(iri->query)) {
		if ((s = wget_str_to_utf8(iri->query, encoding))) {
			iri->query = s;
			iri->query_allocated = true;
		}
	}

	if (iri->fragment && wget_str_needs_encoding(iri->fragment)) {
		if ((s = wget_str_to_utf8(iri->fragment, encoding))) {
			iri->fragment = s;
			iri->fragment_allocated = true;
		}
	}

	// with userinfo, safe_uri is built separately by create_safe_uri(),
	// without userinfo it is just an alias of uri
	if (iri->userinfo) {
		iri->safe_uri = create_safe_uri(iri);
	} else {
		iri->safe_uri = iri->uri;
	}

/*
	debug_printf("scheme=%s\n",iri->scheme);
	debug_printf("host=%s\n",iri->host);
	debug_printf("path=%s\n",iri->path);
	debug_printf("query=%s\n",iri->query);
	debug_printf("fragment=%s\n",iri->fragment);
*/
	return iri;
}
704 | | |
705 | | /** |
706 | | * \param[in] iri An IRI |
707 | | * \return A new IRI, with the exact same contents as the provided one. |
708 | | * |
709 | | * Clone the provided IRI. |
710 | | */ |
711 | | wget_iri *wget_iri_clone(const wget_iri *iri) |
712 | 9.25k | { |
713 | 9.25k | if (!iri || !iri->uri) |
714 | 1.39k | return NULL; |
715 | | |
716 | 7.86k | size_t slen = strlen(iri->uri); |
717 | 7.86k | wget_iri *clone = wget_malloc(sizeof(wget_iri) + (slen + 1) + iri->msize); |
718 | | |
719 | 7.86k | if (!clone) |
720 | 0 | return NULL; |
721 | | |
722 | 7.86k | memcpy(clone, iri, sizeof(wget_iri)); |
723 | 7.86k | clone->uri = memcpy(clone + 1, iri->uri, (slen + 1) + iri->msize); |
724 | 7.86k | clone->uri_allocated = 0; |
725 | | |
726 | 7.86k | if (iri->userinfo) |
727 | 102 | clone->safe_uri = wget_strdup(iri->safe_uri); |
728 | 7.75k | else |
729 | 7.75k | clone->safe_uri = clone->uri; |
730 | | |
731 | 7.86k | clone->connection_part = wget_strdup(iri->connection_part); |
732 | | |
733 | | // adjust pointers |
734 | 7.86k | if (iri->host_allocated) |
735 | 6.46k | clone->host = wget_strdup(iri->host); |
736 | 1.39k | else |
737 | 1.39k | clone->host = iri->host ? (char *)clone + (size_t) (iri->host - (const char *)iri) : NULL; |
738 | | |
739 | 7.86k | clone->display = iri->display ? (char *)clone + (size_t) (iri->display - (const char *)iri): NULL; |
740 | | // not adjust scheme, it is a pointer to a static string |
741 | 7.86k | clone->userinfo = iri->userinfo ? (char *)clone + (size_t) (iri->userinfo - (const char *)iri): NULL; |
742 | 7.86k | clone->password = iri->password ? (char *)clone + (size_t) (iri->password - (const char *)iri): NULL; |
743 | | |
744 | 7.86k | if (iri->path_allocated) |
745 | 390 | clone->path = wget_strdup(iri->path); |
746 | 7.47k | else |
747 | 7.47k | clone->path = iri->path ? (char *)clone + (size_t) (iri->path - (const char *)iri): NULL; |
748 | | |
749 | 7.86k | if (iri->query_allocated) |
750 | 312 | clone->query = wget_strdup(iri->query); |
751 | 7.54k | else |
752 | 7.54k | clone->query = iri->query ? (char *)clone + (size_t) (iri->query - (const char *)iri): NULL; |
753 | | |
754 | 7.86k | if (iri->fragment_allocated) |
755 | 28 | clone->fragment = wget_strdup(iri->fragment); |
756 | 7.83k | else |
757 | 7.83k | clone->fragment = iri->fragment ? (char *)clone + (size_t) (iri->fragment - (const char *)iri): NULL; |
758 | | |
759 | 7.86k | return clone; |
760 | 7.86k | } |
761 | | |
762 | | /** |
763 | | * \param[in] iri An IRI |
764 | | * \param[in] buf A buffer, where the resulting string will be put |
765 | | * \return The contents of the buffer \p buf |
766 | | * |
767 | | * Append the connection part of the IRI \p iri to \p buf. |
768 | | * |
769 | | * The connection part is formed by the scheme, the hostname, and optionally the port. For example: |
770 | | * |
771 | | * https://localhost:8080 |
772 | | * https://www.example.com |
773 | | * |
774 | | * It may be of the form `https://example.com:8080` if the port was provided when creating the IRI |
775 | | * or of the form `https://example.com` otherwise. |
776 | | */ |
777 | | const char *wget_iri_get_connection_part(const wget_iri *iri, wget_buffer *buf) |
778 | 20.3k | { |
779 | 20.3k | if (iri) { |
780 | 20.3k | if (wget_ip_is_family(iri->host, WGET_NET_FAMILY_IPV6)) |
781 | 4 | wget_buffer_printf_append(buf, "%s://[%s]", schemes[iri->scheme].name, iri->host); |
782 | 20.3k | else |
783 | 20.3k | wget_buffer_printf_append(buf, "%s://%s", schemes[iri->scheme].name, iri->host); |
784 | | |
785 | 20.3k | if (iri->port_given) |
786 | 152 | wget_buffer_printf_append(buf, ":%hu", iri->port); |
787 | 20.3k | } |
788 | | |
789 | 20.3k | return buf->data; |
790 | 20.3k | } |
791 | | |
// normalize /../ and remove /./
//
// Normalizes `path` in place and returns the length of the resulting string.
//  - leading '/', './' and '../' sequences are dropped (also a trailing '.' or '..' at that position)
//  - '/./' is reduced to '/', '/../' removes the preceding path segment
//  - multiple consecutive slashes are reduced to a single slash
// The normalization stops at the first '?' or '#'; the rest of the string is kept unchanged.
// p1 is the write position and p2 the read position (p1 <= p2), so the string never grows.

static size_t WGET_GCC_NONNULL_ALL normalize_path(char *path)
{
	char *p1 = path, *p2 = path;

	debug_printf("path %s ->\n", path);

	// skip ./ and ../ at the beginning of the path
	for (;;) {
		if (*p2 == '/')
			p2++;
		else if (*p2 == '.') {
			if (p2[1] == '/')
				p2 += 2; // "./"
			else if (p2[1] == '.') {
				if (p2[2] == '/')
					p2 += 3; // "../"
				else if (!p2[2])
					p2 += 2; // ".." at the end of the string
				else
					break; // a name starting with "..", e.g. "..foo"
			}
			else if (!p2[1])
				p2++; // "." at the end of the string
			else
				break; // a name starting with ".", e.g. ".foo"
		} else
			break;
	}

	// normalize path but stop at query or fragment
	while (*p2 && *p2 != '?' && *p2 != '#') {
		if (*p2 == '/') {
			if (p2[1] == '.') {
				if (!strncmp(p2, "/../", 4)) {
					// go one level up
					// p2 stays on the trailing '/', p1 moves back to the previous '/' (or to the start)
					p2 += 3;
					while (p1 > path && *--p1 != '/');
				} else if (!strcmp(p2, "/..")) {
					// "/.." at the very end of the string: go one level up, keep a trailing '/' unless the result is empty
					p2 += 3;
					while (p1 > path && *--p1 != '/');
					if (p1 > path) *p1++='/';
				} else if (!strncmp(p2, "/./", 3)) {
					// skip "/.", the following '/' is handled by the next iteration
					p2 += 2;
				} else if (!strcmp(p2, "/.")) {
					// "/." at the very end of the string: keep a trailing '/' unless the result is empty
					p2 += 2;
					if (p1 > path) *p1++='/';
				} else
					*p1++ = *p2++; // a name starting with '.', copy as is
			} else if (p1 == path)
				p2++; // avoid leading slash
			else if (p2[1] == '/')
				p2++; // double slash to single slash
			else
				*p1++ = *p2++;
		} else
			*p1++ = *p2++;
	}

	if (p1 != p2) {
		// something has been removed: move the rest (query/fragment) down and terminate the string
		while (*p2)
			*p1++ = *p2++;

		*p1 = 0;
	} else {
		// nothing changed: just advance to the end of the string to calculate the length
		p1 += strlen(p1);
	}

	debug_printf(" %s\n", path);

	return p1 - path;
}
865 | | |
866 | | // create an absolute URI from a base + relative URI |
867 | | |
868 | | //char *iri_relative_to_absolute(IRI *iri, const char *tag, const char *val, size_t len, char *dst, size_t dst_size) |
869 | | /** |
870 | | * \param[in] base A base IRI |
871 | | * \param[in] val A path, or another URI |
872 | | * \param[in] len Length of the string \p val or -1 |
873 | | * \param[in] buf Destination buffer, where the result will be copied. |
874 | | * \return A new URI (string) which is based on the base IRI \p base provided, or NULL in case of error. |
875 | | * |
876 | | * Calculates a new URI which is based on the provided IRI \p base. |
877 | | * |
878 | | * Taking the IRI \p base as a starting point, a new URI is created with the path \p val, which may be |
879 | | * a relative or absolute path, or even a whole URI. The result is returned as a string, and if the buffer |
880 | | * \p buf is provided, it is also placed there. |
881 | | * |
882 | | * If \p val is an absolute path (it begins with a `/`), it is normalized first. Then the provided IRI's |
883 | | * path is replaced by that new path. If it's a relative path, the file name of the \p base IRI's path |
884 | | * is replaced by that path. Finally, if \p val begins with a scheme (such as `https://`) that string is returned |
885 | | * untouched, and placed in the buffer if provided. |
886 | | * |
887 | | * If \p base is NULL, then \p val must itself be an absolute URI. Likewise, if \p buf is NULL, |
888 | | * then \p val must also be an absolute URI. |
889 | | * |
890 | | * if \p len is `-1`, the length of \p val will be the result from `strlen(val)`. |
891 | | */ |
892 | | const char *wget_iri_relative_to_abs(const wget_iri *base, const char *val, size_t len, wget_buffer *buf) |
893 | 27.7k | { |
894 | 27.7k | if (len == (size_t) -1) |
895 | 18.5k | len = strlen(val); |
896 | | |
897 | 27.7k | if (*val == '/') { |
898 | 3.85k | if (base) { |
899 | 2.56k | char tmp[4096], *path = tmp; |
900 | | |
901 | 2.56k | if (len >= sizeof(tmp)) { |
902 | 26 | path = wget_malloc(len + 1); |
903 | 26 | if (!path) |
904 | 0 | return NULL; |
905 | 26 | } |
906 | | |
907 | | // strlcpy or snprintf are ineffective here since they do strlen(val), which might be large |
908 | 2.56k | wget_strscpy(path, val, len + 1); |
909 | | |
910 | 2.56k | if (len >= 2 && val[1] == '/') { |
911 | 1.01k | char *p; |
912 | | |
913 | | // absolute URI without scheme: //authority/path... |
914 | 1.01k | if ((p = strchr(path + 2, '/'))) |
915 | 1.00k | normalize_path(p + 1); |
916 | | |
917 | 1.01k | wget_buffer_strcpy(buf, schemes[base->scheme].name); |
918 | 1.01k | wget_buffer_strcat(buf, ":"); |
919 | 1.01k | wget_buffer_strcat(buf, path); |
920 | 1.55k | } else { |
921 | | // absolute path |
922 | 1.55k | normalize_path(path); |
923 | | |
924 | 1.55k | wget_buffer_reset(buf); |
925 | 1.55k | wget_iri_get_connection_part(base, buf); |
926 | 1.55k | wget_buffer_strcat(buf, "/"); |
927 | 1.55k | wget_buffer_strcat(buf, path); |
928 | 1.55k | } |
929 | | |
930 | 2.56k | if (path != tmp) |
931 | 26 | xfree(path); |
932 | 2.56k | } else { |
933 | 1.28k | return NULL; |
934 | 1.28k | } |
935 | 23.9k | } else { |
936 | | // see if URI begins with a scheme: |
937 | 23.9k | if (memchr(val, ':', len)) { |
938 | | // absolute URI |
939 | 7.43k | if (buf) { |
940 | 4.96k | wget_buffer_memcpy(buf, val, len); |
941 | 4.96k | } else { |
942 | 2.47k | return val; |
943 | 2.47k | } |
944 | 16.4k | } else if (base) { |
945 | | // relative path |
946 | 10.9k | const char *lastsep = base->path ? strrchr(base->path, '/') : NULL; |
947 | 10.9k | wget_buffer_reset(buf); |
948 | 10.9k | wget_iri_get_connection_part(base, buf); |
949 | 10.9k | wget_buffer_strcat(buf, "/"); |
950 | | |
951 | 10.9k | size_t tmp_len = buf->length; |
952 | | |
953 | 10.9k | if (lastsep) |
954 | 0 | wget_buffer_memcat(buf, base->path, lastsep - base->path + 1); |
955 | | |
956 | 10.9k | if (len) |
957 | 10.9k | wget_buffer_memcat(buf, val, len); |
958 | | |
959 | 10.9k | buf->length = normalize_path(buf->data + tmp_len) + tmp_len; |
960 | 10.9k | } else if (val[len] == 0) { |
961 | 5.50k | return val; |
962 | 5.50k | } else { |
963 | 0 | return NULL; |
964 | 0 | } |
965 | 23.9k | } |
966 | | |
967 | 18.5k | return likely(buf) ? buf->data : NULL; |
968 | 27.7k | } |
969 | | |
970 | | /** |
971 | | * \param[in] base The base IRI |
972 | | * \param[in] url A relative/absolute path (or a URI) to be appended to \p base |
973 | | * \param[in] encoding The encoding of \p url (e.g. "utf-8" or "iso-8859-1") |
974 | | * \return A new IRI |
975 | | * |
976 | | * Generate a new IRI by using the provided IRI \p base as a base and the path \p url. |
977 | | * |
978 | | * This is equivalent to: |
979 | | * |
980 | | * wget_iri *iri = wget_iri_parse(wget_iri_relative_to_abs(base, url, strlen(url), NULL), encoding); |
981 | | * return iri; |
982 | | * |
983 | | * As such, \p url can be a relative or absolute path, or another URI. |
984 | | * |
985 | | * If \p base is NULL, then the parameter \p url must itself be an absolute URI. |
986 | | */ |
987 | | wget_iri *wget_iri_parse_base(const wget_iri *base, const char *url, const char *encoding) |
988 | 18.5k | { |
989 | 18.5k | wget_iri *iri; |
990 | | |
991 | 18.5k | if (base) { |
992 | 9.25k | wget_buffer buf; |
993 | 9.25k | char sbuf[256]; |
994 | | |
995 | 9.25k | wget_buffer_init(&buf, sbuf, sizeof(sbuf)); |
996 | 9.25k | iri = wget_iri_parse(wget_iri_relative_to_abs(base, url, (size_t) -1, &buf), encoding); |
997 | 9.25k | wget_buffer_deinit(&buf); |
998 | 9.25k | } else { |
999 | | // no base: just check URL for being an absolute URI |
1000 | 9.25k | iri = wget_iri_parse(wget_iri_relative_to_abs(NULL, url, (size_t) -1, NULL), encoding); |
1001 | 9.25k | } |
1002 | | |
1003 | 18.5k | return iri; |
1004 | 18.5k | } |
1005 | | |
1006 | | // RFC conform comparison as described in https://tools.ietf.org/html/rfc2616#section-3.2.3 |
1007 | | /** |
1008 | | * \param[in] iri1 An IRI |
1009 | | * \param[in] iri2 Another IRI |
1010 | | * \return 0 if both IRIs are equal according to RFC 2616 or a non-zero value otherwise |
1011 | | * |
1012 | | * Compare two IRIs. |
1013 | | * |
1014 | | * Comparison is performed according to [RFC 2616, sect. 3.2.3](https://tools.ietf.org/html/rfc2616#section-3.2.3). |
1015 | | * |
1016 | | * This function uses wget_strcasecmp() to compare the various parts of the IRIs so a non-zero negative return value |
1017 | | * indicates that \p iri1 is less than \p iri2, whereas a positive value indicates \p iri1 is greater than \p iri2. |
1018 | | */ |
1019 | | int wget_iri_compare(const wget_iri *iri1, const wget_iri *iri2) |
1020 | 9.25k | { |
1021 | 9.25k | int n; |
1022 | | |
1023 | 9.25k | if (!iri1) { |
1024 | 1.39k | if (!iri2) |
1025 | 852 | return 0; |
1026 | 546 | else |
1027 | 546 | return -1; |
1028 | 7.86k | } else if (!iri2) |
1029 | 6 | return 1; |
1030 | | |
1031 | | // info_printf("iri %p %p %s:%s %s:%s\n",iri1,iri2,iri1->scheme,iri1->port,iri2->scheme,iri2->port); |
1032 | | |
1033 | | /* |
1034 | | if (!iri1->path) { |
1035 | | // if (iri2->path && strcmp(iri2->path, "/")) |
1036 | | if (iri2->path) |
1037 | | return -1; |
1038 | | } |
1039 | | else if (!iri2->path) { |
1040 | | // if (iri1->path && strcmp(iri1->path, "/")) |
1041 | | if (iri1->path) |
1042 | | return 1; |
1043 | | } |
1044 | | */ |
1045 | 7.85k | if ((n = wget_strcasecmp(iri1->path, iri2->path))) |
1046 | 5.44k | return n; |
1047 | | |
1048 | 2.40k | if ((n = wget_strcasecmp(iri1->query, iri2->query))) |
1049 | 234 | return n; |
1050 | | |
1051 | 2.17k | if (iri1->scheme != iri2->scheme) |
1052 | 0 | return iri1->scheme < iri2->scheme ? -1 : 1; |
1053 | | |
1054 | 2.17k | if ((n = iri1->port - iri2->port)) |
1055 | 66 | return n; |
1056 | | |
1057 | | // host is already lowercase, no need to call strcasecmp() |
1058 | 2.10k | if ((n = strcmp(iri1->host, iri2->host))) |
1059 | 463 | return n; |
1060 | | |
1061 | | // if ((n = wget_strcasecmp(iri1->fragment, iri2->fragment))) |
1062 | | // return n; |
1063 | | |
1064 | 1.64k | return 0; |
1065 | 2.10k | } |
1066 | | |
1067 | | /** |
1068 | | * \param[in] src A string, whose reserved characters are to be percent-encoded |
1069 | | * \param[in] buf A buffer where the result will be copied. |
1070 | | * \return The contents of the buffer \p buf after \p src has been encoded. |
1071 | | * |
1072 | | * Escapes (using percent-encoding) all the reserved characters in the string \p src. |
1073 | | * |
1074 | | * If \p src is NULL, the contents of the buffer \p buf are returned. \p buf cannot be NULL. |
1075 | | */ |
1076 | | const char *wget_iri_escape(const char *src, wget_buffer *buf) |
1077 | 17.1k | { |
1078 | 17.1k | const char *begin; |
1079 | | |
1080 | 17.1k | if (!src) |
1081 | 0 | return buf->data; |
1082 | | |
1083 | 1.34M | for (begin = src; *src; src++) { |
1084 | 1.32M | if (!iri_isunreserved(*src)) { |
1085 | 1.15M | if (begin != src) |
1086 | 21.7k | wget_buffer_memcat(buf, begin, src - begin); |
1087 | 1.15M | begin = src + 1; |
1088 | 1.15M | wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src); |
1089 | 1.15M | } |
1090 | 1.32M | } |
1091 | | |
1092 | 17.1k | if (begin != src) |
1093 | 6.92k | wget_buffer_memcat(buf, begin, src - begin); |
1094 | | |
1095 | 17.1k | return buf->data; |
1096 | 17.1k | } |
1097 | | |
1098 | | /** |
1099 | | * \param[in] src A string, whose reserved characters are to be percent-encoded |
1100 | | * \param[in] buf A buffer where the result will be copied. |
1101 | | * \return The contents of the buffer \p buf after \p src has been encoded |
1102 | | * as described in https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1. |
1103 | | * |
1104 | | * Escapes the path part of the URI suitable for GET/POST requests (origin-form). |
1105 | | * origin-form = absolute-path [ "?" query ] |
1106 | | * path-absolute = "/" [ segment-nz *( "/" segment ) ] |
1107 | | * segment-nz = 1*pchar |
1108 | | * segment = *pchar |
1109 | | * pchar = unreserved / pct-encoded / sub-delims / ":" / "@" |
1110 | | */ |
1111 | | const char *wget_iri_escape_path(const char *src, wget_buffer *buf) |
1112 | 10.3k | { |
1113 | 10.3k | const char *begin; |
1114 | | |
1115 | 927k | for (begin = src; *src; src++) { |
1116 | 916k | if (!(iri_isunreserved(*src) || iri_issubdelim(*src) || *src == '/' || *src == ':' || *src == '@')) { |
1117 | 804k | if (begin != src) |
1118 | 16.3k | wget_buffer_memcat(buf, begin, src - begin); |
1119 | 804k | begin = src + 1; |
1120 | 804k | wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src); |
1121 | 804k | } |
1122 | 916k | } |
1123 | | |
1124 | 10.3k | if (begin != src) |
1125 | 5.22k | wget_buffer_memcat(buf, begin, src - begin); |
1126 | | |
1127 | 10.3k | return buf->data; |
1128 | 10.3k | } |
1129 | | |
1130 | | /** |
1131 | | * \param[in] src A string, whose reserved characters are to be percent-encoded |
1132 | | * \param[in] buf A buffer where the result will be copied. |
1133 | | * \return The contents of the buffer \p buf after \p src has been encoded. |
1134 | | * |
1135 | | * Escapes (using percent-encoding) all the reserved characters in the string \p src |
1136 | | * (just like wget_iri_escape()), but **excluding the equal sign `=` and the ampersand `&`**. |
1137 | | * This function is thus ideally suited for query parts of URIs. |
1138 | | */ |
1139 | | const char *wget_iri_escape_query(const char *src, wget_buffer *buf) |
1140 | 9.25k | { |
1141 | 9.25k | const char *begin; |
1142 | | |
1143 | 681k | for (begin = src; *src; src++) { |
1144 | 672k | if (!iri_isunreserved(*src) && *src != '=' && *src != '&') { |
1145 | 585k | if (begin != src) |
1146 | 15.2k | wget_buffer_memcat(buf, begin, src - begin); |
1147 | 585k | begin = src + 1; |
1148 | 585k | if (*src == ' ') |
1149 | 518 | wget_buffer_memcat(buf, "+", 1); |
1150 | 584k | else |
1151 | 584k | wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src); |
1152 | 585k | } |
1153 | 672k | } |
1154 | | |
1155 | 9.25k | if (begin != src) |
1156 | 2.38k | wget_buffer_memcat(buf, begin, src - begin); |
1157 | | |
1158 | 9.25k | return buf->data; |
1159 | 9.25k | } |
1160 | | |
1161 | | /** |
1162 | | * \param[in] iri An IRI |
1163 | | * \param[in] buf A buffer, where the resulting string will be put |
1164 | | * \return The contents of the buffer \p buf |
1165 | | * |
1166 | | * Return the host part of the provided IRI. It is placed in the buffer \p buf |
1167 | | * and also returned as a `const char *`. |
1168 | | * |
1169 | | * The host is escaped using wget_iri_escape(). |
1170 | | */ |
1171 | | const char *wget_iri_get_escaped_host(const wget_iri *iri, wget_buffer *buf) |
1172 | 7.86k | { |
1173 | 7.86k | return wget_iri_escape(iri->host, buf); |
1174 | 7.86k | } |
1175 | | |
1176 | | /** |
1177 | | * \param[in] iri An IRI |
1178 | | * \param[in] buf A buffer, where the resulting string will be put |
1179 | | * \return The contents of the buffer \p buf |
1180 | | * |
1181 | | * Return the resource string, suitable for use in HTTP requests. |
1182 | | * Details: |
1183 | | * https://datatracker.ietf.org/doc/html/rfc7230#section-3.1.1 |
1184 | | * https://datatracker.ietf.org/doc/html/rfc7230#section-2.7 |
1185 | | * https://datatracker.ietf.org/doc/html/rfc3986#section-3.3 |
1186 | | * |
1187 | | * The resource string is comprised of the path, plus the query part, if present. Example: |
1188 | | * |
1189 | | * /foo/bar/?param_1=one¶m_2=two |
1190 | | * |
1191 | | * Both the path and the query are escaped using wget_iri_escape_path() and |
1192 | | * wget_iri_escape_query(), respectively. |
1193 | | * |
1194 | | * The resulting string is placed in the buffer \p buf and also returned as a `const char *`. |
1195 | | */ |
1196 | | const char *wget_iri_get_escaped_resource(const wget_iri *iri, wget_buffer *buf) |
1197 | 7.86k | { |
1198 | 7.86k | if (iri->path) |
1199 | 1.06k | wget_iri_escape_path(iri->path, buf); |
1200 | | |
1201 | | // Do not actually escape the query field. This part of the URL *MAY* |
1202 | | // contain reserved characters which should be passed on as-is and without |
1203 | | // escaping them. This is according to the rules laid out in RFC 2616 and |
1204 | | // RFC 7230. But we have to replace spaces in any case. |
1205 | 7.86k | if (iri->query) { |
1206 | 646 | wget_buffer_memcat(buf, "?", 1); |
1207 | 7.25k | for (const char *p = iri->query; *p; p++) |
1208 | 6.60k | wget_buffer_memcat(buf, *p == ' ' ? "+" : p, 1); |
1209 | 646 | } |
1210 | | |
1211 | 7.86k | return buf->data; |
1212 | 7.86k | } |
1213 | | |
1214 | | /** |
1215 | | * \param[in] iri An IRI |
1216 | | * \param[in] buf A buffer, where the resulting string will be put |
1217 | | * \param[in] encoding Character set the string should be converted to |
1218 | | * \return The contents of the buffer \p buf |
1219 | | * |
1220 | | * Get the path part of the provided IRI. |
1221 | | * |
1222 | | * The path is appended to \p buf. If \p buf is non-empty and does not end with |
1223 | | * a path separator (`/`), then one is added before the path is appended to \p |
1224 | | * buf. |
1225 | | * |
1226 | | * If \p encoding is provided, this function will try to convert the path (which is originally |
1227 | | * in UTF-8) to that encoding. |
1228 | | */ |
1229 | | |
1230 | | char *wget_iri_get_path(const wget_iri *iri, wget_buffer *buf, const char *encoding) |
1231 | 7.86k | { |
1232 | 7.86k | if (buf->length != 0 && buf->data[buf->length - 1] != '/') |
1233 | 7.74k | wget_buffer_memcat(buf, "/", 1); |
1234 | | |
1235 | 7.86k | if (iri->path) { |
1236 | 1.06k | if (wget_strcasecmp_ascii(encoding, "utf-8")) { |
1237 | 533 | char *fname; |
1238 | | |
1239 | 533 | if ((fname = wget_utf8_to_str(iri->path, encoding))) { |
1240 | 533 | wget_buffer_strcat(buf, fname); |
1241 | 533 | xfree(fname); |
1242 | 533 | } else { |
1243 | | // conversion failed, keep original string |
1244 | 0 | wget_buffer_strcat(buf, iri->path); |
1245 | 0 | } |
1246 | 533 | } else { |
1247 | 533 | wget_buffer_strcat(buf, iri->path); |
1248 | 533 | } |
1249 | 1.06k | } |
1250 | | |
1251 | 7.86k | if ((buf->length == 0 || buf->data[buf->length - 1] == '/') && default_page) |
1252 | 7.42k | wget_buffer_memcat(buf, default_page, default_page_length); |
1253 | | |
1254 | 7.86k | return buf->data; |
1255 | 7.86k | } |
1256 | | |
1257 | | /** |
1258 | | * \param[in] iri An IRI |
1259 | | * \param[in] buf A buffer, where the resulting string will be put |
1260 | | * \param[in] encoding Character set the string should be converted to |
1261 | | * \return The contents of the buffer \p buf |
1262 | | * |
1263 | | * Take the query part, and escape the path separators (`/`), so that it can be used as part |
1264 | | * of a filename. |
1265 | | * |
1266 | | * The resulting string will be placed in the buffer \p buf and also returned as a `const char *`. |
1267 | | * If the provided IRI has no query part, then the original contents of \p buf are returned and \p buf |
1268 | | * is kept untouched. |
1269 | | * |
1270 | | * If \p encoding is provided, this function will try to convert the query (which is originally |
1271 | | * in UTF-8) to that encoding. |
1272 | | */ |
1273 | | char *wget_iri_get_query_as_filename(const wget_iri *iri, wget_buffer *buf, const char *encoding) |
1274 | 15.7k | { |
1275 | 15.7k | if (iri->query) { |
1276 | 1.29k | const char *query; |
1277 | 1.29k | int allocated = 0; |
1278 | | |
1279 | 1.29k | wget_buffer_memcat(buf, "?", 1); |
1280 | | |
1281 | 1.29k | if (wget_strcasecmp_ascii(encoding, "utf-8")) { |
1282 | 646 | if ((query = wget_utf8_to_str(iri->query, encoding))) |
1283 | 646 | allocated = 1; |
1284 | 0 | else |
1285 | 0 | query = iri->query; |
1286 | 646 | } else { |
1287 | 646 | query = iri->query; |
1288 | 646 | } |
1289 | | |
1290 | 1.29k | int slashes = 0; |
1291 | 1.29k | const char *src = query; |
1292 | | |
1293 | | // count slashes in query string |
1294 | 3.15k | while ((src = strchr(src, '/'))) { |
1295 | 1.86k | slashes++; |
1296 | 1.86k | src++; |
1297 | 1.86k | } |
1298 | | |
1299 | 1.29k | if (slashes) { |
1300 | | // escape slashes to use query as part of a filename |
1301 | 636 | const char *begin; |
1302 | | |
1303 | 10.1k | for (src = begin = query; *src; src++) { |
1304 | 9.50k | if (*src == '/') { |
1305 | 1.86k | if (begin != src) |
1306 | 1.09k | wget_buffer_memcat(buf, begin, src - begin); |
1307 | 1.86k | begin = src + 1; |
1308 | 1.86k | wget_buffer_memcat(buf, "%2F", 3); |
1309 | 1.86k | } |
1310 | 9.50k | } |
1311 | | |
1312 | 636 | if (begin != src) |
1313 | 500 | wget_buffer_memcat(buf, begin, src - begin); |
1314 | 656 | } else { |
1315 | 656 | wget_buffer_strcat(buf, query); |
1316 | 656 | } |
1317 | | |
1318 | 1.29k | if (allocated) |
1319 | 646 | xfree(query); |
1320 | 1.29k | } |
1321 | | |
1322 | 15.7k | return buf->data; |
1323 | 15.7k | } |
1324 | | |
1325 | | /** |
1326 | | * \param[in] iri An IRI |
1327 | | * \param[in] buf A buffer, where the resulting string will be put |
1328 | | * \param[in] encoding Character set the string should be converted to |
1329 | | * \return The contents of the buffer \p buf |
1330 | | * |
1331 | | * Get the filename of the path of the provided IRI. |
1332 | | * |
1333 | | * This is similar to wget_iri_get_path(), but instead of returning the whole path |
1334 | | * it only returns the substring after the last occurrence of `/`. In other words, the |
1335 | | * filename of the path. |
1336 | | * |
1337 | | * This is also known as the "basename" in the UNIX world, and the output of this function |
1338 | | * would be equivalent to the output of the `basename(1)` tool. |
1339 | | * |
1340 | | * The path is copied into \p buf if it's empty. If the buffer \p buf is not empty, |
1341 | | * it is appended to it after a path separator (`/`). |
1342 | | * |
1343 | | * If \p encoding is provided, this function will try to convert the path (which is originally |
1344 | | * in UTF-8) to that encoding. |
1345 | | */ |
1346 | | char *wget_iri_get_basename(const wget_iri *iri, wget_buffer *buf, const char *encoding, int flags) |
1347 | 7.86k | { |
1348 | 7.86k | if (iri->path) { |
1349 | 1.06k | char *fname; |
1350 | | |
1351 | 1.06k | if (wget_strcasecmp_ascii(encoding, "utf-8")) { |
1352 | 533 | char *p; |
1353 | | |
1354 | 533 | if ((p = strrchr(iri->path, '/'))) { |
1355 | 111 | if (!(fname = wget_utf8_to_str(p + 1, encoding))) |
1356 | 0 | wget_buffer_strcat(buf, p + 1); // conversion failed, keep original string |
1357 | 422 | } else { |
1358 | 422 | if (!(fname = wget_utf8_to_str(iri->path, encoding))) |
1359 | 0 | wget_buffer_strcat(buf, iri->path); // conversion failed, keep original string |
1360 | 422 | } |
1361 | | |
1362 | 533 | if (fname) { |
1363 | | // conversion succeeded |
1364 | 533 | wget_buffer_strcat(buf, fname); |
1365 | 533 | xfree(fname); |
1366 | 533 | } |
1367 | 533 | } else { |
1368 | 533 | if ((fname = strrchr(iri->path, '/'))) |
1369 | 111 | wget_buffer_strcat(buf, fname + 1); |
1370 | 422 | else |
1371 | 422 | wget_buffer_strcat(buf, iri->path); |
1372 | 533 | } |
1373 | 1.06k | } |
1374 | | |
1375 | 7.86k | if ((buf->length == 0 || buf->data[buf->length - 1] == '/') && default_page) |
1376 | 0 | wget_buffer_memcat(buf, default_page, default_page_length); |
1377 | | |
1378 | 7.86k | if (flags & WGET_IRI_WITH_QUERY) |
1379 | 7.86k | return wget_iri_get_query_as_filename(iri, buf, encoding); |
1380 | | |
1381 | 0 | return buf->data; |
1382 | 7.86k | } |
1383 | | |
1384 | | // escaping: see https://tools.ietf.org/html/rfc2396#2 following (especially 2.4.2) |
1385 | | /*const char *iri_escape(const char *uri) |
1386 | | { |
1387 | | int esc = 0; |
1388 | | const char *p; |
1389 | | |
1390 | | for (p = uri; *p; p++) { |
1391 | | if (*p == '%') { |
1392 | | if ((isxdigit(p[1]) && isxdigit(p[2])) || p[1] == '%') |
1393 | | return uri; // assume that URI is already escaped |
1394 | | esc++; |
1395 | | } else if () |
1396 | | } |
1397 | | } |
1398 | | */ |
1399 | | |
1400 | | void wget_iri_set_defaultpage(const char *page) |
1401 | 4.62k | { |
1402 | 4.62k | default_page = page; |
1403 | 4.62k | default_page_length = default_page ? strlen(default_page) : 0; |
1404 | 4.62k | } |
1405 | | |
1406 | | /** |
1407 | | * \param scheme The scheme for the new default port |
1408 | | * \param port The new default port value for the given scheme |
1409 | | * \return 0: success -1: Unknown scheme |
1410 | | * |
1411 | | * Set the default \p port for the given \p scheme. |
1412 | | */ |
1413 | | int wget_iri_set_defaultport(wget_iri_scheme scheme, uint16_t port) |
1414 | 0 | { |
1415 | 0 | if ((unsigned) scheme < countof(schemes)) { |
1416 | 0 | schemes[scheme].port = port; |
1417 | 0 | return 0; |
1418 | 0 | } |
1419 | | |
1420 | 0 | return -1; |
1421 | 0 | } |
1422 | | |
1423 | | /** |
1424 | | * \param[in] iri An IRI |
1425 | | * \param[in] scheme A scheme, such as `http` or `https`. |
1426 | | * \return The original scheme of IRI (ie. before the replacement) |
1427 | | * |
1428 | | * Set the scheme of the provided IRI. The IRI's original scheme |
1429 | | * is replaced by the new one. |
1430 | | * |
1431 | | * If the IRI was using a default port (such as 80 for HTTP or 443 for HTTPS) |
1432 | | * that port is modified as well to match the default port of the new scheme. |
1433 | | * Otherwise the port is left untouched. |
1434 | | */ |
1435 | | wget_iri_scheme wget_iri_set_scheme(wget_iri *iri, wget_iri_scheme scheme) |
1436 | 7.86k | { |
1437 | 7.86k | wget_iri_scheme old_scheme = iri->scheme; |
1438 | | |
1439 | 7.86k | if ((unsigned) scheme < countof(schemes) && iri->scheme != scheme) { |
1440 | 7.85k | iri->scheme = scheme; |
1441 | | |
1442 | | // If the IRI is using the default port, also change it |
1443 | 7.85k | if (iri->port == schemes[old_scheme].port) |
1444 | 7.70k | iri->port = schemes[scheme].port; |
1445 | | |
1446 | 7.85k | size_t old_scheme_len = strlen(schemes[old_scheme].name); |
1447 | | |
1448 | 7.85k | if (strncmp(iri->uri, schemes[old_scheme].name, old_scheme_len) == 0 && iri->uri[old_scheme_len] == ':') { |
1449 | 7.85k | char *new_uri = wget_aprintf("%s%s", schemes[iri->scheme].name, iri->uri + old_scheme_len); |
1450 | 7.85k | if (iri->uri_allocated) |
1451 | 0 | xfree(iri->uri); |
1452 | 7.85k | iri->uri = new_uri; |
1453 | 7.85k | iri->uri_allocated = true; |
1454 | 7.85k | } |
1455 | 7.85k | } |
1456 | | |
1457 | 7.86k | if (iri->userinfo) { |
1458 | 102 | xfree(iri->safe_uri); |
1459 | 102 | iri->safe_uri = create_safe_uri(iri); |
1460 | 7.75k | } else { |
1461 | 7.75k | iri->safe_uri = iri->uri; |
1462 | 7.75k | } |
1463 | 7.86k | return old_scheme; |
1464 | 7.86k | } |
1465 | | |
1466 | | static char *create_safe_uri(wget_iri *iri) |
1467 | 324 | { |
1468 | 324 | if (!iri || !iri->uri) |
1469 | 0 | return NULL; |
1470 | | |
1471 | 324 | wget_buffer *buf = wget_buffer_alloc(strlen(iri->uri)); |
1472 | 324 | if (!buf) |
1473 | 0 | return NULL; |
1474 | | |
1475 | 324 | wget_buffer_printf(buf, "%s://%s", schemes[iri->scheme].name, iri->host); |
1476 | | |
1477 | 324 | if (iri->path) { |
1478 | 154 | wget_buffer_strcat(buf, "/"); |
1479 | 154 | wget_buffer_strcat(buf, iri->path); |
1480 | 154 | } |
1481 | 324 | if (iri->query) { |
1482 | 158 | wget_buffer_strcat(buf, "?"); |
1483 | 158 | wget_buffer_strcat(buf, iri->query); |
1484 | 158 | } |
1485 | 324 | if (iri->fragment) { |
1486 | 38 | wget_buffer_strcat(buf, "#"); |
1487 | 38 | wget_buffer_strcat(buf, iri->fragment); |
1488 | 38 | } |
1489 | | |
1490 | 324 | char *safe_uri = buf->data; |
1491 | 324 | buf->data = NULL; |
1492 | 324 | wget_buffer_free(&buf); |
1493 | | |
1494 | 324 | return safe_uri; |
1495 | 324 | } |
1496 | | |
1497 | | /** @} */ |