Coverage Report

Created: 2025-01-28 06:58

/src/wget2/libwget/iri.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2012 Tim Ruehsen
3
 * Copyright (c) 2015-2024 Free Software Foundation, Inc.
4
 *
5
 * This file is part of libwget.
6
 *
7
 * Libwget is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as published by
9
 * the Free Software Foundation, either version 3 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * Libwget is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
19
 *
20
 *
21
 * URI/IRI routines
22
 * about encoding see http://nikitathespider.com/articles/EncodingDivination.html
23
 * about GET encoding see https://stackoverflow.com/questions/1549213/whats-the-correct-encoding-of-http-get-request-strings
24
 * RFC 3986: URI generic syntax
25
 *
26
 *
27
 * Changelog
28
 * 25.04.2012  Tim Ruehsen  created
29
 *
30
 */
31
32
#include <config.h>
33
34
#include <string.h>
35
#include <errno.h>
36
#include "c-ctype.h"
37
38
#include <wget.h>
39
#include "private.h"
40
41
static char *create_safe_uri(wget_iri *iri);
42
43
/**
44
 * \file
45
 * \brief Functions to work with URIs and IRIs
46
 * \defgroup libwget-iri URIs/IRIs
47
 *
48
 * @{
49
 *
50
 * URI/IRI parsing and manipulation functions.
51
 *
52
 * IRIs are processed according to [RFC 3987](https://datatracker.ietf.org/doc/rfc3987/).
53
 * Functions that escape certain characters (such as wget_iri_escape()) work according to
54
 * [RFC 3986](https://datatracker.ietf.org/doc/rfc3986/).
55
 *
56
 * The \ref wget_iri_st "wget_iri" structure represents an IRI. You generate one from a string with wget_iri_parse() or
57
 * wget_iri_parse_base(). You can use wget_iri_clone() to generate another identical \ref wget_iri_st "wget_iri".
58
 *
59
 * You can access each of the fields of a \ref wget_iri_st "wget_iri" (such as `path`) independently, and you can use
60
 * the getters here to escape each of those parts, or for convenience (e.g wget_iri_get_escaped_host(),
61
 * wget_iri_get_escaped_resource(), etc.).
62
 *
63
 * URIs/IRIs are all internally treated in UTF-8. The parsing functions that generate a \ref wget_iri_st "wget_iri" structure
64
 * (wget_iri_parse() and wget_iri_parse_base()) thus convert the input string to UTF-8 before anything else.
65
 * These functions take an `encoding` parameter that tells which is the original encoding of that string.
66
 *
67
 * Conversely, the getters (for example, wget_iri_get_path()) can convert the output string from UTF-8
68
 * to an encoding of choice. The desired encoding is also specified in the `encoding` parameter.
69
 *
70
 * The `encoding` parameter, in all functions that accept it, is a string with the name of a character set
71
 * supported by GNU libiconv. You can find such a list elsewhere, but popular examples are "utf-8", "utf-16" or "iso-8859-1".
72
 */
73
74
// Name of the default page and its length in bytes (10 == strlen("index.html")).
static const char *default_page = "index.html";
static size_t default_page_length = 10;
78
79
static struct iri_scheme {
80
  uint16_t port;
81
  const char name[6];
82
} schemes[] = {
83
  [WGET_IRI_SCHEME_HTTP]  = {  80, "http"  },
84
  [WGET_IRI_SCHEME_HTTPS] = { 443, "https" },
85
};
86
87
static size_t WGET_GCC_NONNULL_ALL normalize_path(char *path);
88
89
/**
90
 * \param[in] scheme Scheme to get name for
91
 * \return Name of \p scheme (e.g. "http" or "https") or NULL if \p scheme is not supported
92
 *
93
 * Maps \p scheme to its string representation.
94
 */
95
const char *wget_iri_scheme_get_name(wget_iri_scheme scheme)
{
  // the unsigned conversion makes negative values fail the bounds check as well
  const unsigned idx = (unsigned) scheme;

  return idx < countof(schemes) ? schemes[idx].name : NULL;
}
102
103
/**
104
 * \param[in] iri An IRI
105
 * \return 1 if the scheme is supported, 0 if not
106
 *
107
 * Tells whether the IRI's scheme is supported or not.
108
 */
109
bool wget_iri_supported(const wget_iri *iri)
110
7.86k
{
111
7.86k
  return (unsigned) iri->scheme < countof(schemes);
112
7.86k
}
113
114
115
/* \cond _hide_internal_symbols */
116
4.62k
// Character class flags stored in iri_ctype[]
#define IRI_CTYPE_GENDELIM   (1 << 0)
#define IRI_CTYPE_SUBDELIM   (1 << 1)
#define IRI_CTYPE_UNRESERVED (1 << 2)

// Class tests; each yields a non-zero value if c belongs to the class, else 0.
// The cast to unsigned char keeps the table index in 0..255 for negative char values.
#define iri_isgendelim(c)   (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_GENDELIM)
#define iri_issubdelim(c)   (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_SUBDELIM)
#define iri_isunreserved(c) (iri_ctype[(unsigned char)(c)] & IRI_CTYPE_UNRESERVED)

// Characters allowed in a scheme name. The parameter is parenthesized so that
// any expression can be passed safely; note that it is evaluated several times.
#define iri_isscheme(c) (c_isalnum(c) || (c) == '+' || (c) == '-' || (c) == '.')
/* \endcond */

// Lookup table mapping each byte value to its IRI_CTYPE_* flag (0 if it has none).
static const unsigned char
  iri_ctype[256] = {
    // generic delimiters
    [':'] = IRI_CTYPE_GENDELIM, ['/'] = IRI_CTYPE_GENDELIM, ['?'] = IRI_CTYPE_GENDELIM,
    ['#'] = IRI_CTYPE_GENDELIM, ['['] = IRI_CTYPE_GENDELIM, [']'] = IRI_CTYPE_GENDELIM,
    ['@'] = IRI_CTYPE_GENDELIM,

    // subcomponent delimiters
    ['!'] = IRI_CTYPE_SUBDELIM, ['$'] = IRI_CTYPE_SUBDELIM, ['&'] = IRI_CTYPE_SUBDELIM,
    ['\''] = IRI_CTYPE_SUBDELIM, ['('] = IRI_CTYPE_SUBDELIM, [')'] = IRI_CTYPE_SUBDELIM,
    ['*'] = IRI_CTYPE_SUBDELIM, ['+'] = IRI_CTYPE_SUBDELIM, [','] = IRI_CTYPE_SUBDELIM,
    [';'] = IRI_CTYPE_SUBDELIM, ['='] = IRI_CTYPE_SUBDELIM,

    // unreserved: digits
    ['0'] = IRI_CTYPE_UNRESERVED, ['1'] = IRI_CTYPE_UNRESERVED, ['2'] = IRI_CTYPE_UNRESERVED,
    ['3'] = IRI_CTYPE_UNRESERVED, ['4'] = IRI_CTYPE_UNRESERVED, ['5'] = IRI_CTYPE_UNRESERVED,
    ['6'] = IRI_CTYPE_UNRESERVED, ['7'] = IRI_CTYPE_UNRESERVED, ['8'] = IRI_CTYPE_UNRESERVED,
    ['9'] = IRI_CTYPE_UNRESERVED,

    // unreserved: lowercase letters
    ['a'] = IRI_CTYPE_UNRESERVED, ['b'] = IRI_CTYPE_UNRESERVED, ['c'] = IRI_CTYPE_UNRESERVED,
    ['d'] = IRI_CTYPE_UNRESERVED, ['e'] = IRI_CTYPE_UNRESERVED, ['f'] = IRI_CTYPE_UNRESERVED,
    ['g'] = IRI_CTYPE_UNRESERVED, ['h'] = IRI_CTYPE_UNRESERVED, ['i'] = IRI_CTYPE_UNRESERVED,
    ['j'] = IRI_CTYPE_UNRESERVED, ['k'] = IRI_CTYPE_UNRESERVED, ['l'] = IRI_CTYPE_UNRESERVED,
    ['m'] = IRI_CTYPE_UNRESERVED, ['n'] = IRI_CTYPE_UNRESERVED, ['o'] = IRI_CTYPE_UNRESERVED,
    ['p'] = IRI_CTYPE_UNRESERVED, ['q'] = IRI_CTYPE_UNRESERVED, ['r'] = IRI_CTYPE_UNRESERVED,
    ['s'] = IRI_CTYPE_UNRESERVED, ['t'] = IRI_CTYPE_UNRESERVED, ['u'] = IRI_CTYPE_UNRESERVED,
    ['v'] = IRI_CTYPE_UNRESERVED, ['w'] = IRI_CTYPE_UNRESERVED, ['x'] = IRI_CTYPE_UNRESERVED,
    ['y'] = IRI_CTYPE_UNRESERVED, ['z'] = IRI_CTYPE_UNRESERVED,

    // unreserved: uppercase letters
    ['A'] = IRI_CTYPE_UNRESERVED, ['B'] = IRI_CTYPE_UNRESERVED, ['C'] = IRI_CTYPE_UNRESERVED,
    ['D'] = IRI_CTYPE_UNRESERVED, ['E'] = IRI_CTYPE_UNRESERVED, ['F'] = IRI_CTYPE_UNRESERVED,
    ['G'] = IRI_CTYPE_UNRESERVED, ['H'] = IRI_CTYPE_UNRESERVED, ['I'] = IRI_CTYPE_UNRESERVED,
    ['J'] = IRI_CTYPE_UNRESERVED, ['K'] = IRI_CTYPE_UNRESERVED, ['L'] = IRI_CTYPE_UNRESERVED,
    ['M'] = IRI_CTYPE_UNRESERVED, ['N'] = IRI_CTYPE_UNRESERVED, ['O'] = IRI_CTYPE_UNRESERVED,
    ['P'] = IRI_CTYPE_UNRESERVED, ['Q'] = IRI_CTYPE_UNRESERVED, ['R'] = IRI_CTYPE_UNRESERVED,
    ['S'] = IRI_CTYPE_UNRESERVED, ['T'] = IRI_CTYPE_UNRESERVED, ['U'] = IRI_CTYPE_UNRESERVED,
    ['V'] = IRI_CTYPE_UNRESERVED, ['W'] = IRI_CTYPE_UNRESERVED, ['X'] = IRI_CTYPE_UNRESERVED,
    ['Y'] = IRI_CTYPE_UNRESERVED, ['Z'] = IRI_CTYPE_UNRESERVED,

    // unreserved: punctuation
    ['-'] = IRI_CTYPE_UNRESERVED, ['.'] = IRI_CTYPE_UNRESERVED, ['_'] = IRI_CTYPE_UNRESERVED,
    ['~'] = IRI_CTYPE_UNRESERVED
  };
217
218
/**
219
 * \param[in] c A character
220
 * \return 1 if \p c is a generic delimiter, 0 if not
221
 *
222
 * Tests whether \p c is a generic delimiter (gen-delim),
223
 * according to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2).
224
 */
225
bool wget_iri_isgendelim(char c)
226
4.62k
{
227
  // return strchr(":/?#[]@",c)!=NULL;
228
4.62k
  return iri_isgendelim(c);
229
4.62k
}
230
231
/**
232
 * \param[in] c A character
233
 * \return 1 if \p c is a subcomponent delimiter, 0 if not
234
 *
235
 * Tests whether \p c is a subcomponent delimiter (sub-delim)
236
 * according to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2).
237
 */
238
bool wget_iri_issubdelim(char c)
239
4.62k
{
240
  // return strchr("!$&\'()*+,;=",c)!=NULL;
241
4.62k
  return iri_issubdelim(c);
242
4.62k
}
243
244
/**
245
 * \param[in] c A character
246
 * \return 1 if \p c is a reserved character, 0 if not
247
 *
248
 * Tests whether \p c is a reserved character.
249
 *
250
 * According to [RFC 3986, sect. 2.2](https://tools.ietf.org/html/rfc3986#section-2.2),
251
 * the set of reserved characters is formed
252
 * by the generic delimiters (gen-delims, wget_iri_isgendelim()) and the
253
 * subcomponent delimiters (sub-delims, wget_iri_issubdelim()).
254
 *
255
 * This function is thus equivalent to:
256
 *
257
 *     return wget_iri_isgendelim(c) || wget_iri_issubdelim(c);
258
 *
259
 */
260
bool wget_iri_isreserved(char c)
{
  // reserved = gen-delims + sub-delims
  if (wget_iri_isgendelim(c))
    return true;

  return wget_iri_issubdelim(c);
}
264
265
/**
266
 * \param[in] c A character
267
 * \return 1 if \p c is an unreserved character, 0 if not
268
 *
269
 * Tests whether \p c is an unreserved character.
270
 */
271
bool wget_iri_isunreserved(char c)
272
0
{
273
0
  return iri_isunreserved(c);
274
0
}
275
276
// Convert a single hexadecimal digit character into its value (0..15).
// The result is only meaningful if c is one of 0-9, A-F or a-f; the callers
// in this file check that with c_isxdigit() first.
static unsigned char WGET_GCC_CONST unhex(unsigned char c)
{
  if (c <= '9')
    return c - '0';

  if (c <= 'F')
    return c - 'A' + 10;

  return c - 'a' + 10;
}
280
281
static char *iri_unescape_inline(char *src, int ctype)
282
61.5k
{
283
61.5k
  char *ret = NULL;
284
61.5k
  unsigned char *s = (unsigned char *)src; // just a helper to avoid casting a lot
285
61.5k
  unsigned char *d = s;
286
287
2.00M
  while (*s) {
288
1.93M
    if (*s == '%') {
289
3.08k
      if (c_isxdigit(s[1]) && c_isxdigit(s[2])) {
290
1.38k
        unsigned char c = (unsigned char) (unhex(s[1]) << 4) | unhex(s[2]);
291
1.38k
        if (!ctype || (!(iri_ctype[(unsigned char)(c)] & ctype) && c != '%')) {
292
1.38k
          *d++ = c;
293
1.38k
          s += 3;
294
1.38k
          ret = src;
295
1.38k
          continue;
296
1.38k
        }
297
1.38k
      }
298
1.93M
    } else if (*s == '#') {
299
11.4k
      uint32_t value = 0;
300
301
11.4k
      if (s[1] == 'x') {
302
3.34k
        unsigned char *p = s + 2;
303
14.0k
        while (c_isxdigit(*p)) {
304
10.6k
          value = ((value & 0x0FFFFFFF) << 4) | unhex(*p);
305
10.6k
          p++;
306
10.6k
        }
307
3.34k
        if (*p == ';') {
308
2.03k
          if (value > 0 && value < 128) {
309
510
            *d++ = (unsigned char) value;
310
510
            s = p + 1;
311
510
            continue;
312
510
          }
313
          // else: we have to convert the unicode value to whatever encoding the URL is in (likely UTF-8)
314
          // this cannot be done inline since the URL's length may increase
315
2.03k
        }
316
8.07k
      } else {
317
8.07k
        unsigned char *p = s + 1;
318
12.7k
        while (c_isdigit(*p) && value <= 0x10FFFF) { // max. Unicode value
319
4.71k
          value = value * 10 + (*p - '0');
320
4.71k
          p++;
321
4.71k
        }
322
8.07k
        if (*p == ';') {
323
1.83k
          if (value > 0 && value < 128) {
324
602
            *d++ = (unsigned char) value;
325
602
            s = p + 1;
326
602
            continue;
327
602
          }
328
          // else: we have to convert the unicode value to whatever encoding the URL is in (likely UTF-8)
329
          // this cannot be done inline since the URL's length may increase
330
1.83k
        }
331
8.07k
      }
332
1.92M
    } else if (*s == '\r' || *s == '\n') {
333
      // Ignore / remove CR and LF from URLs. See https://gitlab.com/gnuwget/wget2/-/issues/522
334
930
      s++;
335
930
      continue;
336
930
    }
337
338
1.93M
    *d++ = *s++;
339
1.93M
  }
340
61.5k
  *d = 0;
341
342
61.5k
  return ret;
343
61.5k
}
344
345
/**
346
 * \param[in] src A string
347
 * \return \p src if at least one percent-encoded sequence was decoded, otherwise NULL
348
 *
349
 * Unescape a string. All the percent-encoded characters (`%XX`) are converted
350
 * back to their original form.
351
 *
352
 * **The transformation is done inline**, so `src` will be modified after this function returns.
353
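 * If no percent-encoded characters are found, only the removal of CR/LF and the decoding of
 * numeric character references (`#NN;` / `#xHH;` with a value of 1..127) can still modify the string.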
354
 */
355
char *wget_iri_unescape_inline(char *src)
{
  // a zero mask means: decode every percent-encoded sequence
  const int keep_escaped_mask = 0;

  return iri_unescape_inline(src, keep_escaped_mask);
}
359
360
/**
361
 * \param[in] src A string
362
 * \return \p src if at least one percent-encoded sequence was decoded, otherwise NULL
363
 *
364
 * Unescape a string except escaped generic delimiters (and escaped '%').
365
 * The percent-encoded characters (`%XX`) are converted back to their original form.
366
 *
367
 * This variant of unescaping is helpful before an URL is being parsed, so that
368
 * the parser recognizes e.g. 'http%3A//' as relative URL (path) and not as a scheme.
369
 *
370
 * **The transformation is done inline**, so `src` will be modified after this function returns.
371
 * If no characters were unescaped, the string is left untouched.
372
 */
373
char *wget_iri_unescape_url_inline(char *src)
374
0
{
375
0
  return iri_unescape_inline(src, IRI_CTYPE_GENDELIM);
376
0
}
377
378
/**
379
 * \param[in] iri An IRI
380
 *
381
 * Free the heap-allocated content of the provided IRI, but leave the rest
382
 * of the fields.
383
 *
384
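 * This function frees `connection_part`, frees `safe_uri` if `userinfo` is set, and frees `uri` and
 * each of the following fields of \ref wget_iri_st "wget_iri" whose `*_allocated` flag is set: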
385
 *
386
 *  - `host`
387
 *  - `path`
388
 *  - `query`
389
 *  - `fragment`
390
 *  - `connection_part`
391
 */
392
void wget_iri_free_content(wget_iri *iri)
{
  if (!iri)
    return;

  // safe_uri is only freed when userinfo is set; otherwise the pointer is
  // just cleared (wget_iri_parse() lets it alias uri in that case)
  if (iri->userinfo)
    xfree(iri->safe_uri);
  else
    iri->safe_uri = NULL;

  // the remaining components are only freed if flagged as separately allocated
  if (iri->uri_allocated)
    xfree(iri->uri);

  if (iri->host_allocated)
    xfree(iri->host);

  if (iri->path_allocated)
    xfree(iri->path);

  if (iri->query_allocated)
    xfree(iri->query);

  if (iri->fragment_allocated)
    xfree(iri->fragment);

  xfree(iri->connection_part);
}
412
413
/**
414
 * \param[in] iri A pointer to a pointer to an IRI (a \ref wget_iri_st "wget_iri")
415
 *
416
 * Destroy a \ref wget_iri_st "wget_iri" structure.
417
 *
418
 * The provided pointer is set to NULL.
419
 */
420
void wget_iri_free(wget_iri **iri)
{
  // nothing to do for a NULL argument or an already released IRI
  if (!iri || !*iri)
    return;

  wget_iri_free_content(*iri);
  xfree(*iri);
}
427
428
// URIs are assumed to be unescaped at this point
429
430
/**
431
 * \param[in] url A URL/IRI
432
 * \param[in] encoding Original encoding of \p url
433
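 * \return A libwget IRI (`wget_iri`), or NULL if \p url is NULL or empty, uses an unsupported scheme,
 * has a port outside 1..65535, has no host, or if memory allocation fails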
434
 *
435
 * The host, path, query and fragment parts will be converted to UTF-8 from
436
 * the encoding given in the parameter \p encoding. GNU libiconv is used
437
 * to perform the conversion, so this value should be the name of a valid character set
438
 * supported by that library, such as "utf-8" or "iso-8859-1".
439
 */
440
wget_iri *wget_iri_parse(const char *url, const char *encoding)
441
37.0k
{
442
37.0k
  wget_iri *iri;
443
37.0k
  char *p, *s, *authority, c;
444
37.0k
  size_t slen, extra;
445
37.0k
  int have_scheme;
446
447
37.0k
  if (!url)
448
1.28k
    return NULL;
449
450
  /*
451
    URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
452
    hier-part   = "//" authority path-abempty / path-absolute / path-rootless / path-empty
453
    scheme      =  ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
454
   */
455
36.6k
  // skip leading whitespace; if nothing is left there is no URL to parse
  while (c_isspace(*url)) url++;
456
35.7k
  if (!*url) return NULL;
457
/*
458
  // first unescape, than convert to UTF-8
459
  if (strchr(url, '%')) {
460
    char *unesc_url = wget_strdup(url);
461
462
    wget_percent_unescape(unesc_url);
463
464
    if (wget_str_needs_encoding(unesc_url)) {
465
      if ((url = wget_str_to_utf8(unesc_url, encoding)))
466
        xfree(unesc_url);
467
      else
468
        url = unesc_url; // on error, use what we have
469
    } else
470
      url = unesc_url;
471
472
    url_allocated = 1;
473
  } else {
474
    url_allocated = 0;
475
476
    if (wget_str_needs_encoding(url)) {
477
      if ((s = wget_str_to_utf8(url, encoding))) {
478
        url = s;
479
        url_allocated = 1;
480
      }
481
    }
482
  }
483
*/
484
485
35.6k
  // a scheme is assumed if the URL starts with a letter, followed by scheme characters
  // and a ':' - unless a digit follows the ':' (then it is taken as host:port)
  if (c_isalpha(*url)) {
486
22.5k
    const char *x;
487
22.5k
    have_scheme = 1;
488
489
150k
    for (x = url; *x && iri_isscheme(*x); x++)
490
128k
      ;
491
492
22.5k
    if (*x != ':' || c_isdigit(x[1]))
493
4.83k
      have_scheme = 0; // not a scheme
494
22.5k
  } else
495
13.1k
    have_scheme = 0;
496
497
  // just use one block of memory for all parsed URI parts
498
35.6k
  slen = strlen(url);
499
35.6k
  extra = have_scheme ? 0 : sizeof("http://") - 1; // extra space for http://
500
501
35.6k
  // the struct is followed by two copies of the URI: the pristine one (iri->uri)
  // and a working copy that is split into NUL-terminated components below
  iri = wget_malloc(sizeof(wget_iri) + (slen + extra + 1) * 2);
502
35.6k
  if (!iri)
503
0
    return NULL;
504
505
35.6k
  memset(iri, 0, sizeof(wget_iri));
506
507
35.6k
  if (have_scheme) {
508
17.6k
    iri->msize = slen + 1;
509
17.6k
    iri->uri = memcpy(iri + 1, url, iri->msize);
510
17.6k
    p = s = memcpy((char *)iri->uri + iri->msize, url, iri->msize);
511
17.6k
    s = strchr(s, ':'); // we know there is a :
512
17.6k
    *s++ = 0;
513
514
    // p points to scheme
515
17.6k
    wget_iri_unescape_inline(p); // percent unescape
516
17.6k
    wget_strtolower(p); // convert to lowercase
517
518
17.6k
    bool found = false; // assume the scheme is unsupported
519
520
    // find the scheme in our static list of supported schemes
521
    // for later comparisons we compare pointers (avoiding strcasecmp())
522
20.8k
    for (unsigned it = 0; it < countof(schemes); it++) {
523
19.2k
      if (!strcmp(schemes[it].name, p)) {
524
16.0k
        iri->scheme = it;
525
16.0k
        iri->port = schemes[it].port;
526
16.0k
        found = true;
527
16.0k
        break;
528
16.0k
      }
529
19.2k
    }
530
531
17.6k
    if (!found) {
532
1.57k
      debug_printf("Unsupported scheme in '%s'\n", url);
533
1.57k
      wget_iri_free(&iri);
534
1.57k
      return NULL;
535
1.57k
    }
536
18.0k
  } else {
537
    // add http:// scheme to url
538
18.0k
    iri->uri = memcpy(iri + 1, "http://", extra);
539
18.0k
    memcpy((char *)iri->uri + extra, url, slen + 1);
540
18.0k
    iri->msize = extra + slen + 1;
541
18.0k
    s = memcpy((char *)iri->uri + iri->msize, iri->uri, iri->msize);
542
18.0k
    // cut the working copy after "http"; s then continues behind "http://"
    s[extra - 3] = 0;
543
18.0k
    s += extra;
544
545
18.0k
    iri->scheme = WGET_IRI_SCHEME_HTTP;
546
18.0k
    iri->port = schemes[WGET_IRI_SCHEME_HTTP].port;
547
18.0k
  }
548
549
//  if (url_allocated)
550
//    xfree(url);
551
552
  // this is true for http, https, ftp, file (accept any number of /, like most browsers)
553
69.1k
  while (*s == '/')
554
35.0k
    s++;
555
556
  // authority
557
34.1k
  authority = s;
558
1.03M
  while (*s && *s != '/' && *s != '?' && *s != '#')
559
1.00M
    s++;
560
34.1k
  // remember the delimiter in c before it is overwritten by the terminator
  c = *s;
561
34.1k
  if (c) *s++ = 0;
562
34.1k
  wget_iri_unescape_inline(authority);
563
564
  // left over: [path][?query][#fragment]
565
34.1k
  if (c == '/') {
566
8.31k
    iri->path = s;
567
792k
    while (*s && *s != '?' && *s != '#')
568
784k
      s++;
569
8.31k
    c = *s;
570
8.31k
    if (c) *s++ = 0;
571
8.31k
    wget_iri_unescape_inline((char *)iri->path);
572
8.31k
    normalize_path((char *)iri->path);
573
8.31k
  }
574
575
34.1k
  if (c == '?') {
576
1.64k
    iri->query = s;
577
14.3k
    while (*s && *s != '#') {
578
12.7k
      // a '+' in the query is replaced by a space
      if (*s == '+')
579
710
        *s = ' ';
580
12.7k
      s++;
581
12.7k
    }
582
1.64k
    c = *s;
583
1.64k
    if (c) *s++ = 0;
584
    /* do not unescape query else we get ambiguity for chars like &, =, +, ... */
585
1.64k
  }
586
587
34.1k
  if (c == '#') {
588
1.47k
    iri->fragment = s;
589
1.47k
    s += strlen(s);
590
1.47k
    wget_iri_unescape_inline((char *)iri->fragment);
591
1.47k
  }
592
593
34.1k
  if (*s) {
594
0
    debug_printf("unparsed rest '%s'\n", s);
595
0
  }
596
597
34.1k
  // split the authority into [userinfo[:password]@]host[:port]
  if (*authority) {
598
33.0k
    s = authority;
599
33.0k
    p = strchr(authority, '@');
600
33.0k
    if (p) {
601
230
      iri->userinfo = s;
602
230
      *p = 0;
603
230
      if ((s = strchr(s, ':'))) {
604
14
        *s = 0;
605
14
        iri->password = s + 1;
606
14
      }
607
230
      s = p + 1;
608
230
    }
609
33.0k
    // bracketed host: the host is what is between '[' and the last ']'
    if (*s == '[') {
610
62
      p = strrchr(s, ']');
611
62
      if (p) {
612
42
        iri->host = s + 1;
613
42
        *p = 0;
614
42
        s = p + 1;
615
42
      } else {
616
        // something is broken
617
20
        iri->host = s + 1;
618
20
        s += strlen(s);
619
20
      }
620
32.9k
    } else {
621
32.9k
      iri->host = s;
622
1.00M
      while (*s && *s != ':')
623
969k
        s++;
624
32.9k
    }
625
33.0k
    if (*s == ':') {
626
5.78k
      // a ':' that is not followed by a digit is silently dropped by the '*s = 0' below
      if (c_isdigit(s[1])) {
627
1.32k
        unsigned long port = strtoul(s + 1, NULL, 10);
628
1.32k
        if (port == 0 || port > 65535) {
629
920
          error_printf(_("Port number must be in the range 1..65535\n"));
630
920
          wget_iri_free(&iri);
631
920
          return NULL;
632
920
        }
633
408
        iri->port = (uint16_t) port;
634
408
        iri->port_given = true;
635
408
      }
636
5.78k
    }
637
32.1k
    *s = 0;
638
32.1k
  }
639
640
  // now unescape all components (not interested in display, userinfo, password right now)
641
642
33.1k
  if (iri->host) {
643
32.1k
    wget_strtolower((char *)iri->host);
644
32.1k
    if (wget_str_needs_encoding(iri->host)) {
645
13.6k
      if ((s = wget_str_to_utf8(iri->host, encoding))) {
646
13.6k
        iri->host = s;
647
13.6k
        iri->host_allocated = true;
648
13.6k
      }
649
13.6k
    }
650
32.1k
    // a returned pointer different from iri->host is treated as a newly allocated string
    if ((p = (char *)wget_str_to_ascii(iri->host)) != iri->host) {
651
6.36k
      if (iri->host_allocated)
652
6.36k
        xfree(iri->host);
653
6.36k
      iri->host = p;
654
6.36k
      iri->host_allocated = true;
655
6.36k
    }
656
657
    // Finally, if the host is a literal IPv4 or IPv6 address, mark it as so
658
32.1k
    if (wget_ip_is_family(iri->host, WGET_NET_FAMILY_IPV4) || wget_ip_is_family(iri->host, WGET_NET_FAMILY_IPV6))
659
19
      iri->is_ip_address = true;
660
32.1k
  }
661
662
33.1k
  if (!iri->host) {
663
1.05k
    error_printf(_("Missing host/domain in URI '%s'\n"), iri->uri);
664
1.05k
    wget_iri_free(&iri);
665
1.05k
    return NULL;
666
1.05k
  }
667
668
32.1k
  // path, query and fragment are converted to UTF-8 where needed; if the conversion
  // fails the unconverted string (inside the working copy) is kept
  if (iri->path && wget_str_needs_encoding(iri->path)) {
669
5.40k
    if ((s = wget_str_to_utf8(iri->path, encoding))) {
670
5.40k
      iri->path = s;
671
5.40k
      iri->path_allocated = true;
672
5.40k
    }
673
5.40k
  }
674
675
32.1k
  if (iri->query && wget_str_needs_encoding(iri->query)) {
676
730
    if ((s = wget_str_to_utf8(iri->query, encoding))) {
677
730
      iri->query = s;
678
730
      iri->query_allocated = true;
679
730
    }
680
730
  }
681
682
32.1k
  if (iri->fragment && wget_str_needs_encoding(iri->fragment)) {
683
84
    if ((s = wget_str_to_utf8(iri->fragment, encoding))) {
684
84
      iri->fragment = s;
685
84
      iri->fragment_allocated = true;
686
84
    }
687
84
  }
688
689
32.1k
  // with userinfo present safe_uri is built by create_safe_uri(), else it aliases iri->uri
  if (iri->userinfo) {
690
222
    iri->safe_uri = create_safe_uri(iri);
691
31.9k
  } else {
692
31.9k
    iri->safe_uri = iri->uri;
693
31.9k
  }
694
695
/*
696
  debug_printf("scheme=%s\n",iri->scheme);
697
  debug_printf("host=%s\n",iri->host);
698
  debug_printf("path=%s\n",iri->path);
699
  debug_printf("query=%s\n",iri->query);
700
  debug_printf("fragment=%s\n",iri->fragment);
701
*/
702
32.1k
  return iri;
703
33.1k
}
704
705
/**
706
 * \param[in] iri An IRI
707
 * \return A new IRI, with the exact same contents as the provided one, or NULL if \p iri or its
 * `uri` field is NULL or if memory allocation fails.
708
 *
709
 * Clone the provided IRI.
710
 */
711
wget_iri *wget_iri_clone(const wget_iri *iri)
712
9.25k
{
713
9.25k
  if (!iri || !iri->uri)
714
1.39k
    return NULL;
715
716
7.86k
  size_t slen = strlen(iri->uri);
717
7.86k
  wget_iri *clone = wget_malloc(sizeof(wget_iri) + (slen + 1) + iri->msize);
718
719
7.86k
  if (!clone)
720
0
    return NULL;
721
722
7.86k
  memcpy(clone, iri, sizeof(wget_iri));
723
7.86k
  clone->uri = memcpy(clone + 1, iri->uri, (slen + 1) + iri->msize);
724
7.86k
  clone->uri_allocated = 0;
725
726
7.86k
  if (iri->userinfo)
727
102
    clone->safe_uri = wget_strdup(iri->safe_uri);
728
7.75k
  else
729
7.75k
    clone->safe_uri = clone->uri;
730
731
7.86k
  clone->connection_part = wget_strdup(iri->connection_part);
732
733
  // adjust pointers
734
7.86k
  if (iri->host_allocated)
735
6.46k
    clone->host = wget_strdup(iri->host);
736
1.39k
  else
737
1.39k
    clone->host = iri->host ? (char *)clone + (size_t) (iri->host - (const char *)iri) : NULL;
738
739
7.86k
  clone->display = iri->display ? (char *)clone + (size_t) (iri->display - (const char *)iri): NULL;
740
  // not adjust scheme, it is a pointer to a static string
741
7.86k
  clone->userinfo = iri->userinfo ? (char *)clone + (size_t) (iri->userinfo - (const char *)iri): NULL;
742
7.86k
  clone->password = iri->password ? (char *)clone + (size_t) (iri->password - (const char *)iri): NULL;
743
744
7.86k
  if (iri->path_allocated)
745
390
    clone->path = wget_strdup(iri->path);
746
7.47k
  else
747
7.47k
    clone->path = iri->path ? (char *)clone + (size_t) (iri->path - (const char *)iri): NULL;
748
749
7.86k
  if (iri->query_allocated)
750
312
    clone->query = wget_strdup(iri->query);
751
7.54k
  else
752
7.54k
    clone->query = iri->query ? (char *)clone + (size_t) (iri->query - (const char *)iri): NULL;
753
754
7.86k
  if (iri->fragment_allocated)
755
28
    clone->fragment = wget_strdup(iri->fragment);
756
7.83k
  else
757
7.83k
    clone->fragment = iri->fragment ? (char *)clone + (size_t) (iri->fragment - (const char *)iri): NULL;
758
759
7.86k
  return clone;
760
7.86k
}
761
762
/**
763
 * \param[in] iri An IRI
764
 * \param[in] buf A buffer, where the resulting string will be put
765
 * \return The contents of the buffer \p buf
766
 *
767
 * Append the connection part of the IRI \p iri to \p buf.
768
 *
769
 * The connection part is formed by the scheme, the hostname, and optionally the port. For example:
770
 *
771
 *     https://localhost:8080
772
 *     https://www.example.com
773
 *
774
 * It may be of the form `https://example.com:8080` if the port was provided when creating the IRI
775
 * or of the form `https://example.com` otherwise.
776
 */
777
const char *wget_iri_get_connection_part(const wget_iri *iri, wget_buffer *buf)
{
  // without an IRI nothing is appended, the buffer content is returned as is
  if (!iri)
    return buf->data;

  const char *scheme_name = schemes[iri->scheme].name;

  // IPv6 literals are put into brackets
  if (wget_ip_is_family(iri->host, WGET_NET_FAMILY_IPV6))
    wget_buffer_printf_append(buf, "%s://[%s]", scheme_name, iri->host);
  else
    wget_buffer_printf_append(buf, "%s://%s", scheme_name, iri->host);

  // the port is only appended if it was explicitly given
  if (iri->port_given)
    wget_buffer_printf_append(buf, ":%hu", iri->port);

  return buf->data;
}
791
792
// normalize /../ and remove /./
793
794
/*
 * Normalize path in place: leading '/', './' and '../' are dropped, '/./' and
 * double slashes are collapsed and '/../' removes the preceding path segment.
 * Normalization stops at the first '?' or '#'; the rest is kept unchanged.
 *
 * Returns the length of the resulting string.
 */
static size_t WGET_GCC_NONNULL_ALL normalize_path(char *path)
{
  char *dst = path; // write position
  char *src = path; // read position

  debug_printf("path %s ->\n", path);

  // skip ./ and ../ at the beginning of the path
  for (;;) {
    if (*src == '/') {
      src++;
      continue;
    }

    if (*src != '.')
      break;

    if (src[1] == '/')
      src += 2; // "./"
    else if (src[1] == '.' && src[2] == '/')
      src += 3; // "../"
    else if (src[1] == '.' && !src[2])
      src += 2; // ".." at the end
    else if (!src[1])
      src++; // "." at the end
    else
      break; // a name that just starts with a dot
  }

  // normalize path but stop at query or fragment
  while (*src && *src != '?' && *src != '#') {
    if (*src != '/') {
      *dst++ = *src++;
      continue;
    }

    if (src[1] == '.') {
      if (!strncmp(src, "/../", 4)) {
        // go one level up, the following '/' is handled by the next iteration
        src += 3;
        while (dst > path && *--dst != '/')
          ;
      } else if (!strcmp(src, "/..")) {
        // trailing "/..": go one level up but keep a trailing slash
        src += 3;
        while (dst > path && *--dst != '/')
          ;
        if (dst > path)
          *dst++ = '/';
      } else if (!strncmp(src, "/./", 3)) {
        src += 2;
      } else if (!strcmp(src, "/.")) {
        // trailing "/.": drop the dot but keep a trailing slash
        src += 2;
        if (dst > path)
          *dst++ = '/';
      } else {
        *dst++ = *src++;
      }
    } else if (dst == path) {
      src++; // avoid leading slash
    } else if (src[1] == '/') {
      src++; // double slash to single slash
    } else {
      *dst++ = *src++;
    }
  }

  if (dst == src) {
    // nothing was removed, just move to the end of the string
    dst += strlen(dst);
  } else {
    // append the unprocessed rest (query / fragment) and terminate
    while (*src)
      *dst++ = *src++;

    *dst = 0;
  }

  debug_printf("     %s\n", path);

  return (size_t) (dst - path);
}
865
866
// create an absolute URI from a base + relative URI
867
868
//char *iri_relative_to_absolute(IRI *iri, const char *tag, const char *val, size_t len, char *dst, size_t dst_size)
869
/**
870
 * \param[in] base A base IRI
871
 * \param[in] val A path, or another URI
872
 * \param[in] len Length of the string \p val or -1
873
 * \param[in] buf Destination buffer, where the result will be copied.
874
 * \return A new URI (string) which is based on the base IRI \p base provided, or NULL in case of error.
875
 *
876
 * Calculates a new URI which is based on the provided IRI \p base.
877
 *
878
 * Taking the IRI \p base as a starting point, a new URI is created with the path \p val, which may be
879
 * a relative or absolute path, or even a whole URI. The result is returned as a string, and if the buffer
880
 * \p buf is provided, it is also placed there.
881
 *
882
 * If \p val is an absolute path (it begins with a `/`), it is normalized first. Then the provided IRI's
883
 * path is replaced by that new path. If it's a relative path, the file name of the \p base IRI's path
884
 * is replaced by that path. Finally, if \p val begins with a scheme (such as `https://`) that string is returned
885
 * untouched, and placed in the buffer if provided.
886
 *
887
 * If \p base is NULL, then \p val must itself be an absolute URI. Likewise, if \p buf is NULL,
888
 * then \p val must also be an absolute URI.
889
 *
890
 * if \p len is `-1`, the length of \p val will be the result from `strlen(val)`.
891
 */
892
const char *wget_iri_relative_to_abs(const wget_iri *base, const char *val, size_t len, wget_buffer *buf)
893
27.7k
{
894
27.7k
  if (len == (size_t) -1)
895
18.5k
    len = strlen(val);
896
897
27.7k
  if (*val == '/') {
898
3.85k
    if (base) {
899
2.56k
      char tmp[4096], *path = tmp;
900
901
2.56k
      if (len >= sizeof(tmp)) {
902
26
        path = wget_malloc(len + 1);
903
26
        if (!path)
904
0
          return NULL;
905
26
      }
906
907
      // strlcpy or snprintf are ineffective here since they do strlen(val), which might be large
908
2.56k
      wget_strscpy(path, val, len + 1);
909
910
2.56k
      if (len >= 2 && val[1] == '/') {
911
1.01k
        char *p;
912
913
        // absolute URI without scheme: //authority/path...
914
1.01k
        if ((p = strchr(path + 2, '/')))
915
1.00k
          normalize_path(p + 1);
916
917
1.01k
        wget_buffer_strcpy(buf, schemes[base->scheme].name);
918
1.01k
        wget_buffer_strcat(buf, ":");
919
1.01k
        wget_buffer_strcat(buf, path);
920
1.55k
      } else {
921
        // absolute path
922
1.55k
        normalize_path(path);
923
924
1.55k
        wget_buffer_reset(buf);
925
1.55k
        wget_iri_get_connection_part(base, buf);
926
1.55k
        wget_buffer_strcat(buf, "/");
927
1.55k
        wget_buffer_strcat(buf, path);
928
1.55k
      }
929
930
2.56k
      if (path != tmp)
931
26
        xfree(path);
932
2.56k
    } else {
933
1.28k
      return NULL;
934
1.28k
    }
935
23.9k
  } else {
936
    // see if URI begins with a scheme:
937
23.9k
    if (memchr(val, ':', len)) {
938
      // absolute URI
939
7.43k
      if (buf) {
940
4.96k
        wget_buffer_memcpy(buf, val, len);
941
4.96k
      } else {
942
2.47k
        return val;
943
2.47k
      }
944
16.4k
    } else if (base) {
945
      // relative path
946
10.9k
      const char *lastsep = base->path ? strrchr(base->path, '/') : NULL;
947
10.9k
      wget_buffer_reset(buf);
948
10.9k
      wget_iri_get_connection_part(base, buf);
949
10.9k
      wget_buffer_strcat(buf, "/");
950
951
10.9k
      size_t tmp_len = buf->length;
952
953
10.9k
      if (lastsep)
954
0
        wget_buffer_memcat(buf, base->path, lastsep - base->path + 1);
955
956
10.9k
      if (len)
957
10.9k
        wget_buffer_memcat(buf, val, len);
958
959
10.9k
      buf->length = normalize_path(buf->data + tmp_len) + tmp_len;
960
10.9k
    } else if (val[len] == 0) {
961
5.50k
      return val;
962
5.50k
    } else {
963
0
      return NULL;
964
0
    }
965
23.9k
  }
966
967
18.5k
  return likely(buf) ? buf->data : NULL;
968
27.7k
}
969
970
/**
971
 * \param[in] base The base IRI
972
 * \param[in] url A relative/absolute path (or a URI) to be appended to \p base
973
 * \param[in] encoding The encoding of \p url (e.g. "utf-8" or "iso-8859-1")
974
 * \return A new IRI
975
 *
976
 * Generate a new IRI by using the provided IRI \p base as a base and the path \p url.
977
 *
978
 * This is equivalent to:
979
 *
980
 *     wget_iri *iri = wget_iri_parse(wget_iri_relative_to_abs(base, url, strlen(url), NULL), encoding);
981
 *     return iri;
982
 *
983
 * As such, \p url can be a relative or absolute path, or another URI.
984
 *
985
 * If \p base is NULL, then the parameter \p url must itself be an absolute URI.
986
 */
987
wget_iri *wget_iri_parse_base(const wget_iri *base, const char *url, const char *encoding)
988
18.5k
{
989
18.5k
  wget_iri *iri;
990
991
18.5k
  if (base) {
992
9.25k
    wget_buffer buf;
993
9.25k
    char sbuf[256];
994
995
9.25k
    wget_buffer_init(&buf, sbuf, sizeof(sbuf));
996
9.25k
    iri = wget_iri_parse(wget_iri_relative_to_abs(base, url, (size_t) -1, &buf), encoding);
997
9.25k
    wget_buffer_deinit(&buf);
998
9.25k
  } else {
999
    // no base: just check URL for being an absolute URI
1000
9.25k
    iri = wget_iri_parse(wget_iri_relative_to_abs(NULL, url, (size_t) -1, NULL), encoding);
1001
9.25k
  }
1002
1003
18.5k
  return iri;
1004
18.5k
}
1005
1006
// RFC conform comparison as described in https://tools.ietf.org/html/rfc2616#section-3.2.3
1007
/**
1008
 * \param[in] iri1 An IRI
1009
 * \param[in] iri2 Another IRI
1010
 * \return 0 if both IRIs are equal according to RFC 2616 or a non-zero value otherwise
1011
 *
1012
 * Compare two IRIs.
1013
 *
1014
 * Comparison is performed according to [RFC 2616, sect. 3.2.3](https://tools.ietf.org/html/rfc2616#section-3.2.3).
1015
 *
1016
 * This function uses wget_strcasecmp() to compare the various parts of the IRIs so a non-zero negative return value
1017
 * indicates that \p iri1 is less than \p iri2, whereas a positive value indicates \p iri1 is greater than \p iri2.
1018
 */
1019
int wget_iri_compare(const wget_iri *iri1, const wget_iri *iri2)
1020
9.25k
{
1021
9.25k
  int n;
1022
1023
9.25k
  if (!iri1) {
1024
1.39k
    if (!iri2)
1025
852
      return 0;
1026
546
    else
1027
546
      return -1;
1028
7.86k
  } else if (!iri2)
1029
6
    return 1;
1030
1031
//  info_printf("iri %p %p %s:%s %s:%s\n",iri1,iri2,iri1->scheme,iri1->port,iri2->scheme,iri2->port);
1032
1033
/*
1034
  if (!iri1->path) {
1035
//    if (iri2->path && strcmp(iri2->path, "/"))
1036
    if (iri2->path)
1037
      return -1;
1038
  }
1039
  else if (!iri2->path) {
1040
//    if (iri1->path && strcmp(iri1->path, "/"))
1041
    if (iri1->path)
1042
      return 1;
1043
  }
1044
*/
1045
7.85k
  if ((n = wget_strcasecmp(iri1->path, iri2->path)))
1046
5.44k
    return n;
1047
1048
2.40k
  if ((n = wget_strcasecmp(iri1->query, iri2->query)))
1049
234
    return n;
1050
1051
2.17k
  if (iri1->scheme != iri2->scheme)
1052
0
    return iri1->scheme < iri2->scheme ? -1 : 1;
1053
1054
2.17k
  if ((n = iri1->port - iri2->port))
1055
66
    return n;
1056
1057
  // host is already lowercase, no need to call strcasecmp()
1058
2.10k
  if ((n = strcmp(iri1->host, iri2->host)))
1059
463
    return n;
1060
1061
  // if ((n = wget_strcasecmp(iri1->fragment, iri2->fragment)))
1062
  //    return n;
1063
1064
1.64k
  return 0;
1065
2.10k
}
1066
1067
/**
1068
 * \param[in] src A string, whose reserved characters are to be percent-encoded
1069
 * \param[in] buf A buffer where the result will be copied.
1070
 * \return The contents of the buffer \p buf after \p src has been encoded.
1071
 *
1072
 * Escapes (using percent-encoding) all the reserved characters in the string \p src.
1073
 *
1074
 * If \p src is NULL, the contents of the buffer \p buf are returned. \p buf cannot be NULL.
1075
 */
1076
const char *wget_iri_escape(const char *src, wget_buffer *buf)
1077
17.1k
{
1078
17.1k
  const char *begin;
1079
1080
17.1k
  if (!src)
1081
0
    return buf->data;
1082
1083
1.34M
  for (begin = src; *src; src++) {
1084
1.32M
    if (!iri_isunreserved(*src)) {
1085
1.15M
      if (begin != src)
1086
21.7k
        wget_buffer_memcat(buf, begin, src - begin);
1087
1.15M
      begin = src + 1;
1088
1.15M
      wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src);
1089
1.15M
    }
1090
1.32M
  }
1091
1092
17.1k
  if (begin != src)
1093
6.92k
    wget_buffer_memcat(buf, begin, src - begin);
1094
1095
17.1k
  return buf->data;
1096
17.1k
}
1097
1098
/**
1099
 * \param[in] src A string, whose reserved characters are to be percent-encoded
1100
 * \param[in] buf A buffer where the result will be copied.
1101
 * \return The contents of the buffer \p buf after \p src has been encoded
1102
 * as described in https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1.
1103
 *
1104
 * Escapes the path part of the URI suitable for GET/POST requests (origin-form).
1105
 *   origin-form    = absolute-path [ "?" query ]
1106
 *   path-absolute = "/" [ segment-nz *( "/" segment ) ]
1107
 *   segment-nz    = 1*pchar
1108
 *   segment       = *pchar
1109
 *   pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
1110
 */
1111
const char *wget_iri_escape_path(const char *src, wget_buffer *buf)
1112
10.3k
{
1113
10.3k
  const char *begin;
1114
1115
927k
  for (begin = src; *src; src++) {
1116
916k
    if (!(iri_isunreserved(*src) || iri_issubdelim(*src) || *src == '/' || *src == ':' || *src == '@')) {
1117
804k
      if (begin != src)
1118
16.3k
        wget_buffer_memcat(buf, begin, src - begin);
1119
804k
      begin = src + 1;
1120
804k
      wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src);
1121
804k
    }
1122
916k
  }
1123
1124
10.3k
  if (begin != src)
1125
5.22k
    wget_buffer_memcat(buf, begin, src - begin);
1126
1127
10.3k
  return buf->data;
1128
10.3k
}
1129
1130
/**
1131
 * \param[in] src A string, whose reserved characters are to be percent-encoded
1132
 * \param[in] buf A buffer where the result will be copied.
1133
 * \return The contents of the buffer \p buf after \p src has been encoded.
1134
 *
1135
 * Escapes (using percent-encoding) all the reserved characters in the string \p src
1136
 * (just like wget_iri_escape()), but **excluding the equal sign `=` and the ampersand `&`**.
1137
 * This function is thus ideally suited for query parts of URIs.
1138
 */
1139
const char *wget_iri_escape_query(const char *src, wget_buffer *buf)
1140
9.25k
{
1141
9.25k
  const char *begin;
1142
1143
681k
  for (begin = src; *src; src++) {
1144
672k
    if (!iri_isunreserved(*src) && *src != '=' && *src != '&') {
1145
585k
      if (begin != src)
1146
15.2k
        wget_buffer_memcat(buf, begin, src - begin);
1147
585k
      begin = src + 1;
1148
585k
      if (*src == ' ')
1149
518
        wget_buffer_memcat(buf, "+", 1);
1150
584k
      else
1151
584k
        wget_buffer_printf_append(buf, "%%%02X", (unsigned char)*src);
1152
585k
    }
1153
672k
  }
1154
1155
9.25k
  if (begin != src)
1156
2.38k
    wget_buffer_memcat(buf, begin, src - begin);
1157
1158
9.25k
  return buf->data;
1159
9.25k
}
1160
1161
/**
1162
 * \param[in] iri An IRI
1163
 * \param[in] buf A buffer, where the resulting string will be put
1164
 * \return The contents of the buffer \p buf
1165
 *
1166
 * Return the host part of the provided IRI. It is placed in the buffer \p buf
1167
 * and also returned as a `const char *`.
1168
 *
1169
 * The host is escaped using wget_iri_escape().
1170
 */
1171
const char *wget_iri_get_escaped_host(const wget_iri *iri, wget_buffer *buf)
1172
7.86k
{
1173
7.86k
  return wget_iri_escape(iri->host, buf);
1174
7.86k
}
1175
1176
/**
1177
 * \param[in] iri An IRI
1178
 * \param[in] buf A buffer, where the resulting string will be put
1179
 * \return The contents of the buffer \p buf
1180
 *
1181
 * Return the resource string, suitable for use in HTTP requests.
1182
 * Details:
1183
 *   https://datatracker.ietf.org/doc/html/rfc7230#section-3.1.1
1184
 *   https://datatracker.ietf.org/doc/html/rfc7230#section-2.7
1185
 *   https://datatracker.ietf.org/doc/html/rfc3986#section-3.3
1186
 *
1187
 * The resource string is comprised of the path, plus the query part, if present. Example:
1188
 *
1189
 *     /foo/bar/?param_1=one&param_2=two
1190
 *
1191
 * Both the path and the query are escaped using wget_iri_escape_path() and
1192
 * wget_iri_escape_query(), respectively.
1193
 *
1194
 * The resulting string is placed in the buffer \p buf and also returned as a `const char *`.
1195
 */
1196
const char *wget_iri_get_escaped_resource(const wget_iri *iri, wget_buffer *buf)
1197
7.86k
{
1198
7.86k
  if (iri->path)
1199
1.06k
    wget_iri_escape_path(iri->path, buf);
1200
1201
  // Do not actually escape the query field. This part of the URL *MAY*
1202
  // contain reserved characters which should be passed on as-is and without
1203
  // escaping them. This is according to the rules laid out in RFC 2616 and
1204
  // RFC 7230. But we have to replace spaces in any case.
1205
7.86k
  if (iri->query) {
1206
646
    wget_buffer_memcat(buf, "?", 1);
1207
7.25k
    for (const char *p = iri->query; *p; p++)
1208
6.60k
      wget_buffer_memcat(buf, *p == ' ' ? "+" : p, 1);
1209
646
  }
1210
1211
7.86k
  return buf->data;
1212
7.86k
}
1213
1214
/**
1215
 * \param[in] iri An IRI
1216
 * \param[in] buf A buffer, where the resulting string will be put
1217
 * \param[in] encoding Character set the string should be converted to
1218
 * \return The contents of the buffer \p buf
1219
 *
1220
 * Get the path part of the provided IRI.
1221
 *
1222
 * The path is appended to \p buf. If \p buf is non-empty and does not end with
1223
 * a path separator (`/`), then one is added before the path is appended to \p
1224
 * buf.
1225
 *
1226
 * If \p encoding is provided, this function will try to convert the path (which is originally
1227
 * in UTF-8) to that encoding.
1228
 */
1229
1230
char *wget_iri_get_path(const wget_iri *iri, wget_buffer *buf, const char *encoding)
1231
7.86k
{
1232
7.86k
  if (buf->length != 0 && buf->data[buf->length - 1] != '/')
1233
7.74k
    wget_buffer_memcat(buf, "/", 1);
1234
1235
7.86k
  if (iri->path) {
1236
1.06k
    if (wget_strcasecmp_ascii(encoding, "utf-8")) {
1237
533
      char *fname;
1238
1239
533
      if ((fname = wget_utf8_to_str(iri->path, encoding))) {
1240
533
        wget_buffer_strcat(buf, fname);
1241
533
        xfree(fname);
1242
533
      } else {
1243
        // conversion failed, keep original string
1244
0
        wget_buffer_strcat(buf, iri->path);
1245
0
      }
1246
533
    } else {
1247
533
      wget_buffer_strcat(buf, iri->path);
1248
533
    }
1249
1.06k
  }
1250
1251
7.86k
  if ((buf->length == 0 || buf->data[buf->length - 1] == '/') && default_page)
1252
7.42k
    wget_buffer_memcat(buf, default_page, default_page_length);
1253
1254
7.86k
  return buf->data;
1255
7.86k
}
1256
1257
/**
1258
 * \param[in] iri An IRI
1259
 * \param[in] buf A buffer, where the resulting string will be put
1260
 * \param[in] encoding Character set the string should be converted to
1261
 * \return The contents of the buffer \p buf
1262
 *
1263
 * Take the query part, and escape the path separators (`/`), so that it can be used as part
1264
 * of a filename.
1265
 *
1266
 * The resulting string will be placed in the buffer \p buf and also returned as a `const char *`.
1267
 * If the provided IRI has no query part, then the original contents of \p buf are returned and \p buf
1268
 * is kept untouched.
1269
 *
1270
 * If \p encoding is provided, this function will try to convert the query (which is originally
1271
 * in UTF-8) to that encoding.
1272
 */
1273
char *wget_iri_get_query_as_filename(const wget_iri *iri, wget_buffer *buf, const char *encoding)
1274
15.7k
{
1275
15.7k
  if (iri->query) {
1276
1.29k
    const char *query;
1277
1.29k
    int allocated = 0;
1278
1279
1.29k
    wget_buffer_memcat(buf, "?", 1);
1280
1281
1.29k
    if (wget_strcasecmp_ascii(encoding, "utf-8")) {
1282
646
      if ((query = wget_utf8_to_str(iri->query, encoding)))
1283
646
        allocated = 1;
1284
0
      else
1285
0
        query = iri->query;
1286
646
    } else {
1287
646
      query = iri->query;
1288
646
    }
1289
1290
1.29k
    int slashes = 0;
1291
1.29k
    const char *src = query;
1292
1293
    // count slashes in query string
1294
3.15k
    while ((src = strchr(src, '/'))) {
1295
1.86k
      slashes++;
1296
1.86k
      src++;
1297
1.86k
    }
1298
1299
1.29k
    if (slashes) {
1300
      // escape slashes to use query as part of a filename
1301
636
      const char *begin;
1302
1303
10.1k
      for (src = begin = query; *src; src++) {
1304
9.50k
        if (*src == '/') {
1305
1.86k
          if (begin != src)
1306
1.09k
            wget_buffer_memcat(buf, begin, src - begin);
1307
1.86k
          begin = src + 1;
1308
1.86k
          wget_buffer_memcat(buf, "%2F", 3);
1309
1.86k
        }
1310
9.50k
      }
1311
1312
636
      if (begin != src)
1313
500
        wget_buffer_memcat(buf, begin, src - begin);
1314
656
    } else {
1315
656
      wget_buffer_strcat(buf, query);
1316
656
    }
1317
1318
1.29k
    if (allocated)
1319
646
      xfree(query);
1320
1.29k
  }
1321
1322
15.7k
  return buf->data;
1323
15.7k
}
1324
1325
/**
1326
 * \param[in] iri An IRI
1327
 * \param[in] buf A buffer, where the resulting string will be put
1328
 * \param[in] encoding Character set the string should be converted to
1329
 * \return The contents of the buffer \p buf
1330
 *
1331
 * Get the filename of the path of the provided IRI.
1332
 *
1333
 * This is similar to wget_iri_get_path(), but instead of returning the whole path
1334
 * it only returns the substring after the last occurrence of `/`. In other words, the
1335
 * filename of the path.
1336
 *
1337
 * This is also known as the "basename" in the UNIX world, and the output of this function
1338
 * would be equivalent to the output of the `basename(1)` tool.
1339
 *
1340
 * The path is copied into \p buf if it's empty. If the buffer \p buf is not empty,
1341
 * it is appended to it after a path separator (`/`).
1342
 *
1343
 * If \p encoding is provided, this function will try to convert the path (which is originally
1344
 * in UTF-8) to that encoding.
1345
 */
1346
char *wget_iri_get_basename(const wget_iri *iri, wget_buffer *buf, const char *encoding, int flags)
1347
7.86k
{
1348
7.86k
  if (iri->path) {
1349
1.06k
    char *fname;
1350
1351
1.06k
    if (wget_strcasecmp_ascii(encoding, "utf-8")) {
1352
533
      char *p;
1353
1354
533
      if ((p = strrchr(iri->path, '/'))) {
1355
111
        if (!(fname = wget_utf8_to_str(p + 1, encoding)))
1356
0
          wget_buffer_strcat(buf, p + 1); // conversion failed, keep original string
1357
422
      } else {
1358
422
        if (!(fname = wget_utf8_to_str(iri->path, encoding)))
1359
0
          wget_buffer_strcat(buf, iri->path); // conversion failed, keep original string
1360
422
      }
1361
1362
533
      if (fname) {
1363
        // conversion succeeded
1364
533
        wget_buffer_strcat(buf, fname);
1365
533
        xfree(fname);
1366
533
      }
1367
533
    } else {
1368
533
      if ((fname = strrchr(iri->path, '/')))
1369
111
        wget_buffer_strcat(buf, fname + 1);
1370
422
      else
1371
422
        wget_buffer_strcat(buf, iri->path);
1372
533
    }
1373
1.06k
  }
1374
1375
7.86k
  if ((buf->length == 0 || buf->data[buf->length - 1] == '/') && default_page)
1376
0
    wget_buffer_memcat(buf, default_page, default_page_length);
1377
1378
7.86k
  if (flags & WGET_IRI_WITH_QUERY)
1379
7.86k
    return wget_iri_get_query_as_filename(iri, buf, encoding);
1380
1381
0
  return buf->data;
1382
7.86k
}
1383
1384
// escaping: see https://tools.ietf.org/html/rfc2396#2 following (especially 2.4.2)
1385
/*const char *iri_escape(const char *uri)
1386
{
1387
  int esc = 0;
1388
  const char *p;
1389
1390
  for (p = uri; *p; p++) {
1391
    if (*p == '%') {
1392
      if ((isxdigit(p[1]) && isxdigit(p[2])) || p[1] == '%')
1393
        return uri; // assume that URI is already escaped
1394
      esc++;
1395
    } else if ()
1396
  }
1397
}
1398
*/
1399
1400
void wget_iri_set_defaultpage(const char *page)
1401
4.62k
{
1402
4.62k
  default_page = page;
1403
4.62k
  default_page_length = default_page ? strlen(default_page) : 0;
1404
4.62k
}
1405
1406
/**
1407
 * \param scheme The scheme for the new default port
1408
 * \param port The new default port value for the given scheme
1409
 * \return 0: success  -1: Unknown scheme
1410
 *
1411
 * Set the default \p port for the given \p scheme.
1412
 */
1413
int wget_iri_set_defaultport(wget_iri_scheme scheme, uint16_t port)
1414
0
{
1415
0
  if ((unsigned) scheme < countof(schemes)) {
1416
0
    schemes[scheme].port = port;
1417
0
    return 0;
1418
0
  }
1419
1420
0
  return -1;
1421
0
}
1422
1423
/**
1424
 * \param[in] iri An IRI
1425
 * \param[in] scheme A scheme, such as `http` or `https`.
1426
 * \return The original scheme of IRI (ie. before the replacement)
1427
 *
1428
 * Set the scheme of the provided IRI. The IRI's original scheme
1429
 * is replaced by the new one.
1430
 *
1431
 * If the IRI was using a default port (such as 80 for HTTP or 443 for HTTPS)
1432
 * that port is modified as well to match the default port of the new scheme.
1433
 * Otherwise the port is left untouched.
1434
 */
1435
wget_iri_scheme wget_iri_set_scheme(wget_iri *iri, wget_iri_scheme scheme)
1436
7.86k
{
1437
7.86k
  wget_iri_scheme old_scheme = iri->scheme;
1438
1439
7.86k
  if ((unsigned) scheme < countof(schemes) && iri->scheme != scheme) {
1440
7.85k
    iri->scheme = scheme;
1441
1442
    // If the IRI is using the default port, also change it
1443
7.85k
    if (iri->port == schemes[old_scheme].port)
1444
7.70k
      iri->port = schemes[scheme].port;
1445
1446
7.85k
    size_t old_scheme_len = strlen(schemes[old_scheme].name);
1447
1448
7.85k
    if (strncmp(iri->uri, schemes[old_scheme].name, old_scheme_len) == 0 && iri->uri[old_scheme_len] == ':') {
1449
7.85k
      char *new_uri = wget_aprintf("%s%s",  schemes[iri->scheme].name, iri->uri + old_scheme_len);
1450
7.85k
      if (iri->uri_allocated)
1451
0
        xfree(iri->uri);
1452
7.85k
      iri->uri = new_uri;
1453
7.85k
      iri->uri_allocated = true;
1454
7.85k
    }
1455
7.85k
  }
1456
1457
7.86k
  if (iri->userinfo) {
1458
102
    xfree(iri->safe_uri);
1459
102
    iri->safe_uri = create_safe_uri(iri);
1460
7.75k
  } else {
1461
7.75k
    iri->safe_uri = iri->uri;
1462
7.75k
  }
1463
7.86k
  return old_scheme;
1464
7.86k
}
1465
1466
// Build a newly allocated URI string from the IRI's scheme, host, path, query and
// fragment, i.e. without the userinfo. Returns NULL if iri or iri->uri is NULL,
// or if the buffer allocation fails.
static char *create_safe_uri(wget_iri *iri)
{
  if (!iri || !iri->uri)
    return NULL;

  wget_buffer *out = wget_buffer_alloc(strlen(iri->uri));
  if (!out)
    return NULL;

  wget_buffer_printf(out, "%s://%s", schemes[iri->scheme].name, iri->host);

  // optional components, each one introduced by its delimiter
  const struct {
    const char *delimiter;
    const char *value;
  } parts[] = {
    { "/", iri->path },
    { "?", iri->query },
    { "#", iri->fragment },
  };

  for (size_t i = 0; i < sizeof(parts) / sizeof(parts[0]); i++) {
    if (!parts[i].value)
      continue;

    wget_buffer_strcat(out, parts[i].delimiter);
    wget_buffer_strcat(out, parts[i].value);
  }

  // detach the string from the buffer before freeing the buffer, so it outlives it
  char *result = out->data;
  out->data = NULL;
  wget_buffer_free(&out);

  return result;
}
1496
1497
/** @} */