Coverage Report

Created: 2026-02-16 07:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/serenity/Userland/Libraries/LibURL/Parser.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
3
 * Copyright (c) 2023-2024, Shannon Booth <shannon@serenityos.org>
4
 *
5
 * SPDX-License-Identifier: BSD-2-Clause
6
 */
7
8
#include <AK/ByteString.h>
9
#include <AK/CharacterTypes.h>
10
#include <AK/Debug.h>
11
#include <AK/IntegralMath.h>
12
#include <AK/Optional.h>
13
#include <AK/SourceLocation.h>
14
#include <AK/StringBuilder.h>
15
#include <AK/StringUtils.h>
16
#include <AK/Utf8View.h>
17
#include <LibTextCodec/Decoder.h>
18
#include <LibTextCodec/Encoder.h>
19
#include <LibURL/Parser.h>
20
#include <LibUnicode/IDNA.h>
21
22
namespace URL {
23
24
// NOTE: This is similar to the LibC macro EOF = -1.
25
constexpr u32 end_of_file = 0xFFFFFFFF;
26
27
// https://url.spec.whatwg.org/#forbidden-host-code-point
28
static bool is_forbidden_host_code_point(u32 code_point)
29
18.2M
{
30
    // A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR, U+0020 SPACE,
31
    // U+0023 (#), U+002F (/), U+003A (:), U+003C (<), U+003E (>), U+003F (?), U+0040 (@), U+005B ([),
32
    // U+005C (\), U+005D (]), U+005E (^), or U+007C (|).
33
18.2M
    return "\0\t\n\r #/:<>?@[\\]^|"sv.contains(code_point);
34
18.2M
}
35
36
// https://url.spec.whatwg.org/#forbidden-domain-code-point
37
static bool is_forbidden_domain_code_point(u32 code_point)
38
18.0M
{
39
    // A forbidden domain code point is a forbidden host code point, a C0 control, U+0025 (%), or U+007F DELETE.
40
18.0M
    return is_forbidden_host_code_point(code_point) || is_ascii_c0_control(code_point) || code_point == '%' || code_point == 0x7F;
41
18.0M
}
42
43
// https://url.spec.whatwg.org/#url-code-points
44
static bool is_url_code_point(u32 code_point)
45
22.8M
{
46
    // The URL code points are ASCII alphanumeric, U+0021 (!), U+0024 ($), U+0026 (&),
47
    // U+0027 ('), U+0028 LEFT PARENTHESIS, U+0029 RIGHT PARENTHESIS, U+002A (*),
48
    // U+002B (+), U+002C (,), U+002D (-), U+002E (.), U+002F (/), U+003A (:),
49
    // U+003B (;), U+003D (=), U+003F (?), U+0040 (@), U+005F (_), U+007E (~), and code
50
    // points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and
51
    // noncharacters.
52
22.8M
    return is_ascii_alphanumeric(code_point) || "!$&'()*+,-./:;=?@_~"sv.contains(code_point)
53
15.3M
        || (code_point >= 0x00A0 && code_point <= 0x10FFFD && !is_unicode_surrogate(code_point) && !is_unicode_noncharacter(code_point));
54
22.8M
}
55
56
static void report_validation_error(SourceLocation const& location = SourceLocation::current())
57
416k
{
58
416k
    dbgln_if(URL_PARSER_DEBUG, "URL::Parser::basic_parse: Validation error! {}", location);
59
416k
}
60
61
// https://url.spec.whatwg.org/#concept-opaque-host-parser
62
static Optional<Host> parse_opaque_host(StringView input)
63
549
{
64
    // 1. If input contains a forbidden host code point, host-invalid-code-point validation error, return failure.
65
141k
    for (auto code_point : Utf8View { input }) {
66
141k
        if (is_forbidden_host_code_point(code_point)) {
67
12
            report_validation_error();
68
12
            return {};
69
12
        }
70
141k
    }
71
72
    // 2. If input contains a code point that is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
73
    // 3. If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, invalid-URL-unit validation error.
74
    // NOTE: These steps are not implemented because they are not cheap checks and exist just to report validation errors. With how we
75
    //       currently report validation errors, they are only useful for debugging efforts in the URL parsing code.
76
77
    // 4. Return the result of running UTF-8 percent-encode on input using the C0 control percent-encode set.
78
537
    return percent_encode(input, PercentEncodeSet::C0Control);
79
549
}
80
81
struct ParsedIPv4Number {
82
    u32 number { 0 };
83
    bool validation_error { false };
84
};
85
86
// https://url.spec.whatwg.org/#ipv4-number-parser
87
static Optional<ParsedIPv4Number> parse_ipv4_number(StringView input)
88
488
{
89
    // 1. If input is the empty string, then return failure.
90
488
    if (input.is_empty())
91
5
        return {};
92
93
    // 2. Let validationError be false.
94
483
    bool validation_error = false;
95
96
    // 3. Let R be 10.
97
483
    u8 radix = 10;
98
99
    // 4. If input contains at least two code points and the first two code points are either "0X" or "0x", then:
100
483
    if (input.length() >= 2 && (input.starts_with("0X"sv) || input.starts_with("0x"sv))) {
101
        // 1. Set validationError to true.
102
163
        validation_error = true;
103
104
        // 2. Remove the first two code points from input.
105
163
        input = input.substring_view(2);
106
107
        // 3. Set R to 16.
108
163
        radix = 16;
109
163
    }
110
    // 5. Otherwise, if input contains at least two code points and the first code point is U+0030 (0), then:
111
320
    else if (input.length() >= 2 && input[0] == '0') {
112
        // 1. Set validationError to true.
113
92
        validation_error = true;
114
115
        // 2. Remove the first code point from input.
116
92
        input = input.substring_view(1);
117
118
        // 3. Set R to 8.
119
92
        radix = 8;
120
92
    }
121
122
    // 6. If input is the empty string, then return (0, true).
123
483
    if (input.is_empty())
124
12
        return ParsedIPv4Number { 0, true };
125
126
    // 7. If input contains a code point that is not a radix-R digit, then return failure.
127
471
    if (radix == 8) {
128
10.6k
        if (!all_of(input, [](auto character) { return is_ascii_octal_digit(character); }))
129
9
            return {};
130
379
    } else if (radix == 10) {
131
26.4k
        if (!all_of(input, [](auto character) { return is_ascii_digit(character); }))
132
24
            return {};
133
228
    } else if (radix == 16) {
134
2.24M
        if (!all_of(input, [](auto character) { return is_ascii_hex_digit(character); }))
135
5
            return {};
136
151
    } else {
137
0
        VERIFY_NOT_REACHED();
138
0
    }
139
140
    // 8. Let output be the mathematical integer value that is represented by input in radix-R notation, using ASCII hex digits for digits with values 0 through 15.
141
433
    Optional<u32> maybe_output;
142
433
    if (radix == 8)
143
83
        maybe_output = AK::StringUtils::convert_to_uint_from_octal(input);
144
350
    else if (radix == 10)
145
204
        maybe_output = input.to_number<u32>();
146
146
    else if (radix == 16)
147
146
        maybe_output = AK::StringUtils::convert_to_uint_from_hex(input);
148
0
    else
149
0
        VERIFY_NOT_REACHED();
150
151
    // NOTE: Parsing may have failed due to overflow.
152
433
    if (!maybe_output.has_value())
153
80
        return {};
154
155
    // 9. Return (output, validationError).
156
353
    return ParsedIPv4Number { maybe_output.value(), validation_error };
157
433
}
158
159
// https://url.spec.whatwg.org/#concept-ipv4-parser
160
static Optional<IPv4Address> parse_ipv4_address(StringView input)
161
470
{
162
    // 1. Let parts be the result of strictly splitting input on U+002E (.).
163
470
    auto parts = input.split_view("."sv, SplitBehavior::KeepEmpty);
164
165
    // 2. If the last item in parts is the empty string, then:
166
470
    if (parts.last().is_empty()) {
167
        // 1. IPv4-empty-part validation error.
168
27
        report_validation_error();
169
170
        // 2. If parts’s size is greater than 1, then remove the last item from parts.
171
27
        if (parts.size() > 1)
172
27
            parts.take_last();
173
27
    }
174
175
    // 3. If parts’s size is greater than 4, IPv4-too-many-parts validation error, return failure.
176
470
    if (parts.size() > 4) {
177
62
        report_validation_error();
178
62
        return {};
179
62
    }
180
181
    // 4. Let numbers be an empty list.
182
408
    Vector<u32, 4> numbers;
183
184
    // 5. For each part of parts:
185
488
    for (auto const& part : parts) {
186
        // 1. Let result be the result of parsing part.
187
488
        auto const result = parse_ipv4_number(part);
188
189
        // 2. If result is failure, IPv4-non-numeric-part validation error, return failure.
190
488
        if (!result.has_value()) {
191
123
            report_validation_error();
192
123
            return {};
193
123
        }
194
195
        // 3. If result[1] is true, IPv4-non-decimal-part validation error.
196
365
        if (result->validation_error)
197
182
            report_validation_error();
198
199
        // 4. Append result[0] to numbers.
200
365
        numbers.append(result->number);
201
365
    }
202
203
    // 6. If any item in numbers is greater than 255, IPv4-out-of-range-part validation error.
204
    // 7. If any but the last item in numbers is greater than 255, then return failure.
205
629
    for (size_t i = 0; i < numbers.size(); ++i) {
206
349
        if (numbers[i] > 255) {
207
158
            report_validation_error();
208
158
            if (i != numbers.size() - 1)
209
5
                return {};
210
158
        }
211
349
    }
212
213
    // 8. If the last item in numbers is greater than or equal to 256^(5 − numbers’s size), then return failure.
214
280
    if (numbers.last() >= AK::pow<size_t>(256, 5 - numbers.size()))
215
1
        return {};
216
217
    // 9. Let ipv4 be the last item in numbers.
218
279
    auto ipv4 = numbers.last();
219
220
    // 10. Remove the last item from numbers.
221
279
    numbers.take_last();
222
223
    // 11. Let counter be 0.
224
279
    u8 counter = 0;
225
226
    // 12. For each n of numbers:
227
279
    for (u32 n : numbers) {
228
        // 1. Increment ipv4 by n × 256^(3 − counter).
229
61
        ipv4 += n * AK::pow<size_t>(256, 3 - counter);
230
231
        // 2. Increment counter by 1.
232
61
        ++counter;
233
61
    }
234
235
    // 13. Return ipv4.
236
279
    return ipv4;
237
280
}
238
239
// https://url.spec.whatwg.org/#concept-ipv4-serializer
240
static ErrorOr<String> serialize_ipv4_address(IPv4Address address)
241
0
{
242
    // 1. Let output be the empty string.
243
    // NOTE: Array to avoid prepend.
244
0
    Array<u8, 4> output;
245
246
    // 2. Let n be the value of address.
247
0
    u32 n = address;
248
249
    // 3. For each i in the range 1 to 4, inclusive:
250
0
    for (size_t i = 0; i <= 3; ++i) {
251
        // 1. Prepend n % 256, serialized, to output.
252
0
        output[3 - i] = n % 256;
253
254
        // 2. If i is not 4, then prepend U+002E (.) to output.
255
        // NOTE: done at end
256
257
        // 3. Set n to floor(n / 256).
258
0
        n /= 256;
259
0
    }
260
261
    // 4. Return output.
262
0
    return String::formatted("{}.{}.{}.{}", output[0], output[1], output[2], output[3]);
263
0
}
264
265
// https://url.spec.whatwg.org/#concept-ipv6-serializer
266
static void serialize_ipv6_address(IPv6Address const& address, StringBuilder& output)
267
0
{
268
    // 1. Let output be the empty string.
269
270
    // 2. Let compress be an index to the first IPv6 piece in the first longest sequences of address’s IPv6 pieces that are 0.
271
0
    Optional<size_t> compress;
272
0
    size_t longest_sequence_length = 0;
273
0
    size_t current_sequence_length = 0;
274
0
    size_t current_sequence_start = 0;
275
0
    for (size_t i = 0; i < 8; ++i) {
276
0
        if (address[i] == 0) {
277
0
            if (current_sequence_length == 0)
278
0
                current_sequence_start = i;
279
0
            ++current_sequence_length;
280
0
        } else {
281
0
            if (current_sequence_length > longest_sequence_length) {
282
0
                longest_sequence_length = current_sequence_length;
283
0
                compress = current_sequence_start;
284
0
            }
285
0
            current_sequence_length = 0;
286
0
        }
287
0
    }
288
289
0
    if (current_sequence_length > longest_sequence_length) {
290
0
        longest_sequence_length = current_sequence_length;
291
0
        compress = current_sequence_start;
292
0
    }
293
294
    // 3. If there is no sequence of address’s IPv6 pieces that are 0 that is longer than 1, then set compress to null.
295
0
    if (longest_sequence_length <= 1)
296
0
        compress = {};
297
298
    // 4. Let ignore0 be false.
299
0
    auto ignore0 = false;
300
301
    // 5. For each pieceIndex in the range 0 to 7, inclusive:
302
0
    for (size_t piece_index = 0; piece_index <= 7; ++piece_index) {
303
        // 1. If ignore0 is true and address[pieceIndex] is 0, then continue.
304
0
        if (ignore0 && address[piece_index] == 0)
305
0
            continue;
306
307
        // 2. Otherwise, if ignore0 is true, set ignore0 to false.
308
0
        if (ignore0)
309
0
            ignore0 = false;
310
311
        // 3. If compress is pieceIndex, then:
312
0
        if (compress == piece_index) {
313
            // 1. Let separator be "::" if pieceIndex is 0, and U+003A (:) otherwise.
314
0
            auto separator = piece_index == 0 ? "::"sv : ":"sv;
315
316
            // 2. Append separator to output.
317
0
            output.append(separator);
318
319
            // 3. Set ignore0 to true and continue.
320
0
            ignore0 = true;
321
0
            continue;
322
0
        }
323
324
        // 4. Append address[pieceIndex], represented as the shortest possible lowercase hexadecimal number, to output.
325
0
        output.appendff("{:x}", address[piece_index]);
326
327
        // 5. If pieceIndex is not 7, then append U+003A (:) to output.
328
0
        if (piece_index != 7)
329
0
            output.append(':');
330
0
    }
331
332
    // 6. Return output.
333
0
}
334
335
// https://url.spec.whatwg.org/#concept-ipv6-parser
336
static Optional<IPv6Address> parse_ipv6_address(StringView input)
337
547
{
338
    // 1. Let address be a new IPv6 address whose IPv6 pieces are all 0.
339
547
    Array<u16, 8> address {};
340
341
    // 2. Let pieceIndex be 0.
342
547
    size_t piece_index = 0;
343
344
    // 3. Let compress be null.
345
547
    Optional<size_t> compress;
346
347
547
    Vector<u32> code_points;
348
547
    code_points.ensure_capacity(input.length());
349
14.5M
    for (auto code_point : Utf8View { input }) {
350
14.5M
        code_points.append(code_point);
351
14.5M
    }
352
353
    // 4. Let pointer be a pointer for input.
354
547
    size_t pointer = 0;
355
13.0k
    auto c = [&]() -> u32 {
356
13.0k
        if (pointer >= code_points.size())
357
490
            return end_of_file;
358
12.5k
        return code_points[pointer];
359
13.0k
    };
360
361
547
    auto remaining = [&]() -> ReadonlySpan<u32> {
362
77
        if ((pointer + 1) >= code_points.size())
363
1
            return {};
364
76
        return code_points.span().slice(pointer + 1);
365
77
    };
366
367
    // 5. If c is U+003A (:), then:
368
547
    if (c() == ':') {
369
        // 1. If remaining does not start with U+003A (:), IPv6-invalid-compression validation error, return failure.
370
39
        if (remaining().is_empty() || remaining()[0] != ':') {
371
7
            report_validation_error();
372
7
            return {};
373
7
        }
374
375
        // 2. Increase pointer by 2.
376
32
        pointer += 2;
377
378
        // 3. Increase pieceIndex by 1 and then set compress to pieceIndex.
379
32
        ++piece_index;
380
32
        compress = piece_index;
381
32
    }
382
383
    // 6. While c is not the EOF code point:
384
1.23k
    while (c() != end_of_file) {
385
        // 1. If pieceIndex is 8, IPv6-too-many-pieces validation error, return failure.
386
1.13k
        if (piece_index == 8) {
387
17
            report_validation_error();
388
17
            return {};
389
17
        }
390
391
        // 2. If c is U+003A (:), then:
392
1.12k
        if (c() == ':') {
393
            // 1. If compress is non-null, IPv6-multiple-compression validation error, return failure.
394
15
            if (compress.has_value()) {
395
1
                report_validation_error();
396
1
                return {};
397
1
            }
398
399
            // 2. Increase pointer and pieceIndex by 1, set compress to pieceIndex, and then continue.
400
14
            ++pointer;
401
14
            ++piece_index;
402
14
            compress = piece_index;
403
14
            continue;
404
15
        }
405
406
        // 3. Let value and length be 0.
407
1.10k
        u32 value = 0;
408
1.10k
        size_t length = 0;
409
410
        // 4. While length is less than 4 and c is an ASCII hex digit,
411
        //    set value to value × 0x10 + c interpreted as hexadecimal number,
412
        //    and increase pointer and length by 1.
413
3.57k
        while (length < 4 && is_ascii_hex_digit(c())) {
414
2.46k
            value = value * 0x10 + parse_ascii_hex_digit(c());
415
2.46k
            ++pointer;
416
2.46k
            ++length;
417
2.46k
        }
418
419
        // 5. If c is U+002E (.), then:
420
1.10k
        if (c() == '.') {
421
            // 1. If length is 0, IPv4-in-IPv6-invalid-code-point validation error, return failure.
422
135
            if (length == 0) {
423
1
                report_validation_error();
424
1
                return {};
425
1
            }
426
427
            // 2. Decrease pointer by length.
428
134
            pointer -= length;
429
430
            // 3. If pieceIndex is greater than 6, IPv4-in-IPv6-too-many-pieces validation error, return failure.
431
134
            if (piece_index > 6) {
432
1
                report_validation_error();
433
1
                return {};
434
1
            }
435
436
            // 4. Let numbersSeen be 0.
437
133
            size_t numbers_seen = 0;
438
439
            // 5. While c is not the EOF code point:
440
300
            while (c() != end_of_file) {
441
                // 1. Let ipv4Piece be null.
442
296
                Optional<u32> ipv4_piece;
443
444
                // 2. If numbersSeen is greater than 0, then:
445
296
                if (numbers_seen > 0) {
446
                    // 1. If c is a U+002E (.) and numbersSeen is less than 4, then increase pointer by 1.
447
163
                    if (c() == '.' && numbers_seen < 4) {
448
123
                        ++pointer;
449
123
                    }
450
                    // 2. Otherwise, IPv4-in-IPv6-invalid-code-point validation error, return failure.
451
40
                    else {
452
40
                        report_validation_error();
453
40
                        return {};
454
40
                    }
455
163
                }
456
457
                // 3. If c is not an ASCII digit, IPv4-in-IPv6-invalid-code-point validation error, return failure.
458
256
                if (!is_ascii_digit(c())) {
459
73
                    report_validation_error();
460
73
                    return {};
461
73
                }
462
463
                // 4. While c is an ASCII digit:
464
419
                while (is_ascii_digit(c())) {
465
                    // 1. Let number be c interpreted as decimal number.
466
252
                    u32 number = parse_ascii_digit(c());
467
468
                    // 2. If ipv4Piece is null, then set ipv4Piece to number.
469
252
                    if (!ipv4_piece.has_value()) {
470
183
                        ipv4_piece = number;
471
183
                    }
472
                    // Otherwise, if ipv4Piece is 0, IPv4-in-IPv6-invalid-code-point validation error, return failure.
473
69
                    else if (ipv4_piece.value() == 0) {
474
1
                        report_validation_error();
475
1
                        return {};
476
1
                    }
477
                    // Otherwise, set ipv4Piece to ipv4Piece × 10 + number.
478
68
                    else {
479
68
                        ipv4_piece = ipv4_piece.value() * 10 + number;
480
68
                    }
481
482
                    // 3. If ipv4Piece is greater than 255, IPv4-in-IPv6-out-of-range-part validation error, return failure.
483
251
                    if (ipv4_piece.value() > 255) {
484
15
                        report_validation_error();
485
15
                        return {};
486
15
                    }
487
488
                    // 4. Increase pointer by 1.
489
236
                    ++pointer;
490
236
                }
491
                // 5. Set address[pieceIndex] to address[pieceIndex] × 0x100 + ipv4Piece.
492
167
                address[piece_index] = address[piece_index] * 0x100 + ipv4_piece.value();
493
494
                // 6. Increase numbersSeen by 1.
495
167
                ++numbers_seen;
496
497
                // 7. If numbersSeen is 2 or 4, then increase pieceIndex by 1.
498
167
                if (numbers_seen == 2 || numbers_seen == 4)
499
49
                    ++piece_index;
500
167
            }
501
502
            // 6. If numbersSeen is not 4, IPv4-in-IPv6-too-few-parts validation error, return failure.
503
4
            if (numbers_seen != 4) {
504
3
                report_validation_error();
505
3
                return {};
506
3
            }
507
508
            // 7. Break.
509
1
            break;
510
4
        }
511
        // 6. Otherwise, if c is U+003A (:):
512
971
        else if (c() == ':') {
513
            // 1. Increase pointer by 1.
514
609
            ++pointer;
515
516
            // 2. If c is the EOF code point, IPv6-invalid-code-point validation error, return failure.
517
609
            if (c() == end_of_file) {
518
16
                report_validation_error();
519
16
                return {};
520
16
            }
521
609
        }
522
523
        // 7. Otherwise, if c is not the EOF code point, IPv6-invalid-code-point validation error, return failure.
524
362
        else if (c() != end_of_file) {
525
270
            report_validation_error();
526
270
            return {};
527
270
        }
528
529
        // 8. Set address[pieceIndex] to value.
530
685
        address[piece_index] = value;
531
532
        // 9. Increase pieceIndex by 1.
533
685
        ++piece_index;
534
685
    }
535
536
    // 7. If compress is non-null, then:
537
102
    if (compress.has_value()) {
538
        // 1. Let swaps be pieceIndex − compress.
539
20
        size_t swaps = piece_index - compress.value();
540
541
        // 2. Set pieceIndex to 7.
542
20
        piece_index = 7;
543
544
        // 3. While pieceIndex is not 0 and swaps is greater than 0,
545
        //    swap address[pieceIndex] with address[compress + swaps − 1],
546
        //    and then decrease both pieceIndex and swaps by 1.
547
64
        while (piece_index != 0 && swaps > 0) {
548
44
            swap(address[piece_index], address[compress.value() + swaps - 1]);
549
44
            --piece_index;
550
44
            --swaps;
551
44
        }
552
20
    }
553
554
    // 8. Otherwise, if compress is null and pieceIndex is not 8, IPv6-too-few-pieces validation error, return failure.
555
82
    else if (!compress.has_value() && piece_index != 8) {
556
70
        report_validation_error();
557
70
        return {};
558
70
    }
559
560
    // 9. Return address.
561
32
    return address;
562
102
}
563
564
// https://url.spec.whatwg.org/#ends-in-a-number-checker
565
static bool ends_in_a_number_checker(StringView input)
566
858
{
567
    // 1. Let parts be the result of strictly splitting input on U+002E (.).
568
858
    auto parts = input.split_view("."sv, SplitBehavior::KeepEmpty);
569
570
    // 2. If the last item in parts is the empty string, then:
571
858
    if (parts.last().is_empty()) {
572
        // 1. If parts’s size is 1, then return false.
573
97
        if (parts.size() == 1)
574
0
            return false;
575
576
        // 2. Remove the last item from parts.
577
97
        parts.take_last();
578
97
    }
579
580
    // 3. Let last be the last item in parts.
581
858
    auto last = parts.last();
582
583
    // 4. If last is non-empty and contains only ASCII digits, then return true.
584
858
    if (!last.is_empty() && all_of(last, is_ascii_digit))
585
330
        return true;
586
587
    // 5. If parsing last as an IPv4 number does not return failure, then return true.
588
    // NOTE: This is equivalent to checking that last is "0X" or "0x", followed by zero or more ASCII hex digits.
589
528
    if (last.starts_with("0x"sv, CaseSensitivity::CaseInsensitive) && all_of(last.substring_view(2), is_ascii_hex_digit))
590
140
        return true;
591
592
    // 6. Return false.
593
388
    return false;
594
528
}
595
596
// https://url.spec.whatwg.org/#concept-domain-to-ascii
597
static ErrorOr<String> domain_to_ascii(StringView domain, bool be_strict)
598
1.44k
{
599
    // 1. Let result be the result of running Unicode ToASCII with domain_name set to domain, UseSTD3ASCIIRules set to beStrict, CheckHyphens set to false, CheckBidi set to true, CheckJoiners set to true, Transitional_Processing set to false, and VerifyDnsLength set to beStrict. [UTS46]
600
    // 2. If result is a failure value, domain-to-ASCII validation error, return failure.
601
602
    // OPTIMIZATION: If beStrict is false, domain is an ASCII string, and strictly splitting domain on U+002E (.)
603
    //               does not produce any item that starts with an ASCII case-insensitive match for "xn--", this
604
    //               step is equivalent to ASCII lowercasing domain.
605
1.44k
    if (!be_strict && all_of(domain, is_ascii)) {
606
        // 3. If result is the empty string, domain-to-ASCII validation error, return failure.
607
1.05k
        if (domain.is_empty())
608
0
            return Error::from_string_literal("Empty domain");
609
610
1.05k
        bool slow_path = false;
611
161k
        for (auto part : domain.split_view('.')) {
612
161k
            if (part.starts_with("xn--"sv, CaseSensitivity::CaseInsensitive)) {
613
5
                slow_path = true;
614
5
                break;
615
5
            }
616
161k
        }
617
618
1.05k
        if (!slow_path) {
619
1.05k
            auto lowercase_domain = domain.to_lowercase_string();
620
1.05k
            return String::from_utf8_without_validation(lowercase_domain.bytes());
621
1.05k
        }
622
1.05k
    }
623
624
392
    Unicode::IDNA::ToAsciiOptions const options {
625
392
        Unicode::IDNA::CheckHyphens::No,
626
392
        Unicode::IDNA::CheckBidi::Yes,
627
392
        Unicode::IDNA::CheckJoiners::Yes,
628
392
        be_strict ? Unicode::IDNA::UseStd3AsciiRules::Yes : Unicode::IDNA::UseStd3AsciiRules::No,
629
392
        Unicode::IDNA::TransitionalProcessing::No,
630
392
        be_strict ? Unicode::IDNA::VerifyDnsLength::Yes : Unicode::IDNA::VerifyDnsLength::No
631
392
    };
632
392
    auto result = TRY(Unicode::IDNA::to_ascii(Utf8View(domain), options));
633
634
    // 3. If result is the empty string, domain-to-ASCII validation error, return failure.
635
0
    if (result.is_empty())
636
0
        return Error::from_string_literal("Empty domain");
637
638
    // 4. Return result.
639
0
    return result;
640
0
}
641
642
// https://url.spec.whatwg.org/#concept-host-parser
643
static Optional<Host> parse_host(StringView input, bool is_opaque = false)
644
2.56k
{
645
    // 1. If input starts with U+005B ([), then:
646
2.56k
    if (input.starts_with('[')) {
647
        // 1. If input does not end with U+005D (]), IPv6-unclosed validation error, return failure.
648
572
        if (!input.ends_with(']')) {
649
25
            report_validation_error();
650
25
            return {};
651
25
        }
652
653
        // 2. Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
654
547
        auto address = parse_ipv6_address(input.substring_view(1, input.length() - 2));
655
547
        if (!address.has_value())
656
515
            return {};
657
32
        return address.release_value();
658
547
    }
659
660
    // 2. If isOpaque is true, then return the result of opaque-host parsing input.
661
1.99k
    if (is_opaque)
662
549
        return parse_opaque_host(input);
663
664
    // 3. Assert: input is not the empty string.
665
1.44k
    VERIFY(!input.is_empty());
666
667
    // FIXME: 4. Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
668
1.44k
    auto domain = percent_decode(input);
669
670
    // 5. Let asciiDomain be the result of running domain to ASCII with domain and false.
671
1.44k
    auto ascii_domain_or_error = domain_to_ascii(domain, false);
672
673
    // 6. If asciiDomain is failure, then return failure.
674
1.44k
    if (ascii_domain_or_error.is_error())
675
392
        return {};
676
677
1.05k
    auto ascii_domain = ascii_domain_or_error.release_value();
678
679
    // 7. If asciiDomain contains a forbidden domain code point, domain-invalid-code-point validation error, return failure.
680
18.0M
    for (auto character : ascii_domain.bytes_as_string_view()) {
681
18.0M
        if (is_forbidden_domain_code_point(character)) {
682
195
            report_validation_error();
683
195
            return {};
684
195
        }
685
18.0M
    }
686
687
    // 8. If asciiDomain ends in a number, then return the result of IPv4 parsing asciiDomain.
688
858
    if (ends_in_a_number_checker(ascii_domain)) {
689
470
        auto ipv4_host = parse_ipv4_address(ascii_domain);
690
470
        if (!ipv4_host.has_value())
691
191
            return {};
692
693
279
        return ipv4_host.release_value();
694
470
    }
695
696
    // 9. Return asciiDomain.
697
388
    return ascii_domain;
698
858
}
699
700
// https://url.spec.whatwg.org/#concept-host-serializer
701
ErrorOr<String> Parser::serialize_host(Host const& host)
702
0
{
703
    // 1. If host is an IPv4 address, return the result of running the IPv4 serializer on host.
704
0
    if (host.has<IPv4Address>())
705
0
        return serialize_ipv4_address(host.get<IPv4Address>());
706
707
    // 2. Otherwise, if host is an IPv6 address, return U+005B ([), followed by the result of running the IPv6 serializer on host, followed by U+005D (]).
708
0
    if (host.has<IPv6Address>()) {
709
0
        StringBuilder output;
710
0
        TRY(output.try_append('['));
711
0
        serialize_ipv6_address(host.get<IPv6Address>(), output);
712
0
        TRY(output.try_append(']'));
713
0
        return output.to_string();
714
0
    }
715
716
    // 3. Otherwise, host is a domain, opaque host, or empty host, return host.
717
0
    if (host.has<String>())
718
0
        return host.get<String>();
719
0
    return String {};
720
0
}
721
722
// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
723
constexpr bool starts_with_windows_drive_letter(StringView input)
724
0
{
725
0
    if (input.length() < 2)
726
0
        return false;
727
0
    if (!is_ascii_alpha(input[0]) || !(input[1] == ':' || input[1] == '|'))
728
0
        return false;
729
0
    if (input.length() == 2)
730
0
        return true;
731
0
    return "/\\?#"sv.contains(input[2]);
732
0
}
733
734
constexpr bool is_windows_drive_letter(StringView input)
735
2.35k
{
736
2.35k
    return input.length() == 2 && is_ascii_alpha(input[0]) && (input[1] == ':' || input[1] == '|');
737
2.35k
}
738
739
constexpr bool is_normalized_windows_drive_letter(StringView input)
740
2.11k
{
741
2.11k
    return input.length() == 2 && is_ascii_alpha(input[0]) && input[1] == ':';
742
2.11k
}
743
744
constexpr bool is_single_dot_path_segment(StringView input)
745
19.7M
{
746
19.7M
    return input == "."sv || input.equals_ignoring_ascii_case("%2e"sv);
747
19.7M
}
748
749
constexpr bool is_double_dot_path_segment(StringView input)
750
9.86M
{
751
9.86M
    return input == ".."sv || input.equals_ignoring_ascii_case(".%2e"sv) || input.equals_ignoring_ascii_case("%2e."sv) || input.equals_ignoring_ascii_case("%2e%2e"sv);
752
9.86M
}
753
754
// https://url.spec.whatwg.org/#shorten-a-urls-path
755
void Parser::shorten_urls_path(URL& url)
756
4.15k
{
757
    // 1. Assert: url does not have an opaque path.
758
4.15k
    VERIFY(!url.cannot_be_a_base_url());
759
760
    // 2. Let path be url’s path.
761
4.15k
    auto& path = url.m_data->paths;
762
763
    // 3. If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return.
764
4.15k
    if (url.scheme() == "file" && path.size() == 1 && is_normalized_windows_drive_letter(path[0]))
765
387
        return;
766
767
    // 4. Remove path’s last item, if any.
768
3.76k
    if (!path.is_empty())
769
2.68k
        path.take_last();
770
3.76k
}
771
772
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
773
String Parser::percent_encode_after_encoding(TextCodec::Encoder& encoder, StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus)
774
1.23k
{
775
    // 1. Let encodeOutput be an empty I/O queue.
776
1.23k
    StringBuilder output;
777
778
    // 2. Set potentialError to the result of running encode or fail with inputQueue, encoder, and encodeOutput.
779
1.23k
    MUST(encoder.process(
780
1.23k
        Utf8View(input),
781
782
        // 3. For each byte of encodeOutput converted to a byte sequence:
783
1.23k
        [&](u8 byte) -> ErrorOr<void> {
784
            // 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
785
1.23k
            if (space_as_plus && byte == ' ') {
786
1.23k
                output.append('+');
787
1.23k
                return {};
788
1.23k
            }
789
790
            // 2. Let isomorph be a code point whose value is byte’s value.
791
1.23k
            u32 isomorph = byte;
792
793
            // 3. Assert: percentEncodeSet includes all non-ASCII code points.
794
795
            // 4. If isomorphic is not in percentEncodeSet, then append isomorph to output.
796
1.23k
            if (!code_point_is_in_percent_encode_set(isomorph, percent_encode_set)) {
797
1.23k
                output.append_code_point(isomorph);
798
1.23k
            }
799
800
            // 5. Otherwise, percent-encode byte and append the result to output.
801
1.23k
            else {
802
1.23k
                output.appendff("%{:02X}", byte);
803
1.23k
            }
804
805
1.23k
            return {};
806
1.23k
        },
807
808
        // 4. If potentialError is non-null, then append "%26%23", followed by the shortest sequence of ASCII digits
809
        //    representing potentialError in base ten, followed by "%3B", to output.
810
1.23k
        [&](u32 error) -> ErrorOr<void> {
811
1.23k
            output.appendff("%26%23{}%3B", error);
812
1.23k
            return {};
813
1.23k
        }));
814
815
    // 6. Return output.
816
1.23k
    return MUST(output.to_string());
817
1.23k
}
818
819
// https://url.spec.whatwg.org/#concept-basic-url-parser
820
URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, URL* url, Optional<State> state_override, Optional<StringView> encoding)
821
6.22k
{
822
6.22k
    dbgln_if(URL_PARSER_DEBUG, "URL::Parser::basic_parse: Parsing '{}'", raw_input);
823
824
6.22k
    size_t start_index = 0;
825
6.22k
    size_t end_index = raw_input.length();
826
827
    // 1. If url is not given:
828
6.22k
    auto url_buffer = URL();
829
6.22k
    if (!url) {
830
        // 1. Set url to a new URL.
831
6.22k
        url = &url_buffer;
832
833
        // 2. If input contains any leading or trailing C0 control or space, invalid-URL-unit validation error.
834
        // 3. Remove any leading and trailing C0 control or space from input.
835
6.22k
        bool has_validation_error = false;
836
837
1.05M
        for (; start_index < raw_input.length(); ++start_index) {
838
1.05M
            if (!is_ascii_c0_control_or_space(raw_input[start_index]))
839
6.21k
                break;
840
1.04M
            has_validation_error = true;
841
1.04M
        }
842
843
3.15M
        for (; end_index > start_index; --end_index) {
844
3.15M
            if (!is_ascii_c0_control_or_space(raw_input[end_index - 1]))
845
6.21k
                break;
846
3.14M
            has_validation_error = true;
847
3.14M
        }
848
849
6.22k
        if (has_validation_error)
850
72
            report_validation_error();
851
6.22k
    }
852
853
6.22k
    ByteString processed_input = raw_input.substring_view(start_index, end_index - start_index);
854
855
    // 2. If input contains any ASCII tab or newline, invalid-URL-unit validation error.
856
    // 3. Remove all ASCII tab or newline from input.
857
92.5M
    for (auto const ch : processed_input) {
858
92.5M
        if (ch == '\t' || ch == '\n' || ch == '\r') {
859
369
            report_validation_error();
860
369
            processed_input = processed_input.replace("\t"sv, ""sv, ReplaceMode::All).replace("\n"sv, ""sv, ReplaceMode::All).replace("\r"sv, ""sv, ReplaceMode::All);
861
369
            break;
862
369
        }
863
92.5M
    }
864
865
    // 4. Let state be state override if given, or scheme start state otherwise.
866
6.22k
    State state = state_override.value_or(State::SchemeStart);
867
868
    // 5. Set encoding to the result of getting an output encoding from encoding.
869
6.22k
    Optional<TextCodec::Encoder&> encoder = {};
870
6.22k
    if (encoding.has_value())
871
0
        encoder = TextCodec::encoder_for(TextCodec::get_output_encoding(*encoding));
872
6.22k
    if (!encoder.has_value())
873
6.22k
        encoder = TextCodec::encoder_for("utf-8"sv);
874
6.22k
    VERIFY(encoder.has_value());
875
876
    // 6. Let buffer be the empty string.
877
6.22k
    StringBuilder buffer;
878
879
    // 7. Let atSignSeen, insideBrackets, and passwordTokenSeen be false.
880
6.22k
    bool at_sign_seen = false;
881
6.22k
    bool inside_brackets = false;
882
6.22k
    bool password_token_seen = false;
883
884
6.22k
    Utf8View input(processed_input);
885
886
    // 8. Let pointer be a pointer for input.
887
6.22k
    Utf8CodePointIterator iterator = input.begin();
888
889
6.22k
    auto get_remaining = [&input, &iterator] {
890
3.73k
        return input.substring_view(iterator - input.begin() + iterator.underlying_code_point_length_in_bytes()).as_string();
891
3.73k
    };
892
893
189k
    auto remaining_starts_with_two_ascii_hex_digits = [&]() {
894
189k
        return is_ascii_hex_digit(iterator.peek(1).value_or(end_of_file)) && is_ascii_hex_digit(iterator.peek(2).value_or(end_of_file));
895
189k
    };
896
897
    // 9. Keep running the following state machine by switching on state. If after a run pointer points to the EOF code point, go to the next step. Otherwise, increase pointer by 1 and continue with the state machine.
898
    // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
899
    //       ++iterator : "increase pointer by 1"
900
    //       continue   : "decrease pointer by 1"
901
136M
    for (;;) {
902
136M
        u32 code_point = end_of_file;
903
136M
        if (!iterator.done())
904
136M
            code_point = *iterator;
905
906
        if constexpr (URL_PARSER_DEBUG) {
907
            if (code_point == end_of_file)
908
                dbgln("URL::Parser::basic_parse: {} state with EOF.", state_name(state));
909
            else if (is_ascii_printable(code_point))
910
                dbgln("URL::Parser::basic_parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point);
911
            else
912
                dbgln("URL::Parser::basic_parse: {} state with code point U+{:04X}.", state_name(state), code_point);
913
        }
914
915
136M
        switch (state) {
916
        // -> scheme start state, https://url.spec.whatwg.org/#scheme-start-state
917
6.22k
        case State::SchemeStart:
918
            // 1. If c is an ASCII alpha, append c, lowercased, to buffer, and set state to scheme state.
919
6.22k
            if (is_ascii_alpha(code_point)) {
920
6.10k
                buffer.append_as_lowercase(code_point);
921
6.10k
                state = State::Scheme;
922
6.10k
            }
923
            // 2. Otherwise, if state override is not given, set state to no scheme state and decrease pointer by 1.
924
124
            else if (!state_override.has_value()) {
925
124
                state = State::NoScheme;
926
124
                continue;
927
124
            }
928
            // 3. Otherwise, return failure.
929
0
            else {
930
0
                return {};
931
0
            }
932
6.10k
            break;
933
        // -> scheme state, https://url.spec.whatwg.org/#scheme-state
934
25.1M
        case State::Scheme:
935
            // 1. If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.), append c, lowercased, to buffer.
936
25.1M
            if (is_ascii_alphanumeric(code_point) || code_point == '+' || code_point == '-' || code_point == '.') {
937
25.1M
                buffer.append_as_lowercase(code_point);
938
25.1M
            }
939
            // 2. Otherwise, if c is U+003A (:), then:
940
6.10k
            else if (code_point == ':') {
941
                // 1. If state override is given, then:
942
5.92k
                if (state_override.has_value()) {
943
                    // 1. If url’s scheme is a special scheme and buffer is not a special scheme, then return.
944
0
                    if (is_special_scheme(url->scheme()) && !is_special_scheme(buffer.string_view()))
945
0
                        return *url;
946
947
                    // 2. If url’s scheme is not a special scheme and buffer is a special scheme, then return.
948
0
                    if (!is_special_scheme(url->scheme()) && is_special_scheme(buffer.string_view()))
949
0
                        return *url;
950
951
                    // 3. If url includes credentials or has a non-null port, and buffer is "file", then return.
952
0
                    if ((url->includes_credentials() || url->port().has_value()) && buffer.string_view() == "file"sv)
953
0
                        return *url;
954
955
                    // 4. If url’s scheme is "file" and its host is an empty host, then return.
956
0
                    if (url->scheme() == "file"sv && url->host() == String {})
957
0
                        return *url;
958
0
                }
959
960
                // 2. Set url’s scheme to buffer.
961
5.92k
                url->m_data->scheme = buffer.to_string_without_validation();
962
963
                // 3. If state override is given, then:
964
5.92k
                if (state_override.has_value()) {
965
                    // 1. If url’s port is url’s scheme’s default port, then set url’s port to null.
966
0
                    if (url->port() == default_port_for_scheme(url->scheme()))
967
0
                        url->m_data->port = {};
968
969
                    // 2. Return.
970
0
                    return *url;
971
0
                }
972
973
                // 4. Set buffer to the empty string.
974
5.92k
                buffer.clear();
975
976
                // 5. If url’s scheme is "file", then:
977
5.92k
                if (url->scheme() == "file") {
978
                    // 1. If remaining does not start with "//", special-scheme-missing-following-solidus validation error.
979
604
                    if (!get_remaining().starts_with("//"sv)) {
980
329
                        report_validation_error();
981
329
                    }
982
                    // 2. Set state to file state.
983
604
                    state = State::File;
984
604
                }
985
                // 6. Otherwise, if url is special, base is non-null, and base’s scheme is url’s scheme:
986
5.32k
                else if (url->is_special() && base_url.has_value() && base_url->scheme() == url->m_data->scheme) {
987
                    // 1. Assert: base is is special (and therefore does not have an opaque path).
988
0
                    VERIFY(base_url->is_special());
989
990
                    // 2. Set state to special relative or authority state.
991
0
                    state = State::SpecialRelativeOrAuthority;
992
0
                }
993
                // 7. Otherwise, if url is special, set state to special authority slashes state.
994
5.32k
                else if (url->is_special()) {
995
2.20k
                    state = State::SpecialAuthoritySlashes;
996
2.20k
                }
997
                // 8. Otherwise, if remaining starts with an U+002F (/), set state to path or authority state and increase pointer by 1.
998
3.12k
                else if (get_remaining().starts_with("/"sv)) {
999
1.33k
                    state = State::PathOrAuthority;
1000
1.33k
                    ++iterator;
1001
1.33k
                }
1002
                // 9. Otherwise, set url’s path to the empty string and set state to opaque path state.
1003
1.79k
                else {
1004
1.79k
                    url->m_data->cannot_be_a_base_url = true;
1005
1.79k
                    url->append_slash();
1006
1.79k
                    state = State::CannotBeABaseUrlPath;
1007
1.79k
                }
1008
5.92k
            }
1009
            // 3. Otherwise, if state override is not given, set buffer to the empty string, state to no scheme state, and start over (from the first code point in input).
1010
174
            else if (!state_override.has_value()) {
1011
174
                buffer.clear();
1012
174
                state = State::NoScheme;
1013
174
                iterator = input.begin();
1014
174
                continue;
1015
174
            }
1016
            // 4. Otherwise, return failure.
1017
0
            else {
1018
0
                return {};
1019
0
            }
1020
25.1M
            break;
1021
        // -> no scheme state, https://url.spec.whatwg.org/#no-scheme-state
1022
25.1M
        case State::NoScheme:
1023
            // 1. If base is null, or base has an opaque path and c is not U+0023 (#), missing-scheme-non-relative-URL validation error, return failure.
1024
298
            if (!base_url.has_value() || (base_url->m_data->cannot_be_a_base_url && code_point != '#')) {
1025
298
                report_validation_error();
1026
298
                return {};
1027
298
            }
1028
            // 2. Otherwise, if base has an opaque path and c is U+0023 (#), set url’s scheme to base’s scheme, url’s path to base’s path, url’s query
1029
            //    to base’s query,url’s fragment to the empty string, and set state to fragment state.
1030
0
            else if (base_url->m_data->cannot_be_a_base_url && code_point == '#') {
1031
0
                url->m_data->scheme = base_url->m_data->scheme;
1032
0
                url->m_data->paths = base_url->m_data->paths;
1033
0
                url->m_data->query = base_url->m_data->query;
1034
0
                url->m_data->fragment = String {};
1035
0
                url->m_data->cannot_be_a_base_url = true;
1036
0
                state = State::Fragment;
1037
0
            }
1038
            // 3. Otherwise, if base’s scheme is not "file", set state to relative state and decrease pointer by 1.
1039
0
            else if (base_url->m_data->scheme != "file") {
1040
0
                state = State::Relative;
1041
0
                continue;
1042
0
            }
1043
            // 4. Otherwise, set state to file state and decrease pointer by 1.
1044
0
            else {
1045
0
                state = State::File;
1046
0
                continue;
1047
0
            }
1048
0
            break;
1049
        // -> special relative or authority state, https://url.spec.whatwg.org/#special-relative-or-authority-state
1050
0
        case State::SpecialRelativeOrAuthority:
1051
            // 1. If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by 1.
1052
0
            if (code_point == '/' && get_remaining().starts_with("/"sv)) {
1053
0
                state = State::SpecialAuthorityIgnoreSlashes;
1054
0
                ++iterator;
1055
0
            }
1056
            // 2. Otherwise, special-scheme-missing-following-solidus validation error, set state to relative state and decrease pointer by 1.
1057
0
            else {
1058
0
                report_validation_error();
1059
0
                state = State::Relative;
1060
0
                continue;
1061
0
            }
1062
0
            break;
1063
        // -> path or authority state, https://url.spec.whatwg.org/#path-or-authority-state
1064
1.33k
        case State::PathOrAuthority:
1065
            // 1. If c is U+002F (/), then set state to authority state.
1066
1.33k
            if (code_point == '/') {
1067
591
                state = State::Authority;
1068
591
            }
1069
            // 2. Otherwise, set state to path state, and decrease pointer by 1.
1070
740
            else {
1071
740
                state = State::Path;
1072
740
                continue;
1073
740
            }
1074
591
            break;
1075
        // -> relative state, https://url.spec.whatwg.org/#relative-state
1076
591
        case State::Relative:
1077
            // 1. Assert: base’s scheme is not "file".
1078
0
            VERIFY(base_url->scheme() != "file");
1079
1080
            // 2. Set url’s scheme to base’s scheme.
1081
0
            url->m_data->scheme = base_url->m_data->scheme;
1082
1083
            // 3. If c is U+002F (/), then set state to relative slash state.
1084
0
            if (code_point == '/') {
1085
0
                state = State::RelativeSlash;
1086
0
            }
1087
            // 4. Otherwise, if url is special and c is U+005C (\), invalid-reverse-solidus validation error, set state to relative slash state.
1088
0
            else if (url->is_special() && code_point == '\\') {
1089
0
                report_validation_error();
1090
0
                state = State::RelativeSlash;
1091
0
            }
1092
            // 5. Otherwise:
1093
0
            else {
1094
                // 1. Set url’s username to base’s username, url’s password to base’s password, url’s host to base’s host, url’s port to base’s port, url’s path to a clone of base’s path, and url’s query to base’s query.
1095
0
                url->m_data->username = base_url->m_data->username;
1096
0
                url->m_data->password = base_url->m_data->password;
1097
0
                url->m_data->host = base_url->m_data->host;
1098
0
                url->m_data->port = base_url->m_data->port;
1099
0
                url->m_data->paths = base_url->m_data->paths;
1100
0
                url->m_data->query = base_url->m_data->query;
1101
1102
                // 2. If c is U+003F (?), then set url’s query to the empty string, and state to query state.
1103
0
                if (code_point == '?') {
1104
0
                    url->m_data->query = String {};
1105
0
                    state = State::Query;
1106
0
                }
1107
                // 3. Otherwise, if c is U+0023 (#), set url’s fragment to the empty string and state to fragment state.
1108
0
                else if (code_point == '#') {
1109
0
                    url->m_data->fragment = String {};
1110
0
                    state = State::Fragment;
1111
0
                }
1112
                // 4. Otherwise, if c is not the EOF code point:
1113
0
                else if (code_point != end_of_file) {
1114
                    // 1. Set url’s query to null.
1115
0
                    url->m_data->query = {};
1116
1117
                    // 2. Shorten url’s path.
1118
0
                    shorten_urls_path(*url);
1119
1120
                    // 3. Set state to path state and decrease pointer by 1.
1121
0
                    state = State::Path;
1122
0
                    continue;
1123
0
                }
1124
0
            }
1125
0
            break;
1126
        // -> relative slash state, https://url.spec.whatwg.org/#relative-slash-state
1127
0
        case State::RelativeSlash:
1128
            // 1. If url is special and c is U+002F (/) or U+005C (\), then:
1129
0
            if (url->is_special() && (code_point == '/' || code_point == '\\')) {
1130
                // 1. If c is U+005C (\), invalid-reverse-solidus validation error.
1131
0
                if (code_point == '\\')
1132
0
                    report_validation_error();
1133
1134
                // 2. Set state to special authority ignore slashes state.
1135
0
                state = State::SpecialAuthorityIgnoreSlashes;
1136
0
            }
1137
            // 2. Otherwise, if c is U+002F (/), then set state to authority state.
1138
0
            else if (code_point == '/') {
1139
0
                state = State::Authority;
1140
0
            }
1141
            // 3. Otherwise, set url’s username to base’s username, url’s password to base’s password, url’s host to base’s host, url’s port to base’s port, state to path state, and then, decrease pointer by 1.
1142
0
            else {
1143
0
                url->m_data->username = base_url->m_data->username;
1144
0
                url->m_data->password = base_url->m_data->password;
1145
0
                url->m_data->host = base_url->m_data->host;
1146
0
                url->m_data->port = base_url->m_data->port;
1147
0
                state = State::Path;
1148
0
                continue;
1149
0
            }
1150
0
            break;
1151
        // -> special authority slashes state, https://url.spec.whatwg.org/#special-authority-slashes-state
1152
2.20k
        case State::SpecialAuthoritySlashes:
1153
            // 1. If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by 1.
1154
2.20k
            if (code_point == '/' && get_remaining().starts_with("/"sv)) {
1155
2
                state = State::SpecialAuthorityIgnoreSlashes;
1156
2
                ++iterator;
1157
2
            }
1158
            // 2. Otherwise, special-scheme-missing-following-solidus validation error, set state to special authority ignore slashes state and decrease pointer by 1.
1159
2.19k
            else {
1160
2.19k
                report_validation_error();
1161
2.19k
                state = State::SpecialAuthorityIgnoreSlashes;
1162
2.19k
                continue;
1163
2.19k
            }
1164
2
            break;
1165
        // -> special authority ignore slashes state, https://url.spec.whatwg.org/#special-authority-ignore-slashes-state
1166
2.65k
        case State::SpecialAuthorityIgnoreSlashes:
1167
            // 1. If c is neither U+002F (/) nor U+005C (\), then set state to authority state and decrease pointer by 1.
1168
2.65k
            if (code_point != '/' && code_point != '\\') {
1169
2.20k
                state = State::Authority;
1170
2.20k
                continue;
1171
2.20k
            }
1172
            // 2. Otherwise, special-scheme-missing-following-solidus validation error.
1173
458
            else {
1174
458
                report_validation_error();
1175
458
            }
1176
458
            break;
1177
        // -> authority state, https://url.spec.whatwg.org/#authority-state
1178
39.2M
        case State::Authority:
1179
            // 1. If c is U+0040 (@), then:
1180
39.2M
            if (code_point == '@') {
1181
                // 1. Invalid-credentials validation error.
1182
6.36k
                report_validation_error();
1183
1184
                // 2. If atSignSeen is true, then prepend "%40" to buffer.
1185
6.36k
                if (at_sign_seen) {
1186
5.81k
                    auto content = buffer.to_byte_string();
1187
5.81k
                    buffer.clear();
1188
5.81k
                    buffer.append("%40"sv);
1189
5.81k
                    buffer.append(content);
1190
5.81k
                }
1191
1192
                // 3. Set atSignSeen to true.
1193
6.36k
                at_sign_seen = true;
1194
1195
6.36k
                StringBuilder username_builder;
1196
6.36k
                StringBuilder password_builder;
1197
1198
                // 4. For each codePoint in buffer:
1199
8.51M
                for (auto c : Utf8View(buffer.string_view())) {
1200
                    // 1. If codePoint is U+003A (:) and passwordTokenSeen is false, then set passwordTokenSeen to true and continue.
1201
8.51M
                    if (c == ':' && !password_token_seen) {
1202
156
                        password_token_seen = true;
1203
156
                        continue;
1204
156
                    }
1205
1206
                    // 2. Let encodedCodePoints be the result of running UTF-8 percent-encode codePoint using the userinfo percent-encode set.
1207
                    // NOTE: This is done inside of step 3 and 4 implementation
1208
1209
                    // 3. If passwordTokenSeen is true, then append encodedCodePoints to url’s password.
1210
8.51M
                    if (password_token_seen) {
1211
6.05M
                        if (password_builder.is_empty())
1212
2.93k
                            password_builder.append(url->m_data->password);
1213
1214
6.05M
                        append_percent_encoded_if_necessary(password_builder, c, PercentEncodeSet::Userinfo);
1215
6.05M
                    }
1216
                    // 4. Otherwise, append encodedCodePoints to url’s username.
1217
2.46M
                    else {
1218
2.46M
                        if (username_builder.is_empty())
1219
3.29k
                            username_builder.append(url->m_data->username);
1220
1221
2.46M
                        append_percent_encoded_if_necessary(username_builder, c, PercentEncodeSet::Userinfo);
1222
2.46M
                    }
1223
8.51M
                }
1224
1225
6.36k
                if (username_builder.string_view().length() > url->m_data->username.bytes().size())
1226
3.29k
                    url->m_data->username = username_builder.to_string().release_value_but_fixme_should_propagate_errors();
1227
6.36k
                if (password_builder.string_view().length() > url->m_data->password.bytes().size())
1228
2.93k
                    url->m_data->password = password_builder.to_string().release_value_but_fixme_should_propagate_errors();
1229
1230
                // 5. Set buffer to the empty string.
1231
6.36k
                buffer.clear();
1232
1233
6.36k
            }
1234
            // 2. Otherwise, if one of the following is true:
1235
            //    * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
1236
            //    * url is special and c is U+005C (\)
1237
39.2M
            else if ((code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#')
1238
39.2M
                || (url->is_special() && code_point == '\\')) {
1239
                // then:
1240
1241
                // 1. If atSignSeen is true and buffer is the empty string, host-missing validation error, return failure.
1242
2.79k
                if (at_sign_seen && buffer.is_empty()) {
1243
298
                    report_validation_error();
1244
298
                    return {};
1245
298
                }
1246
1247
                // 2. Decrease pointer by buffer’s code point length + 1, set buffer to the empty string, and set state to host state.
1248
2.49k
                iterator = input.iterator_at_byte_offset(iterator - input.begin() - buffer.length() - 1);
1249
2.49k
                buffer.clear();
1250
2.49k
                state = State::Host;
1251
2.49k
            }
1252
            // 3. Otherwise, append c to buffer.
1253
39.2M
            else {
1254
39.2M
                buffer.append_code_point(code_point);
1255
39.2M
            }
1256
39.2M
            break;
1257
        // -> host state, https://url.spec.whatwg.org/#host-state
1258
        // -> hostname state, https://url.spec.whatwg.org/#hostname-state
1259
39.2M
        case State::Host:
1260
23.9M
        case State::Hostname:
1261
            // 1. If state override is given and url’s scheme is "file", then decrease pointer by 1 and set state to file host state.
1262
23.9M
            if (state_override.has_value() && url->scheme() == "file") {
1263
0
                state = State::FileHost;
1264
0
                continue;
1265
0
            }
1266
1267
            // 2. Otherwise, if c is U+003A (:) and insideBrackets is false, then:
1268
23.9M
            if (code_point == ':' && !inside_brackets) {
1269
                // 1. If buffer is the empty string, host-missing validation error, return failure.
1270
494
                if (buffer.is_empty()) {
1271
3
                    report_validation_error();
1272
3
                    return {};
1273
3
                }
1274
1275
                // 2. If state override is given and state override is hostname state, then return.
1276
491
                if (state_override.has_value() && *state_override == State::Hostname)
1277
0
                    return *url;
1278
1279
                // 3. Let host be the result of host parsing buffer with url is not special.
1280
491
                auto host = parse_host(buffer.string_view(), !url->is_special());
1281
1282
                // 4. If host is failure, then return failure.
1283
491
                if (!host.has_value())
1284
7
                    return {};
1285
1286
                // 5. Set url’s host to host, buffer to the empty string, and state to port state.
1287
484
                url->m_data->host = host.release_value();
1288
484
                buffer.clear();
1289
484
                state = State::Port;
1290
484
            }
1291
            // 3. Otherwise, if one of the following is true:
1292
            //    * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
1293
            //    * url is special and c is U+005C (\)
1294
23.9M
            else if ((code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#')
1295
23.9M
                || (url->is_special() && code_point == '\\')) {
1296
                // then decrease pointer by 1, and then:
1297
                // NOTE: pointer decrement is done by the continue below
1298
1299
                // 1. If url is special and buffer is the empty string, host-missing validation error, return failure.
1300
1.82k
                if (url->is_special() && buffer.is_empty()) {
1301
30
                    report_validation_error();
1302
30
                    return {};
1303
30
                }
1304
1305
                // 2. Otherwise, if state override is given, buffer is the empty string, and either url includes credentials or url’s port is non-null, return.
1306
1.79k
                if (state_override.has_value() && buffer.is_empty() && (url->includes_credentials() || url->port().has_value()))
1307
0
                    return *url;
1308
1309
                // 3. Let host be the result of host parsing buffer with url is not special.
1310
1.79k
                auto host = parse_host(buffer.string_view(), !url->is_special());
1311
1312
                // 4. If host is failure, then return failure.
1313
1.79k
                if (!host.has_value())
1314
1.08k
                    return {};
1315
1316
                // 5. Set url’s host to host, buffer to the empty string, and state to path start state.
1317
706
                url->m_data->host = host.value();
1318
706
                buffer.clear();
1319
706
                state = State::Port;
1320
1321
                // 6. If state override is given, then return.
1322
706
                if (state_override.has_value())
1323
0
                    return *url;
1324
1325
706
                continue;
1326
1327
706
            }
1328
            // 4. Otherwise:
1329
23.9M
            else {
1330
                // 1. If c is U+005B ([), then set insideBrackets to true.
1331
23.9M
                if (code_point == '[') {
1332
703
                    inside_brackets = true;
1333
703
                }
1334
                // 2. If c is U+005D (]), then set insideBrackets to false.
1335
23.9M
                else if (code_point == ']') {
1336
759
                    inside_brackets = false;
1337
759
                }
1338
1339
                // 3. Append c to buffer.
1340
23.9M
                buffer.append_code_point(code_point);
1341
23.9M
            }
1342
23.9M
            break;
1343
        // -> port state, https://url.spec.whatwg.org/#port-state
1344
23.9M
        case State::Port:
1345
            // 1. If c is an ASCII digit, append c to buffer.
1346
2.09M
            if (is_ascii_digit(code_point)) {
1347
2.09M
                buffer.append_code_point(code_point);
1348
2.09M
            }
1349
1350
            // 2. Otherwise, if one of the following is true:
1351
            //    * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
1352
            //    * url is special and c is U+005C (\)
1353
            //    * state override is given
1354
1.19k
            else if ((code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#')
1355
112
                || (url->is_special() && code_point == '\\')
1356
1.12k
                || state_override.has_value()) {
1357
                // then:
1358
1359
                // 1. If buffer is not the empty string, then:
1360
1.12k
                if (!buffer.is_empty()) {
1361
                    // 1. Let port be the mathematical integer value that is represented by buffer in radix-10 using ASCII digits for digits with values 0 through 9.
1362
348
                    auto port = buffer.string_view().to_number<u16>();
1363
1364
                    // 2. If port is greater than 2^16 − 1, port-out-of-range validation error, return failure.
1365
                    // NOTE: This is done by to_number.
1366
348
                    if (!port.has_value()) {
1367
7
                        report_validation_error();
1368
7
                        return {};
1369
7
                    }
1370
1371
                    // 3. Set url’s port to null, if port is url’s scheme’s default port; otherwise to port.
1372
341
                    if (port.value() == default_port_for_scheme(url->scheme()))
1373
1
                        url->m_data->port = {};
1374
340
                    else
1375
340
                        url->m_data->port = port.value();
1376
1377
                    // 4. Set buffer to the empty string.
1378
341
                    buffer.clear();
1379
341
                }
1380
1381
                // 2. If state override is given, then return.
1382
1.11k
                if (state_override.has_value())
1383
0
                    return *url;
1384
1385
                // 3. Set state to path start state and decrease pointer by 1.
1386
1.11k
                state = State::PathStart;
1387
1.11k
                continue;
1388
1.11k
            }
1389
            // 3. Otherwise, port-invalid validation error, return failure.
1390
66
            else {
1391
66
                report_validation_error();
1392
66
                return {};
1393
66
            }
1394
2.09M
            break;
1395
        // -> file state, https://url.spec.whatwg.org/#file-state
1396
2.09M
        case State::File:
1397
            // 1. Set url’s scheme to "file".
1398
604
            url->m_data->scheme = "file"_string;
1399
1400
            // 2. Set url’s host to the empty string.
1401
604
            url->m_data->host = String {};
1402
1403
            // 3. If c is U+002F (/) or U+005C (\), then:
1404
604
            if (code_point == '/' || code_point == '\\') {
1405
                // 1. If c is U+005C (\), invalid-reverse-solidus validation error.
1406
349
                if (code_point == '\\')
1407
15
                    report_validation_error();
1408
1409
                // 2. Set state to file slash state.
1410
349
                state = State::FileSlash;
1411
349
            }
1412
            // 4. Otherwise, if base is non-null and base’s scheme is "file":
1413
255
            else if (base_url.has_value() && base_url->m_data->scheme == "file") {
1414
                // 1. Set url’s host to base’s host, url’s path to a clone of base’s path, and url’s query to base’s query.
1415
0
                url->m_data->host = base_url->m_data->host;
1416
0
                url->m_data->paths = base_url->m_data->paths;
1417
0
                url->m_data->query = base_url->m_data->query;
1418
1419
                // 2. If c is U+003F (?), then set url’s query to the empty string and state to query state.
1420
0
                if (code_point == '?') {
1421
0
                    url->m_data->query = String {};
1422
0
                    state = State::Query;
1423
0
                }
1424
                // 3. Otherwise, if c is U+0023 (#), set url’s fragment to the empty string and state to fragment state.
1425
0
                else if (code_point == '#') {
1426
0
                    url->m_data->fragment = String {};
1427
0
                    state = State::Fragment;
1428
0
                }
1429
                // 4. Otherwise, if c is not the EOF code point:
1430
0
                else if (code_point != end_of_file) {
1431
                    // 1. Set url’s query to null.
1432
0
                    url->m_data->query = {};
1433
1434
                    // 2. If the code point substring from pointer to the end of input does not start with a Windows drive letter, then shorten url’s path.
1435
0
                    auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
1436
0
                    if (!starts_with_windows_drive_letter(substring_from_pointer)) {
1437
0
                        shorten_urls_path(*url);
1438
0
                    }
1439
                    // 3. Otherwise:
1440
0
                    else {
1441
                        // 1. File-invalid-Windows-drive-letter validation error.
1442
0
                        report_validation_error();
1443
1444
                        // 2. Set url’s path to « ».
1445
0
                        url->m_data->paths.clear();
1446
0
                    }
1447
1448
                    // 4. Set state to path state and decrease pointer by 1.
1449
0
                    state = State::Path;
1450
0
                    continue;
1451
0
                }
1452
0
            }
1453
            // 5. Otherwise, set state to path state, and decrease pointer by 1.
1454
255
            else {
1455
255
                state = State::Path;
1456
255
                continue;
1457
255
            }
1458
1459
349
            break;
1460
        // -> file slash state, https://url.spec.whatwg.org/#file-slash-state
1461
349
        case State::FileSlash:
1462
            // 1. If c is U+002F (/) or U+005C (\), then:
1463
349
            if (code_point == '/' || code_point == '\\') {
1464
                // 1. If c is U+005C (\), invalid-reverse-solidus validation error.
1465
313
                if (code_point == '\\')
1466
38
                    report_validation_error();
1467
1468
                // 2. Set state to file host state.
1469
313
                state = State::FileHost;
1470
313
            }
1471
            // 2. Otherwise:
1472
36
            else {
1473
                // 1. If base is non-null and base’s scheme is "file", then:
1474
36
                if (base_url.has_value() && base_url->m_data->scheme == "file") {
1475
                    // 1. Set url’s host to base’s host.
1476
0
                    url->m_data->host = base_url->m_data->host;
1477
1478
                    // 2. If the code point substring from pointer to the end of input does not start with a Windows drive letter and base’s path[0] is a normalized Windows drive letter, then append base’s path[0] to url’s path.
1479
0
                    auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
1480
0
                    if (!starts_with_windows_drive_letter(substring_from_pointer) && is_normalized_windows_drive_letter(base_url->m_data->paths[0]))
1481
0
                        url->m_data->paths.append(base_url->m_data->paths[0]);
1482
0
                }
1483
1484
                // 2. Set state to path state, and decrease pointer by 1.
1485
36
                state = State::Path;
1486
36
                continue;
1487
36
            }
1488
313
            break;
1489
        // -> file host state, https://url.spec.whatwg.org/#file-host-state
1490
13.0M
        case State::FileHost:
1491
            // 1. If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by 1 and then:
1492
            //    NOTE: decreasing the pointer is done at the bottom of this block.
1493
13.0M
            if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
1494
                // 1. If state override is not given and buffer is a Windows drive letter, file-invalid-Windows-drive-letter-host validation error, set state to path state.
1495
313
                if (!state_override.has_value() && is_windows_drive_letter(buffer.string_view())) {
1496
2
                    report_validation_error();
1497
2
                    state = State::Path;
1498
2
                }
1499
                // 2. Otherwise, if buffer is the empty string, then:
1500
311
                else if (buffer.is_empty()) {
1501
                    // 1. Set url’s host to the empty string.
1502
30
                    url->m_data->host = String {};
1503
1504
                    // 2. If state override is given, then return.
1505
30
                    if (state_override.has_value())
1506
0
                        return *url;
1507
1508
                    // 3. Set state to path start state.
1509
30
                    state = State::PathStart;
1510
30
                }
1511
                // 3. Otherwise, run these steps:
1512
281
                else {
1513
                    // 1. Let host be the result of host parsing buffer with url is not special.
1514
281
                    auto host = parse_host(buffer.string_view(), !url->is_special());
1515
1516
                    // 2. If host is failure, then return failure.
1517
281
                    if (!host.has_value())
1518
235
                        return {};
1519
1520
                    // 3. If host is "localhost", then set host to the empty string.
1521
46
                    if (host.value().has<String>() && host.value().get<String>() == "localhost"sv)
1522
1
                        host = String {};
1523
1524
                    // 4. Set url’s host to host.
1525
46
                    url->m_data->host = host.release_value();
1526
1527
                    // 5. If state override is given, then return.
1528
46
                    if (state_override.has_value())
1529
0
                        return *url;
1530
1531
                    // 6. Set buffer to the empty string and state to path start state.
1532
46
                    buffer.clear();
1533
46
                    state = State::PathStart;
1534
46
                }
1535
1536
                // NOTE: Decrement specified at the top of this 'if' statement.
1537
78
                continue;
1538
13.0M
            } else {
1539
13.0M
                buffer.append_code_point(code_point);
1540
13.0M
            }
1541
13.0M
            break;
1542
        // -> path start state, https://url.spec.whatwg.org/#path-start-state
1543
13.0M
        case State::PathStart:
1544
            // 1. If url is special, then:
1545
1.19k
            if (url->is_special()) {
1546
                // 1. If c is U+005C (\), invalid-reverse-solidus validation error.
1547
660
                if (code_point == '\\')
1548
48
                    report_validation_error();
1549
1550
                // 2. Set state to path state.
1551
660
                state = State::Path;
1552
1553
                // 3. If c is neither U+002F (/) nor U+005C (\), then decrease pointer by 1.
1554
660
                if (code_point != '/' && code_point != '\\')
1555
569
                    continue;
1556
660
            }
1557
            // 2. Otherwise, if state override is not given and c is U+003F (?), set url’s query to the empty string and state to query state.
1558
533
            else if (!state_override.has_value() && code_point == '?') {
1559
3
                url->m_data->query = String {};
1560
3
                state = State::Query;
1561
3
            }
1562
            // 3. Otherwise, if state override is not given and c is U+0023 (#), set url’s fragment to the empty string and state to fragment state.
1563
530
            else if (!state_override.has_value() && code_point == '#') {
1564
5
                url->m_data->fragment = String {};
1565
5
                state = State::Fragment;
1566
5
            }
1567
            // 4. Otherwise, if c is not the EOF code point:
1568
525
            else if (code_point != end_of_file) {
1569
                // 1. Set state to path state.
1570
198
                state = State::Path;
1571
1572
                // 2. If c is not U+002F (/), then decrease pointer by 1.
1573
198
                if (code_point != '/')
1574
0
                    continue;
1575
198
            }
1576
            // 5. Otherwise, if state override is given and url’s host is null, append the empty string to url’s path.
1577
327
            else if (state_override.has_value() && url->host().has<Empty>()) {
1578
0
                url->append_slash();
1579
0
            }
1580
624
            break;
1581
        // -> path state, https://url.spec.whatwg.org/#path-state
1582
17.0M
        case State::Path:
1583
            // 1. If one of the following is true:
1584
            //    * c is the EOF code point or U+002F (/)
1585
            //    * url is special and c is U+005C (\)
1586
            //    * state override is not given and c is U+003F (?) or U+0023 (#)
1587
17.0M
            if ((code_point == end_of_file || code_point == '/')
1588
7.23M
                || (url->is_special() && code_point == '\\')
1589
9.86M
                || (!state_override.has_value() && (code_point == '?' || code_point == '#'))) {
1590
                // then:
1591
1592
                // 1. If url is special and c is U+005C (\), invalid-reverse-solidus validation error.
1593
9.86M
                if (url->is_special() && code_point == '\\')
1594
3.51k
                    report_validation_error();
1595
1596
                // 2. If buffer is a double-dot URL path segment, then:
1597
9.86M
                if (is_double_dot_path_segment(buffer.string_view())) {
1598
                    // 1. Shorten url’s path.
1599
4.15k
                    shorten_urls_path(*url);
1600
1601
                    // 2. If neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’s path.
1602
4.15k
                    if (code_point != '/' && !(url->is_special() && code_point == '\\'))
1603
125
                        url->append_slash();
1604
4.15k
                }
1605
                // 3. Otherwise, if buffer is a single-dot URL path segment and if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’s path.
1606
9.85M
                else if (is_single_dot_path_segment(buffer.string_view()) && code_point != '/' && !(url->is_special() && code_point == '\\')) {
1607
36
                    url->append_slash();
1608
36
                }
1609
                // 4. Otherwise, if buffer is not a single-dot URL path segment, then:
1610
9.85M
                else if (!is_single_dot_path_segment(buffer.string_view())) {
1611
                    // 1. If url’s scheme is "file", url’s path is empty, and buffer is a Windows drive letter, then replace the second code point in buffer with U+003A (:).
1612
9.84M
                    if (url->m_data->scheme == "file" && url->m_data->paths.is_empty() && is_windows_drive_letter(buffer.string_view())) {
1613
19
                        auto drive_letter = buffer.string_view()[0];
1614
19
                        buffer.clear();
1615
19
                        buffer.append(drive_letter);
1616
19
                        buffer.append(':');
1617
19
                    }
1618
                    // 2. Append buffer to url’s path.
1619
9.84M
                    url->m_data->paths.append(buffer.to_string_without_validation());
1620
9.84M
                }
1621
1622
                // 5. Set buffer to the empty string.
1623
9.86M
                buffer.clear();
1624
1625
                // 6. If c is U+003F (?), then set url’s query to the empty string and state to query state.
1626
9.86M
                if (code_point == '?') {
1627
203
                    url->m_data->query = String {};
1628
203
                    state = State::Query;
1629
203
                }
1630
                // 7. If c is U+0023 (#), then set url’s fragment to the empty string and state to fragment state.
1631
9.86M
                else if (code_point == '#') {
1632
24
                    url->m_data->fragment = String {};
1633
24
                    state = State::Fragment;
1634
24
                }
1635
9.86M
            }
1636
            // 2. Otherwise, run these steps
1637
7.23M
            else {
1638
                // 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
1639
7.23M
                if (!is_url_code_point(code_point) && code_point != '%')
1640
24.4k
                    report_validation_error();
1641
1642
                // 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
1643
7.23M
                if (code_point == '%' && !remaining_starts_with_two_ascii_hex_digits())
1644
6.71k
                    report_validation_error();
1645
1646
                // 3. UTF-8 percent-encode c using the path percent-encode set and append the result to buffer.
1647
7.23M
                append_percent_encoded_if_necessary(buffer, code_point, PercentEncodeSet::Path);
1648
7.23M
            }
1649
17.0M
            break;
1650
        // -> opaque path state, https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
1651
2.42M
        case State::CannotBeABaseUrlPath:
1652
            // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path on EOF.
1653
2.42M
            VERIFY(url->m_data->paths.size() == 1 && url->m_data->paths[0].is_empty());
1654
1655
            // 1. If c is U+003F (?), then set url’s query to the empty string and state to query state.
1656
2.42M
            if (code_point == '?') {
1657
493
                url->m_data->paths[0] = buffer.to_string_without_validation();
1658
493
                url->m_data->query = String {};
1659
493
                buffer.clear();
1660
493
                state = State::Query;
1661
493
            }
1662
            // 2. Otherwise, if c is U+0023 (#), then set url’s fragment to the empty string and state to fragment state.
1663
2.42M
            else if (code_point == '#') {
1664
                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
1665
490
                url->m_data->paths[0] = buffer.to_string_without_validation();
1666
490
                url->m_data->fragment = String {};
1667
490
                buffer.clear();
1668
490
                state = State::Fragment;
1669
490
            }
1670
            // 3. Otherwise:
1671
2.42M
            else {
1672
                // 1. If c is not the EOF code point, not a URL code point, and not U+0025 (%), invalid-URL-unit validation error.
1673
2.42M
                if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%')
1674
47.2k
                    report_validation_error();
1675
1676
                // 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
1677
2.42M
                if (code_point == '%' && !remaining_starts_with_two_ascii_hex_digits())
1678
100k
                    report_validation_error();
1679
1680
                // 3. If c is not the EOF code point, UTF-8 percent-encode c using the C0 control percent-encode set and append the result to url’s path.
1681
2.42M
                if (code_point != end_of_file) {
1682
2.42M
                    append_percent_encoded_if_necessary(buffer, code_point, PercentEncodeSet::C0Control);
1683
2.42M
                } else {
1684
809
                    url->m_data->paths[0] = buffer.to_string_without_validation();
1685
809
                    buffer.clear();
1686
809
                }
1687
2.42M
            }
1688
2.42M
            break;
1689
        // -> query state, https://url.spec.whatwg.org/#query-state
1690
4.99M
        case State::Query:
1691
            // 1. If encoding is not UTF-8 and one of the following is true:
1692
            //     * url is not special
1693
            //     * url’s scheme is "ws" or "wss"
1694
            //  then set encoding to UTF-8.
1695
4.99M
            if (!url->is_special() || url->m_data->scheme == "ws" || url->m_data->scheme == "wss")
1696
2.81M
                encoder = TextCodec::encoder_for("utf-8"sv);
1697
1698
            // 2. If one of the following is true:
1699
            //    * state override is not given and c is U+0023 (#)
1700
            //    * c is the EOF code point
1701
4.99M
            if ((!state_override.has_value() && code_point == '#')
1702
4.99M
                || code_point == end_of_file) {
1703
                // then:
1704
1705
                // 1. Let queryPercentEncodeSet be the special-query percent-encode set if url is special; otherwise the query percent-encode set.
1706
699
                auto query_percent_encode_set = url->is_special() ? PercentEncodeSet::SpecialQuery : PercentEncodeSet::Query;
1707
1708
                // 2. Percent-encode after encoding, with encoding, buffer, and queryPercentEncodeSet, and append the result to url’s query.
1709
699
                url->m_data->query = percent_encode_after_encoding(*encoder, buffer.string_view(), query_percent_encode_set);
1710
1711
                // 3. Set buffer to the empty string.
1712
699
                buffer.clear();
1713
1714
                // 4. If c is U+0023 (#), then set url’s fragment to the empty string and state to fragment state.
1715
699
                if (code_point == '#') {
1716
21
                    url->m_data->fragment = String {};
1717
21
                    state = State::Fragment;
1718
21
                }
1719
699
            }
1720
            // 3. Otherwise, if c is not the EOF code point:
1721
4.99M
            else if (code_point != end_of_file) {
1722
                // 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
1723
4.99M
                if (!is_url_code_point(code_point) && code_point != '%')
1724
19.2k
                    report_validation_error();
1725
1726
                // 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
1727
4.99M
                if (code_point == '%' && !remaining_starts_with_two_ascii_hex_digits())
1728
14.5k
                    report_validation_error();
1729
1730
                // 3. Append c to buffer.
1731
4.99M
                buffer.append_code_point(code_point);
1732
4.99M
            }
1733
4.99M
            break;
1734
        // -> fragment state, https://url.spec.whatwg.org/#fragment-state
1735
8.14M
        case State::Fragment:
1736
            // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
1737
            // 1. If c is not the EOF code point, then:
1738
8.14M
            if (code_point != end_of_file) {
1739
                // 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
1740
8.14M
                if (!is_url_code_point(code_point) && code_point != '%')
1741
132k
                    report_validation_error();
1742
1743
                // 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
1744
8.14M
                if (code_point == '%' && !remaining_starts_with_two_ascii_hex_digits())
1745
55.8k
                    report_validation_error();
1746
1747
                // 3. UTF-8 percent-encode c using the fragment percent-encode set and append the result to url’s fragment.
1748
                // NOTE: The percent-encode is done on EOF on the entire buffer.
1749
8.14M
                buffer.append_code_point(code_point);
1750
8.14M
            } else {
1751
540
                url->m_data->fragment = percent_encode_after_encoding(*TextCodec::encoder_for("utf-8"sv), buffer.string_view(), PercentEncodeSet::Fragment);
1752
540
                buffer.clear();
1753
540
            }
1754
8.14M
            break;
1755
0
        default:
1756
0
            VERIFY_NOT_REACHED();
1757
136M
        }
1758
1759
136M
        if (iterator.done())
1760
4.19k
            break;
1761
136M
        ++iterator;
1762
136M
    }
1763
1764
4.19k
    url->m_data->valid = true;
1765
4.19k
    dbgln_if(URL_PARSER_DEBUG, "URL::Parser::basic_parse: Parsed URL to be '{}'.", url->serialize());
1766
1767
    // 10. Return url.
1768
4.19k
    return *url;
1769
6.22k
}
1770
1771
}