Coverage Report

Created: 2025-08-28 06:26

/src/serenity/Userland/Libraries/LibURL/URL.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
3
 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
4
 *
5
 * SPDX-License-Identifier: BSD-2-Clause
6
 */
7
8
#include <AK/Base64.h>
9
#include <AK/CharacterTypes.h>
10
#include <AK/Debug.h>
11
#include <AK/LexicalPath.h>
12
#include <AK/StringBuilder.h>
13
#include <AK/Utf8View.h>
14
#include <LibURL/Parser.h>
15
#include <LibURL/URL.h>
16
17
namespace URL {
18
19
// FIXME: It could make sense to force users of URL to use URL::Parser::basic_parse() explicitly instead of using a constructor.
20
URL::URL(StringView string)
21
6.10k
    : URL(Parser::basic_parse(string))
22
6.10k
{
23
    if constexpr (URL_PARSER_DEBUG) {
24
        if (m_data->valid)
25
            dbgln("URL constructor: Parsed URL to be '{}'.", serialize());
26
        else
27
            dbgln("URL constructor: Parsed URL to be invalid.");
28
    }
29
6.10k
}
30
31
URL URL::complete_url(StringView relative_url) const
32
10.3k
{
33
10.3k
    if (!is_valid())
34
10.3k
        return {};
35
36
0
    return Parser::basic_parse(relative_url, *this);
37
10.3k
}
38
39
ByteString URL::path_segment_at_index(size_t index) const
40
0
{
41
0
    VERIFY(index < path_segment_count());
42
0
    return percent_decode(m_data->paths[index]);
43
0
}
44
45
ByteString URL::basename() const
46
0
{
47
0
    if (!m_data->valid)
48
0
        return {};
49
0
    if (m_data->paths.is_empty())
50
0
        return {};
51
0
    auto& last_segment = m_data->paths.last();
52
0
    return percent_decode(last_segment);
53
0
}
54
55
void URL::set_scheme(String scheme)
56
0
{
57
0
    m_data->scheme = move(scheme);
58
0
    m_data->valid = compute_validity();
59
0
}
60
61
// https://url.spec.whatwg.org/#set-the-username
62
void URL::set_username(StringView username)
63
0
{
64
    // To set the username given a url and username, set url’s username to the result of running UTF-8 percent-encode on username using the userinfo percent-encode set.
65
0
    m_data->username = percent_encode(username, PercentEncodeSet::Userinfo);
66
0
    m_data->valid = compute_validity();
67
0
}
68
69
// https://url.spec.whatwg.org/#set-the-password
70
void URL::set_password(StringView password)
71
0
{
72
    // To set the password given a url and password, set url’s password to the result of running UTF-8 percent-encode on password using the userinfo percent-encode set.
73
0
    m_data->password = percent_encode(password, PercentEncodeSet::Userinfo);
74
0
    m_data->valid = compute_validity();
75
0
}
76
77
void URL::set_host(Host host)
78
0
{
79
0
    m_data->host = move(host);
80
0
    m_data->valid = compute_validity();
81
0
}
82
83
// https://url.spec.whatwg.org/#concept-host-serializer
84
ErrorOr<String> URL::serialized_host() const
85
0
{
86
0
    return Parser::serialize_host(m_data->host);
87
0
}
88
89
void URL::set_port(Optional<u16> port)
90
0
{
91
0
    if (port == default_port_for_scheme(m_data->scheme)) {
92
0
        m_data->port = {};
93
0
        return;
94
0
    }
95
0
    m_data->port = move(port);
96
0
    m_data->valid = compute_validity();
97
0
}
98
99
void URL::set_paths(Vector<ByteString> const& paths)
100
1.16k
{
101
1.16k
    m_data->paths.clear_with_capacity();
102
1.16k
    m_data->paths.ensure_capacity(paths.size());
103
1.16k
    for (auto const& segment : paths)
104
1.16k
        m_data->paths.unchecked_append(percent_encode(segment, PercentEncodeSet::Path));
105
1.16k
    m_data->valid = compute_validity();
106
1.16k
}
107
108
void URL::append_path(StringView path)
109
0
{
110
0
    m_data->paths.append(percent_encode(path, PercentEncodeSet::Path));
111
0
}
112
113
// https://url.spec.whatwg.org/#cannot-have-a-username-password-port
114
bool URL::cannot_have_a_username_or_password_or_port() const
115
0
{
116
    // A URL cannot have a username/password/port if its host is null or the empty string, or its scheme is "file".
117
0
    return m_data->host.has<Empty>() || m_data->host == String {} || m_data->scheme == "file"sv;
118
0
}
119
120
// FIXME: This is by no means complete.
121
// NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong.
122
bool URL::compute_validity() const
123
1.16k
{
124
1.16k
    if (m_data->scheme.is_empty())
125
1.16k
        return false;
126
127
0
    if (m_data->cannot_be_a_base_url) {
128
0
        if (m_data->paths.size() != 1)
129
0
            return false;
130
0
        if (m_data->paths[0].is_empty())
131
0
            return false;
132
0
    } else {
133
0
        if (m_data->scheme.is_one_of("about", "mailto"))
134
0
            return false;
135
        // NOTE: Maybe it is allowed to have a zero-segment path.
136
0
        if (m_data->paths.size() == 0)
137
0
            return false;
138
0
    }
139
140
    // NOTE: A file URL's host should be the empty string for localhost, not null.
141
0
    if (m_data->scheme == "file" && m_data->host.has<Empty>())
142
0
        return false;
143
144
0
    return true;
145
0
}
146
147
// https://url.spec.whatwg.org/#default-port
148
Optional<u16> default_port_for_scheme(StringView scheme)
149
356
{
150
    // Spec defined mappings with port:
151
356
    if (scheme == "ftp")
152
1
        return 21;
153
355
    if (scheme == "http")
154
1
        return 80;
155
354
    if (scheme == "https")
156
1
        return 443;
157
353
    if (scheme == "ws")
158
37
        return 80;
159
316
    if (scheme == "wss")
160
3
        return 443;
161
162
    // NOTE: not in spec, but we support these too
163
313
    if (scheme == "gemini")
164
1
        return 1965;
165
312
    if (scheme == "irc")
166
3
        return 6667;
167
309
    if (scheme == "ircs")
168
1
        return 6697;
169
170
308
    return {};
171
309
}
172
173
URL create_with_file_scheme(ByteString const& path, ByteString const& fragment, ByteString const& hostname)
174
0
{
175
0
    LexicalPath lexical_path(path);
176
0
    if (!lexical_path.is_absolute())
177
0
        return {};
178
179
0
    URL url;
180
0
    url.set_scheme("file"_string);
181
0
    url.set_host(hostname == "localhost" ? String {} : String::from_byte_string(hostname).release_value_but_fixme_should_propagate_errors());
182
0
    url.set_paths(lexical_path.parts());
183
0
    if (path.ends_with('/'))
184
0
        url.append_slash();
185
0
    if (!fragment.is_empty())
186
0
        url.set_fragment(String::from_byte_string(fragment).release_value_but_fixme_should_propagate_errors());
187
0
    return url;
188
0
}
189
190
URL create_with_help_scheme(ByteString const& path, ByteString const& fragment, ByteString const& hostname)
191
0
{
192
0
    LexicalPath lexical_path(path);
193
194
0
    URL url;
195
0
    url.set_scheme("help"_string);
196
0
    url.set_host(hostname == "localhost" ? String {} : String::from_byte_string(hostname).release_value_but_fixme_should_propagate_errors());
197
198
0
    url.set_paths(lexical_path.parts());
199
0
    if (path.ends_with('/'))
200
0
        url.append_slash();
201
0
    if (!fragment.is_empty())
202
0
        url.set_fragment(String::from_byte_string(fragment).release_value_but_fixme_should_propagate_errors());
203
0
    return url;
204
0
}
205
206
URL create_with_url_or_path(ByteString const& url_or_path)
207
0
{
208
0
    URL url = url_or_path;
209
0
    if (url.is_valid())
210
0
        return url;
211
212
0
    ByteString path = LexicalPath::canonicalized_path(url_or_path);
213
0
    return create_with_file_scheme(path);
214
0
}
215
216
URL create_with_data(StringView mime_type, StringView payload, bool is_base64)
217
0
{
218
0
    URL url;
219
0
    url.set_cannot_be_a_base_url(true);
220
0
    url.set_scheme("data"_string);
221
222
0
    StringBuilder builder;
223
0
    builder.append(mime_type);
224
0
    if (is_base64)
225
0
        builder.append(";base64"sv);
226
0
    builder.append(',');
227
0
    builder.append(payload);
228
0
    url.set_paths({ builder.to_byte_string() });
229
0
    return url;
230
0
}
231
232
// https://url.spec.whatwg.org/#special-scheme
233
bool is_special_scheme(StringView scheme)
234
110M
{
235
110M
    return scheme.is_one_of("ftp", "file", "http", "https", "ws", "wss");
236
110M
}
237
238
// https://url.spec.whatwg.org/#url-path-serializer
239
String URL::serialize_path() const
240
0
{
241
    // 1. If url has an opaque path, then return url’s path.
242
    // FIXME: Reimplement this step once we modernize the URL implementation to meet the spec.
243
0
    if (cannot_be_a_base_url())
244
0
        return m_data->paths[0];
245
246
    // 2. Let output be the empty string.
247
0
    StringBuilder output;
248
249
    // 3. For each segment of url’s path: append U+002F (/) followed by segment to output.
250
0
    for (auto const& segment : m_data->paths) {
251
0
        output.append('/');
252
0
        output.append(segment);
253
0
    }
254
255
    // 4. Return output.
256
0
    return output.to_string_without_validation();
257
0
}
258
259
// https://url.spec.whatwg.org/#concept-url-serializer
260
ByteString URL::serialize(ExcludeFragment exclude_fragment) const
261
9.18k
{
262
    // 1. Let output be url’s scheme and U+003A (:) concatenated.
263
9.18k
    StringBuilder output;
264
9.18k
    output.append(m_data->scheme);
265
9.18k
    output.append(':');
266
267
    // 2. If url’s host is non-null:
268
9.18k
    if (!m_data->host.has<Empty>()) {
269
        // 1. Append "//" to output.
270
0
        output.append("//"sv);
271
272
        // 2. If url includes credentials, then:
273
0
        if (includes_credentials()) {
274
            // 1. Append url’s username to output.
275
0
            output.append(m_data->username);
276
277
            // 2. If url’s password is not the empty string, then append U+003A (:), followed by url’s password, to output.
278
0
            if (!m_data->password.is_empty()) {
279
0
                output.append(':');
280
0
                output.append(m_data->password);
281
0
            }
282
283
            // 3. Append U+0040 (@) to output.
284
0
            output.append('@');
285
0
        }
286
287
        // 3. Append url’s host, serialized, to output.
288
0
        output.append(serialized_host().release_value_but_fixme_should_propagate_errors());
289
290
        // 4. If url’s port is non-null, append U+003A (:) followed by url’s port, serialized, to output.
291
0
        if (m_data->port.has_value())
292
0
            output.appendff(":{}", *m_data->port);
293
0
    }
294
295
    // 3. If url’s host is null, url does not have an opaque path, url’s path’s size is greater than 1, and url’s path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output.
296
    // 4. Append the result of URL path serializing url to output.
297
    // FIXME: Implement this closer to spec steps.
298
9.18k
    if (cannot_be_a_base_url()) {
299
0
        output.append(m_data->paths[0]);
300
9.18k
    } else {
301
9.18k
        if (m_data->host.has<Empty>() && m_data->paths.size() > 1 && m_data->paths[0].is_empty())
302
0
            output.append("/."sv);
303
9.18k
        for (auto& segment : m_data->paths) {
304
0
            output.append('/');
305
0
            output.append(segment);
306
0
        }
307
9.18k
    }
308
309
    // 5. If url’s query is non-null, append U+003F (?), followed by url’s query, to output.
310
9.18k
    if (m_data->query.has_value()) {
311
0
        output.append('?');
312
0
        output.append(*m_data->query);
313
0
    }
314
315
    // 6. If exclude fragment is false and url’s fragment is non-null, then append U+0023 (#), followed by url’s fragment, to output.
316
9.18k
    if (exclude_fragment == ExcludeFragment::No && m_data->fragment.has_value()) {
317
0
        output.append('#');
318
0
        output.append(*m_data->fragment);
319
0
    }
320
321
    // 7. Return output.
322
9.18k
    return output.to_byte_string();
323
9.18k
}
324
325
// https://url.spec.whatwg.org/#url-rendering
326
// NOTE: This does e.g. not display credentials.
327
// FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
328
//        resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
329
ByteString URL::serialize_for_display() const
330
0
{
331
0
    VERIFY(m_data->valid);
332
333
0
    StringBuilder builder;
334
0
    builder.append(m_data->scheme);
335
0
    builder.append(':');
336
337
0
    if (!m_data->host.has<Empty>()) {
338
0
        builder.append("//"sv);
339
0
        builder.append(serialized_host().release_value_but_fixme_should_propagate_errors());
340
0
        if (m_data->port.has_value())
341
0
            builder.appendff(":{}", *m_data->port);
342
0
    }
343
344
0
    if (cannot_be_a_base_url()) {
345
0
        builder.append(m_data->paths[0]);
346
0
    } else {
347
0
        if (m_data->host.has<Empty>() && m_data->paths.size() > 1 && m_data->paths[0].is_empty())
348
0
            builder.append("/."sv);
349
0
        for (auto& segment : m_data->paths) {
350
0
            builder.append('/');
351
0
            builder.append(segment);
352
0
        }
353
0
    }
354
355
0
    if (m_data->query.has_value()) {
356
0
        builder.append('?');
357
0
        builder.append(*m_data->query);
358
0
    }
359
360
0
    if (m_data->fragment.has_value()) {
361
0
        builder.append('#');
362
0
        builder.append(*m_data->fragment);
363
0
    }
364
365
0
    return builder.to_byte_string();
366
0
}
367
368
ErrorOr<String> URL::to_string() const
369
0
{
370
0
    return String::from_byte_string(serialize());
371
0
}
372
373
// https://url.spec.whatwg.org/#concept-url-origin
374
Origin URL::origin() const
375
0
{
376
    // The origin of a URL url is the origin returned by running these steps, switching on url’s scheme:
377
    // -> "blob"
378
0
    if (scheme() == "blob"sv) {
379
0
        auto url_string = to_string().release_value_but_fixme_should_propagate_errors();
380
381
        // 1. If url’s blob URL entry is non-null, then return url’s blob URL entry’s environment’s origin.
382
0
        if (blob_url_entry().has_value())
383
0
            return blob_url_entry()->environment_origin;
384
385
        // 2. Let pathURL be the result of parsing the result of URL path serializing url.
386
0
        auto path_url = Parser::basic_parse(serialize_path());
387
388
        // 3. If pathURL is failure, then return a new opaque origin.
389
0
        if (!path_url.is_valid())
390
0
            return Origin {};
391
392
        // 4. If pathURL’s scheme is "http", "https", or "file", then return pathURL’s origin.
393
0
        if (path_url.scheme().is_one_of("http"sv, "https"sv, "file"sv))
394
0
            return path_url.origin();
395
396
        // 5. Return a new opaque origin.
397
0
        return Origin {};
398
0
    }
399
400
    // -> "ftp"
401
    // -> "http"
402
    // -> "https"
403
    // -> "ws"
404
    // -> "wss"
405
0
    if (scheme().is_one_of("ftp"sv, "http"sv, "https"sv, "ws"sv, "wss"sv)) {
406
        // Return the tuple origin (url’s scheme, url’s host, url’s port, null).
407
0
        return Origin(scheme().to_byte_string(), host(), port());
408
0
    }
409
410
    // -> "file"
411
    // AD-HOC: Our resource:// is basically an alias to file://
412
0
    if (scheme() == "file"sv || scheme() == "resource"sv) {
413
        // Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin.
414
        // Note: We must return an origin with the `file://' protocol for `file://' iframes to work from `file://' pages.
415
0
        return Origin(scheme().to_byte_string(), String {}, {});
416
0
    }
417
418
    // -> Otherwise
419
    // Return a new opaque origin.
420
0
    return Origin {};
421
0
}
422
423
bool URL::equals(URL const& other, ExcludeFragment exclude_fragments) const
424
0
{
425
0
    if (this == &other)
426
0
        return true;
427
0
    if (!m_data->valid || !other.m_data->valid)
428
0
        return false;
429
0
    return serialize(exclude_fragments) == other.serialize(exclude_fragments);
430
0
}
431
432
void append_percent_encoded(StringBuilder& builder, u32 code_point)
433
27.8M
{
434
27.8M
    if (code_point <= 0x7f)
435
1.16M
        builder.appendff("%{:02X}", code_point);
436
26.7M
    else if (code_point <= 0x07ff)
437
2.38k
        builder.appendff("%{:02X}%{:02X}", ((code_point >> 6) & 0x1f) | 0xc0, (code_point & 0x3f) | 0x80);
438
26.7M
    else if (code_point <= 0xffff)
439
26.7M
        builder.appendff("%{:02X}%{:02X}%{:02X}", ((code_point >> 12) & 0x0f) | 0xe0, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
440
6.92k
    else if (code_point <= 0x10ffff)
441
6.92k
        builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", ((code_point >> 18) & 0x07) | 0xf0, ((code_point >> 12) & 0x3f) | 0x80, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
442
0
    else
443
0
        VERIFY_NOT_REACHED();
444
27.8M
}
445
446
// https://url.spec.whatwg.org/#c0-control-percent-encode-set
447
bool code_point_is_in_percent_encode_set(u32 code_point, PercentEncodeSet set)
448
236M
{
449
    // NOTE: Once we've checked for presence in the C0Control set, we know that the code point is
450
    //       a valid ASCII character in the range 0x20..0x7E, so we can safely cast it to char.
451
236M
    switch (set) {
452
98.0M
    case PercentEncodeSet::C0Control:
453
98.0M
        return code_point < 0x20 || code_point > 0x7E;
454
50.3M
    case PercentEncodeSet::Fragment:
455
50.3M
        return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::C0Control) || " \"<>`"sv.contains(static_cast<char>(code_point));
456
43.5M
    case PercentEncodeSet::Query:
457
43.5M
        return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::C0Control) || " \"#<>"sv.contains(static_cast<char>(code_point));
458
777k
    case PercentEncodeSet::SpecialQuery:
459
777k
        return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Query) || code_point == '\'';
460
28.2M
    case PercentEncodeSet::Path:
461
28.2M
        return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Query) || "?`{}"sv.contains(static_cast<char>(code_point));
462
15.6M
    case PercentEncodeSet::Userinfo:
463
15.6M
        return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Path) || "/:;=@[\\]^|"sv.contains(static_cast<char>(code_point));
464
0
    case PercentEncodeSet::Component:
465
0
        return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Userinfo) || "$%&+,"sv.contains(static_cast<char>(code_point));
466
0
    case PercentEncodeSet::ApplicationXWWWFormUrlencoded:
467
0
        return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Component) || "!'()~"sv.contains(static_cast<char>(code_point));
468
0
    case PercentEncodeSet::EncodeURI:
469
        // NOTE: This is the same percent encode set that JS encodeURI() uses.
470
        // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
471
0
        return code_point > 0x7E || (!is_ascii_alphanumeric(code_point) && !";,/?:@&=+$-_.!~*'()#"sv.contains(static_cast<char>(code_point)));
472
0
    default:
473
0
        VERIFY_NOT_REACHED();
474
236M
    }
475
236M
}
476
477
void append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, PercentEncodeSet set)
478
32.3M
{
479
32.3M
    if (code_point_is_in_percent_encode_set(code_point, set))
480
27.8M
        append_percent_encoded(builder, code_point);
481
4.51M
    else
482
4.51M
        builder.append_code_point(code_point);
483
32.3M
}
484
485
String percent_encode(StringView input, PercentEncodeSet set, SpaceAsPlus space_as_plus)
486
1.72k
{
487
1.72k
    StringBuilder builder;
488
2.13M
    for (auto code_point : Utf8View(input)) {
489
2.13M
        if (space_as_plus == SpaceAsPlus::Yes && code_point == ' ')
490
0
            builder.append('+');
491
2.13M
        else
492
2.13M
            append_percent_encoded_if_necessary(builder, code_point, set);
493
2.13M
    }
494
1.72k
    return MUST(builder.to_string());
495
1.72k
}
496
497
ByteString percent_decode(StringView input)
498
1.53k
{
499
1.53k
    if (!input.contains('%'))
500
1.19k
        return input;
501
340
    StringBuilder builder;
502
340
    Utf8View utf8_view(input);
503
4.77M
    for (auto it = utf8_view.begin(); !it.done(); ++it) {
504
4.77M
        if (*it != '%') {
505
4.76M
            builder.append_code_point(*it);
506
4.76M
        } else if (!is_ascii_hex_digit(it.peek(1).value_or(0)) || !is_ascii_hex_digit(it.peek(2).value_or(0))) {
507
9.93k
            builder.append_code_point(*it);
508
9.93k
        } else {
509
6.75k
            ++it;
510
6.75k
            u8 byte = parse_ascii_hex_digit(*it) << 4;
511
6.75k
            ++it;
512
6.75k
            byte += parse_ascii_hex_digit(*it);
513
6.75k
            builder.append(byte);
514
6.75k
        }
515
4.77M
    }
516
340
    return builder.to_byte_string();
517
1.53k
}
518
519
}