Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org> |
3 | | * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch> |
4 | | * |
5 | | * SPDX-License-Identifier: BSD-2-Clause |
6 | | */ |
7 | | |
8 | | #include <AK/CharacterTypes.h> |
9 | | #include <AK/Debug.h> |
10 | | #include <AK/LexicalPath.h> |
11 | | #include <AK/StringBuilder.h> |
12 | | #include <AK/URL.h> |
13 | | #include <AK/URLParser.h> |
14 | | #include <AK/Utf8View.h> |
15 | | |
16 | | namespace AK { |
17 | | |
18 | | // FIXME: It could make sense to force users of URL to use URLParser::parse() explicitly instead of using a constructor. |
19 | | URL::URL(StringView string) |
20 | | : URL(URLParser::parse(string)) |
21 | 0 | { |
22 | 0 | if constexpr (URL_PARSER_DEBUG) { |
23 | 0 | if (m_valid) |
24 | 0 | dbgln("URL constructor: Parsed URL to be '{}'.", serialize()); |
25 | 0 | else |
26 | 0 | dbgln("URL constructor: Parsed URL to be invalid."); |
27 | 0 | } |
28 | 0 | } |
29 | | |
30 | | String URL::path() const |
31 | 0 | { |
32 | 0 | if (cannot_be_a_base_url()) |
33 | 0 | return paths()[0]; |
34 | 0 | StringBuilder builder; |
35 | 0 | for (auto& path : m_paths) { |
36 | 0 | builder.append('/'); |
37 | 0 | builder.append(path); |
38 | 0 | } |
39 | 0 | return builder.to_string(); |
40 | 0 | } |
41 | | |
42 | | URL URL::complete_url(String const& string) const |
43 | 0 | { |
44 | 0 | if (!is_valid()) |
45 | 0 | return {}; |
46 | | |
47 | 0 | return URLParser::parse(string, this); |
48 | 0 | } |
49 | | |
50 | | void URL::set_scheme(String scheme) |
51 | 0 | { |
52 | 0 | m_scheme = move(scheme); |
53 | 0 | m_valid = compute_validity(); |
54 | 0 | } |
55 | | |
56 | | void URL::set_username(String username) |
57 | 0 | { |
58 | 0 | m_username = move(username); |
59 | 0 | m_valid = compute_validity(); |
60 | 0 | } |
61 | | |
62 | | void URL::set_password(String password) |
63 | 0 | { |
64 | 0 | m_password = move(password); |
65 | 0 | m_valid = compute_validity(); |
66 | 0 | } |
67 | | |
68 | | void URL::set_host(String host) |
69 | 0 | { |
70 | 0 | m_host = move(host); |
71 | 0 | m_valid = compute_validity(); |
72 | 0 | } |
73 | | |
74 | | void URL::set_port(Optional<u16> port) |
75 | 0 | { |
76 | 0 | if (port == default_port_for_scheme(m_scheme)) { |
77 | 0 | m_port = {}; |
78 | 0 | return; |
79 | 0 | } |
80 | 0 | m_port = move(port); |
81 | 0 | m_valid = compute_validity(); |
82 | 0 | } |
83 | | |
84 | | void URL::set_paths(Vector<String> paths) |
85 | 0 | { |
86 | 0 | m_paths = move(paths); |
87 | 0 | m_valid = compute_validity(); |
88 | 0 | } |
89 | | |
90 | | void URL::set_query(String query) |
91 | 0 | { |
92 | 0 | m_query = move(query); |
93 | 0 | } |
94 | | |
95 | | void URL::set_fragment(String fragment) |
96 | 0 | { |
97 | 0 | m_fragment = move(fragment); |
98 | 0 | } |
99 | | |
100 | | // FIXME: This is by no means complete. |
101 | | // NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong. |
102 | | bool URL::compute_validity() const |
103 | 0 | { |
104 | 0 | if (m_scheme.is_empty()) |
105 | 0 | return false; |
106 | | |
107 | 0 | if (m_scheme == "data") { |
108 | 0 | if (m_data_mime_type.is_empty()) |
109 | 0 | return false; |
110 | 0 | if (m_data_payload_is_base64) { |
111 | 0 | if (m_data_payload.length() % 4 != 0) |
112 | 0 | return false; |
113 | 0 | for (auto character : m_data_payload) { |
114 | 0 | if (!is_ascii_alphanumeric(character) || character == '+' || character == '/' || character == '=') |
115 | 0 | return false; |
116 | 0 | } |
117 | 0 | } |
118 | 0 | } else if (m_cannot_be_a_base_url) { |
119 | 0 | if (m_paths.size() != 1) |
120 | 0 | return false; |
121 | 0 | if (m_paths[0].is_empty()) |
122 | 0 | return false; |
123 | 0 | } else { |
124 | 0 | if (m_scheme.is_one_of("about", "mailto")) |
125 | 0 | return false; |
126 | | // NOTE: Maybe it is allowed to have a zero-segment path. |
127 | 0 | if (m_paths.size() == 0) |
128 | 0 | return false; |
129 | 0 | } |
130 | | |
131 | | // NOTE: A file URL's host should be the empty string for localhost, not null. |
132 | 0 | if (m_scheme == "file" && m_host.is_null()) |
133 | 0 | return false; |
134 | | |
135 | 0 | return true; |
136 | 0 | } |
137 | | |
138 | | bool URL::scheme_requires_port(StringView scheme) |
139 | 0 | { |
140 | 0 | return (default_port_for_scheme(scheme) != 0); |
141 | 0 | } |
142 | | |
143 | | u16 URL::default_port_for_scheme(StringView scheme) |
144 | 0 | { |
145 | 0 | if (scheme == "http") |
146 | 0 | return 80; |
147 | 0 | if (scheme == "https") |
148 | 0 | return 443; |
149 | 0 | if (scheme == "gemini") |
150 | 0 | return 1965; |
151 | 0 | if (scheme == "irc") |
152 | 0 | return 6667; |
153 | 0 | if (scheme == "ircs") |
154 | 0 | return 6697; |
155 | 0 | if (scheme == "ws") |
156 | 0 | return 80; |
157 | 0 | if (scheme == "wss") |
158 | 0 | return 443; |
159 | 0 | return 0; |
160 | 0 | } |
161 | | |
162 | | URL URL::create_with_file_scheme(String const& path, String const& fragment, String const& hostname) |
163 | 0 | { |
164 | 0 | LexicalPath lexical_path(path); |
165 | 0 | if (!lexical_path.is_absolute()) |
166 | 0 | return {}; |
167 | | |
168 | 0 | URL url; |
169 | 0 | url.set_scheme("file"); |
170 | | // NOTE: If the hostname is localhost (or null, which implies localhost), it should be set to the empty string. |
171 | | // This is because a file URL always needs a non-null hostname. |
172 | 0 | url.set_host(hostname.is_null() || hostname == "localhost" ? String::empty() : hostname); |
173 | 0 | url.set_paths(lexical_path.parts()); |
174 | | // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment. |
175 | 0 | if (path.ends_with('/')) |
176 | 0 | url.append_path(""); |
177 | 0 | url.set_fragment(fragment); |
178 | 0 | return url; |
179 | 0 | } |
180 | | |
181 | | URL URL::create_with_help_scheme(String const& path, String const& fragment, String const& hostname) |
182 | 0 | { |
183 | 0 | LexicalPath lexical_path(path); |
184 | |
|
185 | 0 | URL url; |
186 | 0 | url.set_scheme("help"); |
187 | | // NOTE: If the hostname is localhost (or null, which implies localhost), it should be set to the empty string. |
188 | | // This is because a file URL always needs a non-null hostname. |
189 | 0 | url.set_host(hostname.is_null() || hostname == "localhost" ? String::empty() : hostname); |
190 | 0 | url.set_paths(lexical_path.parts()); |
191 | | // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment. |
192 | 0 | if (path.ends_with('/')) |
193 | 0 | url.append_path(""); |
194 | 0 | url.set_fragment(fragment); |
195 | 0 | return url; |
196 | 0 | } |
197 | | |
198 | | URL URL::create_with_url_or_path(String const& url_or_path) |
199 | 0 | { |
200 | 0 | URL url = url_or_path; |
201 | 0 | if (url.is_valid()) |
202 | 0 | return url; |
203 | | |
204 | 0 | String path = LexicalPath::canonicalized_path(url_or_path); |
205 | 0 | return URL::create_with_file_scheme(path); |
206 | 0 | } |
207 | | |
208 | | // https://url.spec.whatwg.org/#special-scheme |
209 | | bool URL::is_special_scheme(StringView scheme) |
210 | 0 | { |
211 | 0 | return scheme.is_one_of("ftp", "file", "http", "https", "ws", "wss"); |
212 | 0 | } |
213 | | |
214 | | String URL::serialize_data_url() const |
215 | 0 | { |
216 | 0 | VERIFY(m_scheme == "data"); |
217 | 0 | VERIFY(!m_data_mime_type.is_null()); |
218 | 0 | VERIFY(!m_data_payload.is_null()); |
219 | 0 | StringBuilder builder; |
220 | 0 | builder.append(m_scheme); |
221 | 0 | builder.append(':'); |
222 | 0 | builder.append(m_data_mime_type); |
223 | 0 | if (m_data_payload_is_base64) |
224 | 0 | builder.append(";base64"); |
225 | 0 | builder.append(','); |
226 | | // NOTE: The specification does not say anything about encoding this, but we should encode at least control and non-ASCII |
227 | | // characters (since this is also a valid representation of the same data URL). |
228 | 0 | builder.append(URL::percent_encode(m_data_payload, PercentEncodeSet::C0Control)); |
229 | 0 | return builder.to_string(); |
230 | 0 | } |
231 | | |
232 | | // https://url.spec.whatwg.org/#concept-url-serializer |
233 | | String URL::serialize(ExcludeFragment exclude_fragment) const |
234 | 0 | { |
235 | 0 | if (m_scheme == "data") |
236 | 0 | return serialize_data_url(); |
237 | 0 | StringBuilder builder; |
238 | 0 | builder.append(m_scheme); |
239 | 0 | builder.append(':'); |
240 | |
|
241 | 0 | if (!m_host.is_null()) { |
242 | 0 | builder.append("//"); |
243 | |
|
244 | 0 | if (includes_credentials()) { |
245 | 0 | builder.append(percent_encode(m_username, PercentEncodeSet::Userinfo)); |
246 | 0 | if (!m_password.is_empty()) { |
247 | 0 | builder.append(':'); |
248 | 0 | builder.append(percent_encode(m_password, PercentEncodeSet::Userinfo)); |
249 | 0 | } |
250 | 0 | builder.append('@'); |
251 | 0 | } |
252 | |
|
253 | 0 | builder.append(m_host); |
254 | 0 | if (m_port.has_value()) |
255 | 0 | builder.appendff(":{}", *m_port); |
256 | 0 | } |
257 | |
|
258 | 0 | if (cannot_be_a_base_url()) { |
259 | 0 | builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path)); |
260 | 0 | } else { |
261 | 0 | if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty()) |
262 | 0 | builder.append("/."); |
263 | 0 | for (auto& segment : m_paths) { |
264 | 0 | builder.append('/'); |
265 | 0 | builder.append(percent_encode(segment, PercentEncodeSet::Path)); |
266 | 0 | } |
267 | 0 | } |
268 | |
|
269 | 0 | if (!m_query.is_null()) { |
270 | 0 | builder.append('?'); |
271 | 0 | builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query)); |
272 | 0 | } |
273 | |
|
274 | 0 | if (exclude_fragment == ExcludeFragment::No && !m_fragment.is_null()) { |
275 | 0 | builder.append('#'); |
276 | 0 | builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment)); |
277 | 0 | } |
278 | |
|
279 | 0 | return builder.to_string(); |
280 | 0 | } |
281 | | |
282 | | // https://url.spec.whatwg.org/#url-rendering |
283 | | // NOTE: This does e.g. not display credentials. |
284 | | // FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points |
285 | | // resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible. |
286 | | String URL::serialize_for_display() const |
287 | 0 | { |
288 | 0 | VERIFY(m_valid); |
289 | 0 | if (m_scheme == "data") |
290 | 0 | return serialize_data_url(); |
291 | 0 | StringBuilder builder; |
292 | 0 | builder.append(m_scheme); |
293 | 0 | builder.append(':'); |
294 | |
|
295 | 0 | if (!m_host.is_null()) { |
296 | 0 | builder.append("//"); |
297 | 0 | builder.append(m_host); |
298 | 0 | if (m_port.has_value()) |
299 | 0 | builder.appendff(":{}", *m_port); |
300 | 0 | } |
301 | |
|
302 | 0 | if (cannot_be_a_base_url()) { |
303 | 0 | builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path)); |
304 | 0 | } else { |
305 | 0 | if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty()) |
306 | 0 | builder.append("/."); |
307 | 0 | for (auto& segment : m_paths) { |
308 | 0 | builder.append('/'); |
309 | 0 | builder.append(percent_encode(segment, PercentEncodeSet::Path)); |
310 | 0 | } |
311 | 0 | } |
312 | |
|
313 | 0 | if (!m_query.is_null()) { |
314 | 0 | builder.append('?'); |
315 | 0 | builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query)); |
316 | 0 | } |
317 | |
|
318 | 0 | if (!m_fragment.is_null()) { |
319 | 0 | builder.append('#'); |
320 | 0 | builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment)); |
321 | 0 | } |
322 | |
|
323 | 0 | return builder.to_string(); |
324 | 0 | } |
325 | | |
326 | | // https://html.spec.whatwg.org/multipage/origin.html#ascii-serialisation-of-an-origin |
327 | | // https://url.spec.whatwg.org/#concept-url-origin |
328 | | String URL::serialize_origin() const |
329 | 0 | { |
330 | 0 | VERIFY(m_valid); |
331 | | |
332 | 0 | if (m_scheme == "blob"sv) { |
333 | | // TODO: 1. If URL’s blob URL entry is non-null, then return URL’s blob URL entry’s environment’s origin. |
334 | | // 2. Let url be the result of parsing URL’s path[0]. |
335 | 0 | VERIFY(!m_paths.is_empty()); |
336 | 0 | URL url = m_paths[0]; |
337 | | // 3. Return a new opaque origin, if url is failure, and url’s origin otherwise. |
338 | 0 | if (!url.is_valid()) |
339 | 0 | return "null"; |
340 | 0 | return url.serialize_origin(); |
341 | 0 | } else if (!m_scheme.is_one_of("ftp"sv, "http"sv, "https"sv, "ws"sv, "wss"sv)) { // file: "Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin." |
342 | 0 | return "null"; |
343 | 0 | } |
344 | | |
345 | 0 | StringBuilder builder; |
346 | 0 | builder.append(m_scheme); |
347 | 0 | builder.append("://"sv); |
348 | 0 | builder.append(m_host); |
349 | 0 | if (m_port.has_value()) |
350 | 0 | builder.append(":{}", *m_port); |
351 | 0 | return builder.build(); |
352 | 0 | } |
353 | | |
354 | | bool URL::equals(URL const& other, ExcludeFragment exclude_fragments) const |
355 | 0 | { |
356 | 0 | if (this == &other) |
357 | 0 | return true; |
358 | 0 | if (!m_valid || !other.m_valid) |
359 | 0 | return false; |
360 | 0 | return serialize(exclude_fragments) == other.serialize(exclude_fragments); |
361 | 0 | } |
362 | | |
363 | | String URL::basename() const |
364 | 0 | { |
365 | 0 | if (!m_valid) |
366 | 0 | return {}; |
367 | 0 | if (m_paths.is_empty()) |
368 | 0 | return {}; |
369 | 0 | return m_paths.last(); |
370 | 0 | } |
371 | | |
372 | | void URL::append_percent_encoded(StringBuilder& builder, u32 code_point) |
373 | 0 | { |
374 | 0 | if (code_point <= 0x7f) |
375 | 0 | builder.appendff("%{:02X}", code_point); |
376 | 0 | else if (code_point <= 0x07ff) |
377 | 0 | builder.appendff("%{:02X}%{:02X}", ((code_point >> 6) & 0x1f) | 0xc0, (code_point & 0x3f) | 0x80); |
378 | 0 | else if (code_point <= 0xffff) |
379 | 0 | builder.appendff("%{:02X}%{:02X}%{:02X}", ((code_point >> 12) & 0x0f) | 0xe0, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80); |
380 | 0 | else if (code_point <= 0x10ffff) |
381 | 0 | builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", ((code_point >> 18) & 0x07) | 0xf0, ((code_point >> 12) & 0x3f) | 0x80, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80); |
382 | 0 | else |
383 | 0 | VERIFY_NOT_REACHED(); |
384 | 0 | } |
385 | | |
386 | | // https://url.spec.whatwg.org/#c0-control-percent-encode-set |
387 | | bool URL::code_point_is_in_percent_encode_set(u32 code_point, URL::PercentEncodeSet set) |
388 | 0 | { |
389 | 0 | switch (set) { |
390 | 0 | case URL::PercentEncodeSet::C0Control: |
391 | 0 | return code_point < 0x20 || code_point > 0x7E; |
392 | 0 | case URL::PercentEncodeSet::Fragment: |
393 | 0 | return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"<>`"sv.contains(code_point); |
394 | 0 | case URL::PercentEncodeSet::Query: |
395 | 0 | return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"#<>"sv.contains(code_point); |
396 | 0 | case URL::PercentEncodeSet::SpecialQuery: |
397 | 0 | return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || code_point == '\''; |
398 | 0 | case URL::PercentEncodeSet::Path: |
399 | 0 | return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || "?`{}"sv.contains(code_point); |
400 | 0 | case URL::PercentEncodeSet::Userinfo: |
401 | 0 | return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Path) || "/:;=@[\\]^|"sv.contains(code_point); |
402 | 0 | case URL::PercentEncodeSet::Component: |
403 | 0 | return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Userinfo) || "$%&+,"sv.contains(code_point); |
404 | 0 | case URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded: |
405 | 0 | return code_point >= 0x7E || !(is_ascii_alphanumeric(code_point) || "!'()~"sv.contains(code_point)); |
406 | 0 | case URL::PercentEncodeSet::EncodeURI: |
407 | | // NOTE: This is the same percent encode set that JS encodeURI() uses. |
408 | | // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI |
409 | 0 | return code_point >= 0x7E || (!is_ascii_alphanumeric(code_point) && !";,/?:@&=+$-_.!~*'()#"sv.contains(code_point)); |
410 | 0 | default: |
411 | 0 | VERIFY_NOT_REACHED(); |
412 | 0 | } |
413 | 0 | } |
414 | | |
415 | | void URL::append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, URL::PercentEncodeSet set) |
416 | 0 | { |
417 | 0 | if (code_point_is_in_percent_encode_set(code_point, set)) |
418 | 0 | append_percent_encoded(builder, code_point); |
419 | 0 | else |
420 | 0 | builder.append_code_point(code_point); |
421 | 0 | } |
422 | | |
423 | | String URL::percent_encode(StringView input, URL::PercentEncodeSet set, SpaceAsPlus space_as_plus) |
424 | 0 | { |
425 | 0 | StringBuilder builder; |
426 | 0 | for (auto code_point : Utf8View(input)) { |
427 | 0 | if (space_as_plus == SpaceAsPlus::Yes && code_point == ' ') |
428 | 0 | builder.append('+'); |
429 | 0 | else |
430 | 0 | append_percent_encoded_if_necessary(builder, code_point, set); |
431 | 0 | } |
432 | 0 | return builder.to_string(); |
433 | 0 | } |
434 | | |
435 | | String URL::percent_decode(StringView input) |
436 | 872 | { |
437 | 872 | if (!input.contains('%')) |
438 | 215 | return input; |
439 | 657 | StringBuilder builder; |
440 | 657 | Utf8View utf8_view(input); |
441 | 1.51M | for (auto it = utf8_view.begin(); !it.done(); ++it) { |
442 | 1.51M | if (*it != '%') { |
443 | 1.40M | builder.append_code_point(*it); |
444 | 1.40M | } else if (!is_ascii_hex_digit(it.peek(1).value_or(0)) || !is_ascii_hex_digit(it.peek(2).value_or(0))) { |
445 | 95.7k | builder.append_code_point(*it); |
446 | 95.7k | } else { |
447 | 9.90k | ++it; |
448 | 9.90k | u8 byte = parse_ascii_hex_digit(*it) << 4; |
449 | 9.90k | ++it; |
450 | 9.90k | byte += parse_ascii_hex_digit(*it); |
451 | 9.90k | builder.append(byte); |
452 | 9.90k | } |
453 | 1.51M | } |
454 | 657 | return builder.to_string(); |
455 | 872 | } |
456 | | |
457 | | } |