Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/yarl/_parse.py: 83%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""URL parsing utilities."""
3import re
4import unicodedata
5from functools import lru_cache
6from urllib.parse import scheme_chars, uses_netloc
8from ._quoters import QUOTER, UNQUOTER_PLUS
# Leading and trailing C0 control and space to be stripped per WHATWG spec.
# == "".join([chr(i) for i in range(0, 0x20 + 1)])
WHATWG_C0_CONTROL_OR_SPACE = (
    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10"
    "\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f "
)

# Unsafe bytes to be removed per WHATWG spec
UNSAFE_URL_BYTES_TO_REMOVE = ["\t", "\r", "\n"]
# Schemes for which an (even empty) authority component is emitted.
USES_AUTHORITY = frozenset(uses_netloc)

# (scheme, netloc, path, query, fragment)
SplitURLType = tuple[str, str, str, str, str]


def split_url(url: str) -> SplitURLType:
    """Split URL into parts.

    Leading C0 control characters and spaces are stripped, and tab, CR and
    LF are removed everywhere, before the URL is split.

    Returns a ``(scheme, netloc, path, query, fragment)`` tuple of strings;
    components that are absent are returned as ``""``. The scheme is
    lower-cased, all other components are returned as found.

    Raises ValueError if the authority contains a backslash, if its
    brackets are unbalanced, misplaced or followed by anything other than
    ``:port``, if the bracketed host is not a plausible IPv6/IPvFuture
    literal, or if a non-ASCII netloc produces URL delimiters under NFKC
    normalization.
    """
    # Adapted from urllib.parse.urlsplit
    # Only lstrip url as some applications rely on preserving trailing space.
    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
    url = url.lstrip(WHATWG_C0_CONTROL_OR_SPACE)
    for b in UNSAFE_URL_BYTES_TO_REMOVE:
        if b in url:
            url = url.replace(b, "")

    scheme = netloc = query = fragment = ""
    i = url.find(":")
    if i > 0 and url[0] in scheme_chars:
        # Text before the first ":" is a scheme only if every character
        # is a valid scheme character.
        for c in url[1:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i + 1 :]
    has_hash = "#" in url
    has_question_mark = "?" in url
    if url[:2] == "//":
        delim = len(url)  # position of end of domain part of url, default is end
        if has_hash and has_question_mark:
            delim_chars = "/?#"
        elif has_question_mark:
            delim_chars = "/?"
        elif has_hash:
            delim_chars = "/#"
        else:
            delim_chars = "/"
        for c in delim_chars:  # look for delimiters; the order is NOT important
            wdelim = url.find(c, 2)  # find first of this delim
            if wdelim >= 0 and wdelim < delim:  # if found
                delim = wdelim  # use earliest delim position
        netloc = url[2:delim]
        url = url[delim:]
        # Backslash is not valid in the authority component per RFC 3986.
        # WHATWG parsers treat \ as a path separator for special schemes, so
        # accepting it in the authority can cause host parsing ambiguity.
        if "\\" in netloc:
            raise ValueError(
                "Invalid URL: backslash ('\\') is not allowed in the authority "
                "component per RFC 3986."
            )
        has_left_bracket = "[" in netloc
        has_right_bracket = "]" in netloc
        if (has_left_bracket and not has_right_bracket) or (
            has_right_bracket and not has_left_bracket
        ):
            raise ValueError("Invalid IPv6 URL")
        if has_left_bracket:
            # Per RFC 3986, brackets are only valid at the START of the host
            # for IP-literal addresses. Text before '[' (e.g. '127.0.0.1[::1]')
            # is invalid and must be rejected to prevent SSRF bypasses. The
            # count checks reject URLs with more than one bracket pair in the
            # host subcomponent (e.g. 'http://[:localhost[]].google:80'),
            # which would otherwise resolve to an unintended host.
            hostinfo = netloc.rpartition("@")[2]
            # Slice instead of indexing: the host part is empty when the
            # brackets live only in the userinfo (e.g. 'http://[a]@'), and
            # that must be reported as ValueError rather than IndexError.
            if (
                hostinfo[:1] != "["
                or hostinfo.count("[") > 1
                or hostinfo.count("]") > 1
            ):
                raise ValueError("Invalid IPv6 URL")
            bracketed_host, _, after_bracket = hostinfo[1:].partition("]")
            # Per RFC 3986 section 3.2.2, after the closing ']' of an
            # IP-literal only ':port' or end-of-authority is valid. Any other
            # text (e.g. '[::1]allowed.example:1') must be rejected to prevent
            # host-confusion where the suffix is silently dropped.
            if after_bracket and after_bracket[0] != ":":
                raise ValueError("Invalid IPv6 URL")
            # Valid bracketed hosts are defined in
            # https://www.rfc-editor.org/rfc/rfc3986#page-49
            # https://url.spec.whatwg.org/
            if bracketed_host and bracketed_host[0] == "v":
                if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", bracketed_host):
                    raise ValueError("IPvFuture address is invalid")
            elif ":" not in bracketed_host:
                raise ValueError("The IPv6 content between brackets is not valid")
    if has_hash:
        url, _, fragment = url.partition("#")
    if has_question_mark:
        url, _, query = url.partition("?")
    if netloc and not netloc.isascii():
        _check_netloc(netloc)
    return scheme, netloc, url, query, fragment
108def _check_netloc(netloc: str) -> None:
109 # Adapted from urllib.parse._checknetloc
110 # looking for characters like \u2100 that expand to 'a/c'
111 # IDNA uses NFKC equivalence, so normalize for this check
113 # ignore characters already included
114 # but not the surrounding text
115 n = netloc.replace("@", "").replace(":", "").replace("#", "").replace("?", "")
116 normalized_netloc = unicodedata.normalize("NFKC", n)
117 if n == normalized_netloc:
118 return
119 # Note that there are no unicode decompositions for the character '@' so
120 # its currently impossible to have test coverage for this branch, however if the
121 # one should be added in the future we want to make sure its still checked.
122 for c in "/?#@:": # pragma: no branch
123 if c in normalized_netloc:
124 raise ValueError(
125 f"netloc '{netloc}' contains invalid "
126 "characters under NFKC normalization"
127 )
130@lru_cache # match the same size as urlsplit
131def split_netloc(
132 netloc: str,
133) -> tuple[str | None, str | None, str | None, int | None]:
134 """Split netloc into username, password, host and port."""
135 if "@" not in netloc:
136 username: str | None = None
137 password: str | None = None
138 hostinfo = netloc
139 else:
140 userinfo, _, hostinfo = netloc.rpartition("@")
141 username, have_password, password = userinfo.partition(":")
142 if not have_password:
143 password = None
145 if "[" in hostinfo:
146 if hostinfo[0] != "[" or hostinfo.count("[") > 1 or hostinfo.count("]") > 1:
147 raise ValueError("Invalid IPv6 URL")
148 _, _, bracketed = hostinfo.partition("[")
149 hostname, _, port_str = bracketed.partition("]")
150 # Defense-in-depth: after ']' only ':port' or empty is valid.
151 # split_url() should have already rejected invalid suffixes,
152 # but guard here too for callers that use split_netloc() directly.
153 if port_str and port_str[0] != ":":
154 raise ValueError("Invalid IPv6 URL")
155 _, _, port_str = port_str.partition(":")
156 else:
157 hostname, _, port_str = hostinfo.partition(":")
159 if not port_str:
160 return username or None, password, hostname or None, None
162 try:
163 port = int(port_str)
164 except ValueError:
165 raise ValueError("Invalid URL: port can't be converted to integer")
166 if not (0 <= port <= 65535):
167 raise ValueError("Port out of range 0-65535")
168 return username or None, password, hostname or None, port
def unsplit_result(
    scheme: str, netloc: str, url: str, query: str, fragment: str
) -> str:
    """Unsplit a URL without any normalization.

    Joins the five components back into a single string. An authority
    (``//netloc``) is emitted when ``netloc`` is non-empty, when the scheme
    is one of ``USES_AUTHORITY``, or when the path itself starts with
    ``//``. ``?query`` and ``#fragment`` are appended only when non-empty.
    """
    if netloc or (scheme and scheme in USES_AUTHORITY) or url[:2] == "//":
        # A relative path must be separated from the authority by "/".
        # This applies with or without a scheme; the scheme-less case must
        # keep the netloc and must not emit a bare ":" prefix.
        if url and url[:1] != "/":
            url = f"/{url}"
        url = f"{scheme}://{netloc}{url}" if scheme else f"//{netloc}{url}"
    elif scheme:
        url = f"{scheme}:{url}"
    if query:
        url = f"{url}?{query}"
    return f"{url}#{fragment}" if fragment else url
187@lru_cache # match the same size as urlsplit
188def make_netloc(
189 user: str | None,
190 password: str | None,
191 host: str | None,
192 port: int | None,
193 encode: bool = False,
194) -> str:
195 """Make netloc from parts.
197 The user and password are encoded if encode is True.
199 The host must already be encoded with _encode_host.
200 """
201 if host is None:
202 return ""
203 ret = host
204 if port is not None:
205 ret = f"{ret}:{port}"
206 if user is None and password is None:
207 return ret
208 if password is not None:
209 if not user:
210 user = ""
211 elif encode:
212 user = QUOTER(user)
213 if encode:
214 password = QUOTER(password)
215 user = f"{user}:{password}"
216 elif user and encode:
217 user = QUOTER(user)
218 return f"{user}@{ret}" if user else ret
def query_to_pairs(query_string: str) -> list[tuple[str, str]]:
    """Parse a query given as a string argument.

    The string is split on ``&``; each item is split on its first ``=``
    into a key and a value (the value is ``""`` when there is no ``=``),
    and both halves are passed through ``UNQUOTER_PLUS``. An empty query
    string yields an empty list.

    Works like urllib.parse.parse_qsl with keep empty values.
    """
    if not query_string:
        return []
    return [
        (UNQUOTER_PLUS(key), UNQUOTER_PLUS(value))
        for key, _, value in (item.partition("=") for item in query_string.split("&"))
    ]