Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/httpx/_urlparse.py: 22%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
An implementation of `urlparse` that provides URL validation and normalization
as described by RFC3986.

We rely on this implementation rather than the one in Python's stdlib, because:

* It provides more complete URL validation.
* It properly differentiates between an empty querystring and an absent querystring,
  to distinguish URLs with a trailing '?'.
* It handles scheme, hostname, port, and path normalization.
* It supports IDNA hostnames, normalizing them to their encoded form.
* The API supports passing individual components, as well as the complete URL string.

Previously we relied on the excellent `rfc3986` package to handle URL parsing and
validation, but this module provides a simpler alternative, with less indirection
required.
17"""
18from __future__ import annotations
20import ipaddress
21import re
22import typing
24import idna
26from ._exceptions import InvalidURL
# Upper bound applied to the length of a URL string and of each individually
# supplied URL component.
MAX_URL_LENGTH = 65536

# Characters that never need percent-encoding.
# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._~"
)
SUB_DELIMS = "!$&'()*+,;="

# Matches one existing '%xx' escape sequence.
PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")


# Splits a URL into its top-level components:
#
#   {scheme}:       (optional)
#   //{authority}   (optional)
#   {path}
#   ?{query}        (optional)
#   #{fragment}     (optional)
URL_REGEX = re.compile(
    r"(?:(?P<scheme>([a-zA-Z][a-zA-Z0-9+.-]*)?):)?"
    r"(?://(?P<authority>[^/?#]*))?"
    r"(?P<path>[^?#]*)"
    r"(?:\?(?P<query>[^#]*))?"
    r"(?:#(?P<fragment>.*))?"
)

# Splits an authority into its components:
#
#   {userinfo}@     (optional)  any character sequence
#   {host}                      either an IPv6 address enclosed within square
#                               brackets, or any sequence excluding ':' and '@'
#   :{port}         (optional)  any character sequence
AUTHORITY_REGEX = re.compile(
    r"(?:(?P<userinfo>.*)@)?"
    r"(?P<host>(\[.*\]|[^:@]*))"
    r":?(?P<port>.*)?"
)


# When urlparse is called with an individual component, that component is
# validated on its own against the matching pattern below.
# The top-level component patterns intentionally repeat the ones embedded in
# URL_REGEX above.
COMPONENT_REGEX = {
    "scheme": re.compile(r"([a-zA-Z][a-zA-Z0-9+.-]*)?"),
    "authority": re.compile(r"[^/?#]*"),
    "path": re.compile(r"[^?#]*"),
    "query": re.compile(r"[^#]*"),
    "fragment": re.compile(r".*"),
    "userinfo": re.compile(r"[^@]*"),
    "host": re.compile(r"(\[.*\]|[^:]*)"),
    "port": re.compile(r".*"),
}


# Cheap first-pass shape checks; actual IP address validation is handed off
# to the stdlib 'ipaddress' module.
IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$")
IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")
class ParseResult(typing.NamedTuple):
    """
    The components of a parsed URL, as returned by `urlparse`.

    `query` and `fragment` are `None` when absent, which is distinct from
    being present but empty.
    """

    scheme: str
    userinfo: str
    host: str
    port: int | None
    path: str
    query: str | None
    fragment: str | None

    @property
    def authority(self) -> str:
        """The "userinfo@host:port" portion, omitting any empty parts."""
        prefix = f"{self.userinfo}@" if self.userinfo else ""
        return prefix + self.netloc

    @property
    def netloc(self) -> str:
        """The "host:port" portion. A host containing ':' is wrapped in "[...]"."""
        host = self.host
        if ":" in host:
            host = f"[{host}]"
        if self.port is None:
            return host
        return f"{host}:{self.port}"

    def copy_with(self, **kwargs: str | None) -> ParseResult:
        """
        Return a new result with the given components replaced.

        The current components are re-parsed through `urlparse` together with
        the overrides. With no overrides, `self` is returned unchanged.
        """
        if not kwargs:
            return self

        components = {
            "scheme": self.scheme,
            "authority": self.authority,
            "path": self.path,
            "query": self.query,
            "fragment": self.fragment,
            **kwargs,
        }
        return urlparse("", **components)

    def __str__(self) -> str:
        """Reassemble the components into a URL string."""
        pieces = []
        if self.scheme:
            pieces.append(f"{self.scheme}:")
        authority = self.authority
        if authority:
            pieces.append(f"//{authority}")
        pieces.append(self.path)
        # 'None' means the delimiter was absent; an empty string keeps it.
        if self.query is not None:
            pieces.append(f"?{self.query}")
        if self.fragment is not None:
            pieces.append(f"#{self.fragment}")
        return "".join(pieces)
def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse a URL into a validated, normalized `ParseResult`.

    * `url`: The URL string to parse. May be left empty when the URL is
      described through keyword arguments.
    * `kwargs`: Individual components that take precedence over whatever was
      parsed from `url`. The components "scheme", "authority", "path", "query",
      "fragment", "userinfo", "host" and "port" are validated directly.
      The keys "netloc", "username", "password" and "raw_path" are accepted as
      conveniences and translated into those components first. "port" may
      also be given as an integer.

    Raises `InvalidURL` if the URL or a component is too long, contains a
    non-printable ASCII character, or fails validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid.
    if any(char.isascii() and not char.isprintable() for char in url):
        raise InvalidURL("Invalid non-printable ASCII character in URL")

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        # A bracketed IPv6 literal contains ':' itself, so the host/port split
        # must happen after the closing ']' rather than at the first ':'.
        host_end = netloc.find("]") + 1 if netloc.startswith("[") else 0
        if host_end and netloc[host_end : host_end + 1] in ("", ":"):
            kwargs["host"] = netloc[:host_end]
            kwargs["port"] = netloc[host_end + 1 :]
        else:
            kwargs["host"], _, kwargs["port"] = netloc.partition(":")

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "")
        password = quote(kwargs.pop("password", "") or "")
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No "?" at all means an absent query, not an empty one.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is not None:
            if len(value) > MAX_URL_LENGTH:
                raise InvalidURL(f"URL component '{key}' too long")

            # If a component includes any ASCII control characters including \t, \r, \n,
            # then treat it as invalid.
            if any(char.isascii() and not char.isprintable() for char in value):
                raise InvalidURL(
                    f"Invalid non-printable ASCII character in URL component '{key}'"
                )

            # Ensure that keyword arguments match as a valid regex.
            if not COMPONENT_REGEX[key].fullmatch(value):
                raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    fragment = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
    parsed_host: str = encode_host(host)
    # Use the lowercased scheme, so that default ports are also dropped for
    # URLs written with an uppercase scheme such as "HTTP://example.com:80".
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_authority:
        path = normalize_path(path)

    # The GEN_DELIMS set is... : / ? # [ ] @
    # These do not need to be percent-quoted unless they serve as delimiters for the
    # specific component.

    # For 'path' we need to drop ? and # from the GEN_DELIMS set.
    parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
    # For 'query' we need to drop '#' from the GEN_DELIMS set.
    parsed_query: str | None = (
        None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
    )
    # For 'fragment' we can include all of the GEN_DELIMS set.
    parsed_fragment: str | None = (
        None if fragment is None else quote(fragment, safe=SUB_DELIMS + ":/?#[]@")
    )

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_fragment,
    )
def encode_host(host: str) -> str:
    """
    Validate a host and return its normalized ASCII form.

    * An empty host is returned as an empty string.
    * A host shaped like #.#.#.# must be a valid IPv4 address, and is returned as-is.
    * A host enclosed in "[...]" must be a valid IPv6 address, and is returned
      without the enclosing brackets.
    * Any other ASCII host is lowercased and percent-quoted.
    * Any other host is lowercased and IDNA encoded.

    Raises `InvalidURL` if the IPv4, IPv6 or IDNA validation fails.
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        address = host[1:-1]
        try:
            ipaddress.IPv6Address(address)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return address

    lowered = host.lower()

    if host.isascii():
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # reg-name = *( unreserved / pct-encoded / sub-delims )
        return quote(lowered, safe=SUB_DELIMS)

    # Anything left contains non-ASCII characters: treat it as an IDNA hostname.
    try:
        return idna.encode(lowered).decode("ascii")
    except idna.IDNAError:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")
def normalize_port(port: str | int | None, scheme: str) -> int | None:
    """
    Validate a port, and drop it when it is the default for the scheme.

    * `port`: The port as a string of ASCII digits, an integer, or `None`.
    * `scheme`: The URL scheme, used to look up the default port. Matched
      case-insensitively.

    Returns `None` if the port is `None`, empty, or equal to the scheme's
    default port. Otherwise returns the port as an integer.

    Raises `InvalidURL` if the port is not a non-negative decimal number.
    """
    # From https://tools.ietf.org/html/rfc3986#section-3.2.3
    #
    # "A scheme may define a default port.  For example, the "http" scheme
    # defines a default port of "80", corresponding to its reserved TCP
    # port number.  The type of port designated by the port number (e.g.,
    # TCP, UDP, SCTP) is defined by the URI scheme.  URI producers and
    # normalizers should omit the port component and its ":" delimiter if
    # port is empty or if its value would be the same as that of the
    # scheme's default."
    if port is None or port == "":
        return None

    # The grammar is `port = *DIGIT`. A bare int() would additionally accept
    # signs, surrounding whitespace, underscores and non-ASCII digits.
    if isinstance(port, str) and not (port.isascii() and port.isdigit()):
        raise InvalidURL(f"Invalid port: {port!r}")

    try:
        port_as_int = int(port)
    except ValueError:
        raise InvalidURL(f"Invalid port: {port!r}")

    if port_as_int < 0:
        raise InvalidURL(f"Invalid port: {port!r}")

    # See https://url.spec.whatwg.org/#url-miscellaneous
    default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get(
        scheme.lower()
    )
    if port_as_int == default_port:
        return None
    return port_as_int
def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Check the path against the rules that depend on whether the URL has
    a scheme or an authority component.

    Raises `InvalidURL` if a rule is violated, and returns `None` otherwise.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    if has_authority:
        # "If a URI contains an authority component, then the path component
        # must either be empty or begin with a slash ("/") character."
        if path and path[0] != "/":
            raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
        return

    # Without an authority component, the path cannot begin with two slash
    # characters ("//").
    if path.startswith("//"):
        raise InvalidURL(
            "URLs with no authority component cannot have a path starting with '//'"
        )

    # Without a scheme as well, this is a relative-path reference, and a path
    # starting with a colon (":") is rejected.
    if not has_scheme and path.startswith(":"):
        raise InvalidURL(
            "URLs with no scheme component cannot have a path starting with ':'"
        )
def normalize_path(path: str) -> str:
    """
    Remove "." and ".." segments from a URL path.

    A ".." segment also removes the segment before it, but never the leading
    empty segment of an absolute path.

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: list[str] = []
    for segment in path.split("/"):
        if segment == ".":
            continue
        if segment != "..":
            kept.append(segment)
        elif kept and kept != [""]:
            kept.pop()
    return "/".join(kept)
def percent_encode(char: str) -> str:
    """
    Return the percent-encoded form of a single character.

    The character is encoded as UTF-8, and each resulting byte is written as
    '%' followed by two uppercase hex digits. Characters outside the ASCII
    range therefore produce more than one '%XX' sequence.

    For example:

        percent_encode(" ") == "%20"
    """
    return "".join(f"%{octet:02X}" for octet in char.encode("utf-8"))
def is_safe(string: str, safe: str = "/") -> bool:
    """
    Return True if every character in the string is an unreserved character,
    one of the `safe` characters, or '%'.
    """
    allowed = UNRESERVED_CHARACTERS + safe + "%"
    return all(char in allowed for char in string)
def percent_encoded(string: str, safe: str = "/") -> str:
    """
    Use percent-encoding to quote a string.

    * `string`: The string to be percent-escaped.
    * `safe`: Characters that are left as-is, in addition to the unreserved
      characters.

    Every other character, including '%', is replaced with its percent-encoded
    representation. Callers that need to keep existing '%xx' escape sequences
    intact should use `quote` instead.
    """
    NON_ESCAPED_CHARS = UNRESERVED_CHARACTERS + safe

    # Fast path for strings that need no escaping at all. Note that '%' must
    # not count as safe here, otherwise a lone '%' would be passed through
    # unescaped while being escaped as '%25' in any string that also contains
    # some other unsafe character.
    if all(char in NON_ESCAPED_CHARS for char in string):
        return string

    return "".join(
        [char if char in NON_ESCAPED_CHARS else percent_encode(char) for char in string]
    )
def quote(string: str, safe: str = "/") -> str:
    """
    Percent-encode a string, leaving any existing '%xx' escape sequences untouched.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: A string containing characters that may be treated as safe, and do not
        need to be escaped. Unreserved characters are always treated as safe.
        See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    pieces = []
    cursor = 0
    for escape in PERCENT_ENCODED_REGEX.finditer(string):
        # Encode the plain text sitting between the previous escape sequence
        # (or the start of the string) and this one.
        if escape.start() > cursor:
            pieces.append(percent_encoded(string[cursor : escape.start()], safe=safe))

        # Copy the '%xx' escape sequence through verbatim.
        pieces.append(escape.group(0))
        cursor = escape.end()

    # Encode whatever follows the final '%xx' escape sequence.
    if cursor < len(string):
        pieces.append(percent_encoded(string[cursor:], safe=safe))

    return "".join(pieces)
def urlencode(items: list[tuple[str, str]]) -> str:
    """
    Encode a list of (key, value) string pairs as a "k=v&k=v" query string.

    This is a much simpler version of the stdlib urlencode, because there is
    no need to handle a bunch of different typing cases, such as bytes vs str.

    https://github.com/python/cpython/blob/b2f7b2ef0b5421e01efb8c7bee2ef95d3bab77eb/Lib/urllib/parse.py#L926

    Note that we use '%20' encoding for spaces, and '%2F' for '/'.
    This is slightly different than `requests`, but is the behaviour that browsers use.

    See
    - https://github.com/encode/httpx/issues/2536
    - https://github.com/encode/httpx/issues/2721
    - https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
    """
    return "&".join(
        "=".join((percent_encoded(key, safe=""), percent_encoded(value, safe="")))
        for key, value in items
    )