Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/httpx/_urlparse.py: 23%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2An implementation of `urlparse` that provides URL validation and normalization
3as described by RFC3986.
5We rely on this implementation rather than the one in Python's stdlib, because:
7* It provides more complete URL validation.
8* It properly differentiates between an empty querystring and an absent querystring,
9 to distinguish URLs with a trailing '?'.
10* It handles scheme, hostname, port, and path normalization.
11* It supports IDNA hostnames, normalizing them to their encoded form.
12* The API supports passing individual components, as well as the complete URL string.
14Previously we relied on the excellent `rfc3986` package to handle URL parsing and
15validation, but this module provides a simpler alternative, with less indirection
16required.
17"""
19from __future__ import annotations
21import ipaddress
22import re
23import typing
25import idna
27from ._exceptions import InvalidURL
# Upper bound on the number of characters accepted for a URL string.
MAX_URL_LENGTH = 65536

# The "unreserved" characters, which never need percent-encoding.
# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._~"
)
SUB_DELIMS = "!$&'()*+,;="

# Matches a single existing escape sequence: '%' followed by two hex digits.
PERCENT_ENCODED_REGEX = re.compile(r"%[A-Fa-f0-9]{2}")
# https://url.spec.whatwg.org/#percent-encoded-bytes
#
# Each "*_SAFE" constant below is the string of printable ASCII characters
# (U+0020 to U+007E) that are *not* members of the matching WHATWG
# percent-encode set, i.e. the characters that may be left unescaped.

# The fragment percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+003C (<), U+003E (>), and U+0060 (`).
_FRAGMENT_ENCODE_SET = frozenset({0x20, 0x22, 0x3C, 0x3E, 0x60})

# The query percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+0023 (#), U+003C (<), and U+003E (>).
_QUERY_ENCODE_SET = frozenset({0x20, 0x22, 0x23, 0x3C, 0x3E})

# The path percent-encode set is the query percent-encode set
# and U+003F (?), U+0060 (`), U+007B ({), and U+007D (}).
_PATH_ENCODE_SET = _QUERY_ENCODE_SET | {0x3F, 0x60, 0x7B, 0x7D}

# The userinfo percent-encode set is the path percent-encode set
# and U+002F (/), U+003A (:), U+003B (;), U+003D (=), U+0040 (@),
# U+005B ([) to U+005E (^), inclusive, and U+007C (|).
_USERINFO_ENCODE_SET = _PATH_ENCODE_SET | {
    0x2F,
    0x3A,
    0x3B,
    0x3D,
    0x40,
    0x5B,
    0x5C,
    0x5D,
    0x5E,
    0x7C,
}


def _printable_ascii_without(encode_set: frozenset[int]) -> str:
    # Printable ASCII, in code point order, minus the given code points.
    return "".join(chr(code) for code in range(0x20, 0x7F) if code not in encode_set)


FRAG_SAFE = _printable_ascii_without(_FRAGMENT_ENCODE_SET)
QUERY_SAFE = _printable_ascii_without(_QUERY_ENCODE_SET)
PATH_SAFE = _printable_ascii_without(_PATH_ENCODE_SET)
USERNAME_SAFE = _printable_ascii_without(_USERINFO_ENCODE_SET)
PASSWORD_SAFE = _printable_ascii_without(_USERINFO_ENCODE_SET)

# Note... The terminology 'userinfo' percent-encode set in the WHATWG document
# is used for the username and password quoting. For the joint userinfo component
# U+003A (:) is additionally treated as safe (it is dropped from the
# percent-encode set), so a "username:password" separator is left unescaped.
USERINFO_SAFE = _printable_ascii_without(_USERINFO_ENCODE_SET - {0x3A})
# A URL reference is matched as five consecutive components:
#
#   {scheme}:       (optional)
#   //{authority}   (optional)
#   {path}
#   ?{query}        (optional)
#   #{fragment}     (optional)
URL_REGEX = re.compile(
    r"(?:(?P<scheme>([a-zA-Z][a-zA-Z0-9+.-]*)?):)?"
    r"(?://(?P<authority>[^/?#]*))?"
    r"(?P<path>[^?#]*)"
    r"(?:\?(?P<query>[^#]*))?"
    r"(?:#(?P<fragment>.*))?"
)

# An authority is matched as three consecutive components:
#
#   {userinfo}@     (optional)  Any character sequence.
#   {host}                      Either an IPv6 address enclosed within square
#                               brackets, or any sequence excluding ':' and '@'.
#   :{port}         (optional)  Any character sequence.
AUTHORITY_REGEX = re.compile(
    r"(?:(?P<userinfo>.*)@)?"
    r"(?P<host>(\[.*\]|[^:@]*))"
    r":?(?P<port>.*)?"
)
# When urlparse is called with individual components, each one is validated
# on its own against the pattern registered here under the component's name.
# The patterns deliberately repeat the ones embedded in the regexes for whole
# URLs and authorities.
_COMPONENT_PATTERNS = {
    "scheme": r"([a-zA-Z][a-zA-Z0-9+.-]*)?",
    "authority": r"[^/?#]*",
    "path": r"[^?#]*",
    "query": r"[^#]*",
    "fragment": r".*",
    "userinfo": r"[^@]*",
    "host": r"(\[.*\]|[^:]*)",
    "port": r".*",
}
COMPONENT_REGEX = {
    name: re.compile(pattern) for name, pattern in _COMPONENT_PATTERNS.items()
}


# Cheap first-pass shape checks; hosts that match are then handed to the
# stdlib 'ipaddress' module for real IP address validation.
# Four dot-separated runs of decimal digits, e.g. "127.0.0.1".
IPv4_STYLE_HOSTNAME = re.compile("^" + r"\.".join(["[0-9]+"] * 4) + "$")
# Anything enclosed in square brackets, e.g. "[::1]".
IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")
class ParseResult(typing.NamedTuple):
    """
    The parsed components of a URL, as produced by `urlparse`.

    `host` is stored without the square brackets used for IPv6 literals; the
    brackets are added back whenever the host is rendered. `port`, `query`
    and `fragment` are `None` when the component is absent.
    """

    scheme: str
    userinfo: str
    host: str
    port: int | None
    path: str
    query: str | None
    fragment: str | None

    @property
    def netloc(self) -> str:
        """The "host[:port]" portion of the URL, excluding any userinfo."""
        # A ':' inside the host marks an IPv6 literal, which must be bracketed.
        host = f"[{self.host}]" if ":" in self.host else self.host
        if self.port is None:
            return host
        return f"{host}:{self.port}"

    @property
    def authority(self) -> str:
        """The "[userinfo@]host[:port]" portion of the URL."""
        if self.userinfo:
            return f"{self.userinfo}@{self.netloc}"
        return self.netloc

    def copy_with(self, **kwargs: str | None) -> ParseResult:
        """
        Return a result with the given components replaced.

        The current scheme, authority, path, query and fragment are used as
        the starting point, overridden by `kwargs`, and the combination is
        passed back through `urlparse`. With no arguments, returns `self`.
        """
        if not kwargs:
            return self

        components = {
            "scheme": self.scheme,
            "authority": self.authority,
            "path": self.path,
            "query": self.query,
            "fragment": self.fragment,
            **kwargs,
        }
        return urlparse("", **components)

    def __str__(self) -> str:
        """Reassemble the components into a URL string."""
        pieces = []
        if self.scheme:
            pieces.append(f"{self.scheme}:")
        authority = self.authority
        if authority:
            pieces.append(f"//{authority}")
        pieces.append(self.path)
        # An empty query or fragment still renders its '?' or '#' delimiter.
        if self.query is not None:
            pieces.append(f"?{self.query}")
        if self.fragment is not None:
            pieces.append(f"#{self.fragment}")
        return "".join(pieces)
def _first_control_character(text: str) -> tuple[int, str] | None:
    # Locate the first non-printable ASCII character as (index, char), if any.
    for index, char in enumerate(text):
        if char.isascii() and not char.isprintable():
            return index, char
    return None


def _split_netloc(netloc: str) -> tuple[str, str]:
    # Split "host[:port]" into (host, port), keeping a bracketed IPv6 host whole.
    if netloc.startswith("["):
        host, bracket, rest = netloc.partition("]")
        # Only accept "[...]" or "[...]:port"; anything else falls through to
        # the plain split below and is rejected by the later validation.
        if bracket and (not rest or rest.startswith(":")):
            return host + bracket, rest[1:]
    host, _, port = netloc.partition(":")
    return host, port


def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse, validate and normalize a URL into a `ParseResult`.

    * `url`: The URL string to parse. May be empty when the URL is described
      entirely by keyword arguments.
    * `kwargs`: Individual components that override whatever was parsed from
      `url`. The accepted names are "scheme", "authority", "path", "query",
      "fragment", "userinfo", "host" and "port", plus the conveniences
      "netloc" (split into host and port), "username" / "password" (quoted
      and joined into userinfo) and "raw_path" (split into path and query).
      Any other name fails the component lookup with a `KeyError`.

    Raises `InvalidURL` if the URL or any component is too long, contains a
    non-printable ASCII character, or fails validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid.
    offending = _first_control_character(url)
    if offending is not None:
        idx, char = offending
        error = (
            f"Invalid non-printable ASCII character in URL, {char!r} at position {idx}."
        )
        raise InvalidURL(error)

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    # A bracketed IPv6 literal such as "[::1]:8080" contains ':' characters
    # inside the host, so it cannot simply be split on the first ':'.
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        kwargs["host"], kwargs["port"] = _split_netloc(netloc)

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "", safe=USERNAME_SAFE)
        password = quote(kwargs.pop("password", "") or "", safe=PASSWORD_SAFE)
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        # No '?' at all means an absent query, rather than an empty one.
        if not separator:
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is None:
            continue

        if len(value) > MAX_URL_LENGTH:
            raise InvalidURL(f"URL component '{key}' too long")

        # If a component includes any ASCII control characters including \t, \r, \n,
        # then treat it as invalid.
        offending = _first_control_character(value)
        if offending is not None:
            idx, char = offending
            error = (
                f"Invalid non-printable ASCII character in URL {key} component, "
                f"{char!r} at position {idx}."
            )
            raise InvalidURL(error)

        # Ensure that keyword arguments match as a valid regex.
        if not COMPONENT_REGEX[key].fullmatch(value):
            raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    frag = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=USERINFO_SAFE)
    parsed_host: str = encode_host(host)
    # Use the lowercased scheme, so that a default port is also dropped for
    # URLs such as "HTTP://example.com:80".
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    # Dot segments are only collapsed for URLs with a scheme or an authority.
    if has_scheme or has_authority:
        path = normalize_path(path)

    parsed_path: str = quote(path, safe=PATH_SAFE)
    parsed_query: str | None = None if query is None else quote(query, safe=QUERY_SAFE)
    parsed_frag: str | None = None if frag is None else quote(frag, safe=FRAG_SAFE)

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_frag,
    )
def encode_host(host: str) -> str:
    """
    Validate a host component and return it in its normalized ASCII form.

    * An empty host is returned as an empty string.
    * A host shaped like `#.#.#.#` must be a valid IPv4 address, and is
      returned unchanged.
    * A host shaped like `[...]` must enclose a valid IPv6 address, which is
      returned without the square brackets.
    * Any other ASCII host is lowercased and percent-quoted.
    * A non-ASCII host is lowercased and IDNA encoded.

    Raises `InvalidURL` for an invalid IPv4 address, IPv6 address, or IDNA
    hostname.

    See https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        address = host[1:-1]
        try:
            ipaddress.IPv6Address(address)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return address

    lowered = host.lower()

    if host.isascii():
        # Regular ASCII hostnames.
        #
        # reg-name = *( unreserved / pct-encoded / sub-delims )
        whatwg_safe = '"`{}%|\\'
        return quote(lowered, safe=SUB_DELIMS + whatwg_safe)

    # IDNA hostnames.
    try:
        return idna.encode(lowered).decode("ascii")
    except idna.IDNAError:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")
def normalize_port(port: str | int | None, scheme: str) -> int | None:
    """
    Convert a port component to an integer, dropping it if it is redundant.

    * `port`: The port as a string or integer, or `None` / "" if absent.
    * `scheme`: The URL scheme, used to look up the scheme's default port.
      Matched case-insensitively.

    Returns `None` if the port is absent or equal to the default port of the
    scheme, and the port number otherwise.

    Raises `InvalidURL` if the port cannot be converted to an integer.
    """
    # From https://tools.ietf.org/html/rfc3986#section-3.2.3
    #
    # "A scheme may define a default port.  For example, the "http" scheme
    # defines a default port of "80", corresponding to its reserved TCP
    # port number.  The type of port designated by the port number (e.g.,
    # TCP, UDP, SCTP) is defined by the URI scheme.  URI producers and
    # normalizers should omit the port component and its ":" delimiter if
    # port is empty or if its value would be the same as that of the
    # scheme's default."
    if port is None or port == "":
        return None

    try:
        port_as_int = int(port)
    except ValueError:
        raise InvalidURL(f"Invalid port: {port!r}")

    # See https://url.spec.whatwg.org/#url-miscellaneous
    default_ports = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}
    # Schemes are case-insensitive, so "HTTP" must select the same default
    # port as "http".
    if port_as_int == default_ports.get(scheme.lower()):
        return None
    return port_as_int
def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Check the path against the rules that depend on which other components
    the URL has. Returns nothing; raises `InvalidURL` on a violation.

    * With an authority, the path must be empty or begin with "/".
    * With neither a scheme nor an authority, the path must not begin with
      "//" or with ":".

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    # "If a URI contains an authority component, then the path component
    # must either be empty or begin with a slash ("/") character."
    if has_authority and path and not path.startswith("/"):
        raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")

    # The remaining rules only apply to relative references.
    if has_scheme or has_authority:
        return

    # "If a URI does not contain an authority component, then the path cannot
    # begin with two slash characters ("//")."
    if path.startswith("//"):
        raise InvalidURL("Relative URLs cannot have a path starting with '//'")

    # "In addition, a URI reference (Section 4.1) may be a relative-path
    # reference, in which case the first path segment cannot contain a colon
    # (":") character." Only a leading colon is rejected here.
    if path.startswith(":"):
        raise InvalidURL("Relative URLs cannot have a path starting with ':'")
def normalize_path(path: str) -> str:
    """
    Remove "." and ".." segments from a URL path.

    A "." segment is discarded. A ".." segment is discarded together with the
    segment before it, except that the empty segment standing for the leading
    "/" is never removed.

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # A path without any '.' character cannot contain dot segments.
    if "." not in path:
        return path

    segments = path.split("/")

    # Dots may appear only inside ordinary segments, e.g. "/index.html".
    if not any(segment in (".", "..") for segment in segments):
        return path

    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: list[str] = []
    for segment in segments:
        if segment == ".":
            continue
        if segment != "..":
            kept.append(segment)
        elif kept and kept != [""]:
            # Step back one segment, but never above the root.
            kept.pop()
    return "/".join(kept)
def PERCENT(string: str) -> str:
    """
    Percent-encode every byte of the UTF-8 encoding of `string`, using
    uppercase hex digits.
    """
    return "".join("%{:02X}".format(octet) for octet in string.encode("utf-8"))
def percent_encoded(string: str, safe: str) -> str:
    """
    Percent-encode every character of `string` that is neither an unreserved
    character nor listed in `safe`.

    Existing '%' characters get no special treatment here: unless '%' is in
    `safe`, it is encoded like any other character.
    """
    allowed = UNRESERVED_CHARACTERS + safe

    # Fast path: stripping allowed characters from the right leaves nothing
    # behind only when every character is allowed.
    if not string.rstrip(allowed):
        return string

    encoded = []
    for char in string:
        if char in allowed:
            encoded.append(char)
        else:
            encoded.append(PERCENT(char))
    return "".join(encoded)
def quote(string: str, safe: str) -> str:
    """
    Percent-encode a string, leaving any existing '%xx' escape sequences
    untouched.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: A string containing characters that may be treated as safe, and do not
        need to be escaped. Unreserved characters are always treated as safe.
        See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    pieces = []
    cursor = 0
    for escape in PERCENT_ENCODED_REGEX.finditer(string):
        # Encode the plain text lying between the previous escape sequence
        # and this one.
        if escape.start() > cursor:
            pieces.append(percent_encoded(string[cursor : escape.start()], safe=safe))

        # Copy the '%xx' escape sequence through unchanged.
        pieces.append(escape.group(0))
        cursor = escape.end()

    # Encode whatever follows the final '%xx' escape sequence.
    remainder = string[cursor:]
    if remainder:
        pieces.append(percent_encoded(remainder, safe=safe))

    return "".join(pieces)