Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/httpx/_urlparse.py: 21%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
"""
An implementation of `urlparse` that provides URL validation and normalization
as described by RFC3986.

We rely on this implementation rather than the one in Python's stdlib, because:

* It provides more complete URL validation.
* It properly differentiates between an empty querystring and an absent querystring,
  to distinguish URLs with a trailing '?'.
* It handles scheme, hostname, port, and path normalization.
* It supports IDNA hostnames, normalizing them to their encoded form.
* The API supports passing individual components, as well as the complete URL string.

Previously we relied on the excellent `rfc3986` package to handle URL parsing and
validation, but this module provides a simpler alternative, with less indirection
required.
"""
19from __future__ import annotations
21import ipaddress
22import re
23import typing
25import idna
27from ._exceptions import InvalidURL
# Upper bound on the length of a full URL string, and of any single
# component passed to `urlparse` as a keyword argument.
MAX_URL_LENGTH = 65536

# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
)
SUB_DELIMS = "!$&'()*+,;="

# Matches a single existing percent-escape such as "%2F" or "%e9".
PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")
# Top-level split of a URL reference into its five components:
#
# {scheme}:           (optional)
# //{authority}       (optional)
# {path}
# ?{query}            (optional)
# #{fragment}         (optional)
#
# Optional components that are absent come back as `None` from `groupdict()`,
# which is how an absent "?query" is told apart from an empty one.
URL_REGEX = re.compile(
    (
        r"(?:(?P<scheme>{scheme}):)?"
        r"(?://(?P<authority>{authority}))?"
        r"(?P<path>{path})"
        r"(?:\?(?P<query>{query}))?"
        r"(?:#(?P<fragment>{fragment}))?"
    ).format(
        scheme="([a-zA-Z][a-zA-Z0-9+.-]*)?",
        authority="[^/?#]*",
        path="[^?#]*",
        query="[^#]*",
        fragment=".*",
    )
)
# Split of the authority component into its parts:
#
# {userinfo}@         (optional)
# {host}
# :{port}             (optional)
#
# This is deliberately permissive; the individual parts are validated later.
AUTHORITY_REGEX = re.compile(
    (
        r"(?:(?P<userinfo>{userinfo})@)?" r"(?P<host>{host})" r":?(?P<port>{port})?"
    ).format(
        userinfo=".*",  # Any character sequence.
        host="(\\[.*\\]|[^:@]*)",  # Either any character sequence excluding ':' or '@',
        # or an IPv6 address enclosed within square brackets.
        port=".*",  # Any character sequence.
    )
)
# If we call urlparse with an individual component, then we need to regex
# validate that component individually.
# Note that we're duplicating the same strings as above. Shock! Horror!!
#
# Keys are the component names accepted as keyword arguments; each value must
# `fullmatch` the corresponding pattern.
COMPONENT_REGEX = {
    "scheme": re.compile("([a-zA-Z][a-zA-Z0-9+.-]*)?"),
    "authority": re.compile("[^/?#]*"),
    "path": re.compile("[^?#]*"),
    "query": re.compile("[^#]*"),
    "fragment": re.compile(".*"),
    "userinfo": re.compile("[^@]*"),
    "host": re.compile("(\\[.*\\]|[^:]*)"),
    "port": re.compile(".*"),
}
# We use these simple regexs as a first pass before handing off to
# the stdlib 'ipaddress' module for IP address validation.
# They only recognise the *shape* of the host ("#.#.#.#" or "[...]");
# they do not check that the address itself is valid.
IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$")
IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")
class ParseResult(typing.NamedTuple):
    """
    The parsed, normalized components of a URL.

    Instances are produced by `urlparse`. `str(result)` reassembles the
    components back into a URL string.
    """

    scheme: str
    userinfo: str
    # Stored without the enclosing "[...]" for IPv6 literals; the brackets
    # are re-added by `netloc` / `authority` whenever the host contains ":".
    host: str
    port: int | None
    path: str
    # `None` means "no '?' present"; an empty string means a bare trailing "?".
    query: str | None
    # `None` means "no '#' present"; an empty string means a bare trailing "#".
    fragment: str | None

    @property
    def authority(self) -> str:
        """Return "[userinfo@]host[:port]"."""
        prefix = f"{self.userinfo}@" if self.userinfo else ""
        return prefix + self.netloc

    @property
    def netloc(self) -> str:
        """Return "host[:port]", bracketing the host if it contains ":"."""
        return "".join(
            [
                f"[{self.host}]" if ":" in self.host else self.host,
                f":{self.port}" if self.port is not None else "",
            ]
        )

    def copy_with(self, **kwargs: str | None) -> ParseResult:
        """
        Return a new `ParseResult` with the given components replaced.

        With no keyword arguments the instance itself is returned. Otherwise
        the current components, overridden by `kwargs`, are passed back
        through `urlparse`, so the result is validated and normalized again.
        """
        if not kwargs:
            return self

        defaults = {
            "scheme": self.scheme,
            "authority": self.authority,
            "path": self.path,
            "query": self.query,
            "fragment": self.fragment,
        }
        defaults.update(kwargs)
        return urlparse("", **defaults)

    def __str__(self) -> str:
        authority = self.authority
        return "".join(
            [
                f"{self.scheme}:" if self.scheme else "",
                f"//{authority}" if authority else "",
                self.path,
                f"?{self.query}" if self.query is not None else "",
                f"#{self.fragment}" if self.fragment is not None else "",
            ]
        )
def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse, validate and normalize a URL, returning a `ParseResult`.

    * `url`: The URL string to parse. May be left empty when the URL is
      assembled purely from keyword components.
    * `**kwargs`: Individual components that take precedence over whatever was
      parsed out of `url`. The directly supported names are the keys of
      `COMPONENT_REGEX` ("scheme", "authority", "path", "query", "fragment",
      "userinfo", "host", "port"). The convenience names "netloc",
      "username", "password" and "raw_path" are rewritten into those
      before validation.

    Raises `InvalidURL` if the URL or any component is too long, contains a
    non-printable ASCII character, fails its component regex, or fails host,
    port or path validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid.
    bad_char = next(
        (char for char in url if char.isascii() and not char.isprintable()), None
    )
    if bad_char is not None:
        idx = url.find(bad_char)
        error = (
            "Invalid non-printable ASCII character in URL, "
            f"{bad_char!r} at position {idx}."
        )
        raise InvalidURL(error)

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        bracket_end = netloc.find("]") if netloc.startswith("[") else -1
        if bracket_end != -1:
            # Bracketed IPv6 literal, e.g. "[::1]:8080". The host runs up to
            # and including the closing "]"; splitting on the first ":" would
            # cut the address itself in half.
            kwargs["host"] = netloc[: bracket_end + 1]
            remainder = netloc[bracket_end + 1 :]
            kwargs["port"] = remainder[1:] if remainder.startswith(":") else remainder
        else:
            kwargs["host"], _, kwargs["port"] = netloc.partition(":")

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "")
        password = quote(kwargs.pop("password", "") or "")
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No "?" at all: the query is absent rather than empty.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is not None:
            if len(value) > MAX_URL_LENGTH:
                raise InvalidURL(f"URL component '{key}' too long")

            # If a component includes any ASCII control characters including \t, \r, \n,
            # then treat it as invalid.
            bad_char = next(
                (
                    char
                    for char in value
                    if char.isascii() and not char.isprintable()
                ),
                None,
            )
            if bad_char is not None:
                idx = value.find(bad_char)
                error = (
                    f"Invalid non-printable ASCII character in URL {key} component, "
                    f"{bad_char!r} at position {idx}."
                )
                raise InvalidURL(error)

            # Ensure that keyword arguments match as a valid regex.
            if not COMPONENT_REGEX[key].fullmatch(value):
                raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    fragment = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
    parsed_host: str = encode_host(host)
    # Use the lowercased scheme so that "HTTP://host:80" drops its default
    # port exactly as "http://host:80" does.
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_scheme or has_authority:
        path = normalize_path(path)

    # The GEN_DELIMS set is... : / ? # [ ] @
    # These do not need to be percent-quoted unless they serve as delimiters for the
    # specific component.
    WHATWG_SAFE = '`{}%|^\\"'

    # For 'path' we need to drop ? and # from the GEN_DELIMS set.
    parsed_path: str = quote(path, safe=SUB_DELIMS + WHATWG_SAFE + ":/[]@")
    # For 'query' we need to drop '#' from the GEN_DELIMS set.
    parsed_query: str | None = (
        None
        if query is None
        else quote(query, safe=SUB_DELIMS + WHATWG_SAFE + ":/?[]@")
    )
    # For 'fragment' we can include all of the GEN_DELIMS set.
    parsed_fragment: str | None = (
        None
        if fragment is None
        else quote(fragment, safe=SUB_DELIMS + WHATWG_SAFE + ":/?#[]@")
    )

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_fragment,
    )
def encode_host(host: str) -> str:
    """
    Validate and normalize the host component of a URL.

    * An empty host is returned as the empty string.
    * "#.#.#.#" style hosts must be valid IPv4 addresses, and are returned as-is.
    * "[...]" style hosts must be valid IPv6 addresses, and are returned
      *without* the enclosing square brackets.
    * Any other ASCII host is lowercased and percent-quoted.
    * Non-ASCII hosts are lowercased and IDNA encoded.

    Raises `InvalidURL` for an invalid IPv4 address, IPv6 address or IDNA
    hostname.
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # Validate IPv4 hostnames like #.#.#.#
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError as exc:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}") from exc
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # Validate IPv6 hostnames like [...]
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]").  This is the only place where
        # square bracket characters are allowed in the URI syntax."
        literal = host[1:-1]
        try:
            ipaddress.IPv6Address(literal)
        except ipaddress.AddressValueError as exc:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}") from exc
        return literal

    if host.isascii():
        # Regular ASCII hostnames
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # reg-name    = *( unreserved / pct-encoded / sub-delims )
        WHATWG_SAFE = '"`{}%|\\'
        return quote(host.lower(), safe=SUB_DELIMS + WHATWG_SAFE)

    # IDNA hostnames
    try:
        return idna.encode(host.lower()).decode("ascii")
    except idna.IDNAError as exc:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}") from exc
def normalize_port(port: str | int | None, scheme: str) -> int | None:
    """
    Validate a port and drop it if it is the default for `scheme`.

    * `port`: The port as a string, an integer, or `None`.
    * `scheme`: The URL scheme, compared case-insensitively.

    Returns `None` if the port is absent, empty, or equal to the scheme's
    default port; otherwise returns the port as an integer.

    Raises `InvalidURL` if a string port is not made up solely of ASCII digits.
    """
    # From https://tools.ietf.org/html/rfc3986#section-3.2.3
    #
    # "A scheme may define a default port.  For example, the "http" scheme
    # defines a default port of "80", corresponding to its reserved TCP
    # port number.  The type of port designated by the port number (e.g.,
    # TCP, UDP, SCTP) is defined by the URI scheme.  URI producers and
    # normalizers should omit the port component and its ":" delimiter if
    # port is empty or if its value would be the same as that of the
    # scheme's default."
    if port is None or port == "":
        return None

    # RFC 3986 defines `port = *DIGIT`. `int()` alone is too lenient: it also
    # accepts signs, surrounding whitespace, underscores and non-ASCII digits.
    if isinstance(port, str) and not (port.isascii() and port.isdigit()):
        raise InvalidURL(f"Invalid port: {port!r}")

    try:
        port_as_int = int(port)
    except ValueError as exc:
        # Still reachable for very long digit strings, which newer Python
        # versions refuse to convert.
        raise InvalidURL(f"Invalid port: {port!r}") from exc

    # See https://url.spec.whatwg.org/#url-miscellaneous
    # Schemes are case-insensitive, so look the default up in lowercase.
    default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get(
        scheme.lower()
    )
    if port_as_int == default_port:
        return None
    return port_as_int
def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Path validation rules that depend on if the URL contains
    a scheme or authority component.

    Returns `None` when the path is acceptable, and raises `InvalidURL`
    otherwise.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    if has_authority:
        # "If a URI contains an authority component, then the path component
        # must either be empty or begin with a slash ("/") character."
        if path and not path.startswith("/"):
            raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")

    if not has_scheme and not has_authority:
        # If a URI does not contain an authority component, then the path cannot begin
        # with two slash characters ("//").
        if path.startswith("//"):
            raise InvalidURL("Relative URLs cannot have a path starting with '//'")

        # In addition, a URI reference (Section 4.1) may be a relative-path reference,
        # in which case the first path segment cannot contain a colon (":") character.
        if path.startswith(":"):
            raise InvalidURL("Relative URLs cannot have a path starting with ':'")
def normalize_path(path: str) -> str:
    """
    Drop "." and ".." segments from a URL path.

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # Fast return when no '.' characters in the path.
    if "." not in path:
        return path

    components = path.split("/")

    # Fast return when no '.' or '..' components in the path.
    if "." not in components and ".." not in components:
        return path

    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    output: list[str] = []
    for component in components:
        if component == ".":
            continue
        if component == "..":
            # Never pop the leading "" that represents the root "/", so that
            # ".." cannot climb above the top of an absolute path.
            if output and output != [""]:
                output.pop()
            continue
        output.append(component)
    return "/".join(output)
def PERCENT(string: str) -> str:
    """
    Percent-encode every byte of the UTF-8 encoding of `string`,
    using uppercase hex digits. For example, "/" becomes "%2F".
    """
    return "".join([f"%{byte:02X}" for byte in string.encode("utf-8")])
def percent_encoded(string: str, safe: str = "/") -> str:
    """
    Use percent-encoding to quote a string.

    * `string`: The string to be percent-escaped. Every character is treated
      literally, including any existing "%".
    * `safe`: Characters to leave unescaped, in addition to the unreserved
      characters, which are never escaped.
    """
    NON_ESCAPED_CHARS = UNRESERVED_CHARACTERS + safe

    # Fast path for strings that don't need escaping.
    # Stripping every allowed character leaves nothing behind exactly when
    # the whole string is made up of allowed characters.
    if not string.rstrip(NON_ESCAPED_CHARS):
        return string

    return "".join(
        [char if char in NON_ESCAPED_CHARS else PERCENT(char) for char in string]
    )
def quote(string: str, safe: str = "/") -> str:
    """
    Use percent-encoding to quote a string, omitting existing '%xx' escape sequences.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: A string containing characters that may be treated as safe, and do not
        need to be escaped. Unreserved characters are always treated as safe.
        See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    parts = []
    current_position = 0
    for match in PERCENT_ENCODED_REGEX.finditer(string):
        start_position, end_position = match.start(), match.end()
        # Add any text up to the '%xx' escape sequence.
        if start_position != current_position:
            leading_text = string[current_position:start_position]
            parts.append(percent_encoded(leading_text, safe=safe))

        # Add the '%xx' escape sequence, untouched.
        parts.append(match.group(0))
        current_position = end_position

    # Add any text after the final '%xx' escape sequence.
    if current_position != len(string):
        trailing_text = string[current_position:]
        parts.append(percent_encoded(trailing_text, safe=safe))

    return "".join(parts)
def urlencode(items: list[tuple[str, str]]) -> str:
    """
    We can use a much simpler version of the stdlib urlencode here because
    we don't need to handle a bunch of different typing cases, such as bytes vs str.

    https://github.com/python/cpython/blob/b2f7b2ef0b5421e01efb8c7bee2ef95d3bab77eb/Lib/urllib/parse.py#L926

    Note that we use '%20' encoding for spaces, and '%2F' for '/'.
    This is slightly different than `requests`, but is the behaviour that browsers use.

    See
    - https://github.com/encode/httpx/issues/2536
    - https://github.com/encode/httpx/issues/2721
    - https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
    """
    # `safe=""` means only the unreserved characters are left unescaped.
    return "&".join(
        [
            percent_encoded(k, safe="") + "=" + percent_encoded(v, safe="")
            for k, v in items
        ]
    )