Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/httpx/_urlparse.py: 80%
162 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 07:19 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 07:19 +0000
1"""
2An implementation of `urlparse` that provides URL validation and normalization
3as described by RFC3986.
5We rely on this implementation rather than the one in Python's stdlib, because:
7* It provides more complete URL validation.
8* It properly differentiates between an empty querystring and an absent querystring,
9 to distinguish URLs with a trailing '?'.
10* It handles scheme, hostname, port, and path normalization.
11* It supports IDNA hostnames, normalizing them to their encoded form.
12* The API supports passing individual components, as well as the complete URL string.
14Previously we relied on the excellent `rfc3986` package to handle URL parsing and
15validation, but this module provides a simpler alternative, with less indirection
16required.
17"""
18import ipaddress
19import re
20import typing
22import idna
24from ._exceptions import InvalidURL
# Hard upper bound on the length of a URL, or of any single URL component.
MAX_URL_LENGTH = 65536

# The "unreserved" character set: letters, digits, and "-", ".", "_", "~".
# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._~"
)

# The "sub-delims" character set.
SUB_DELIMS = "!$&'()*+,;="

# Matches a single '%xx' percent-escape sequence.
PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")
# Splits a URL into its five top-level components:
#
#   {scheme}:        (optional)
#   //{authority}    (optional)
#   {path}
#   ?{query}         (optional)
#   #{fragment}      (optional)
#
# The template pieces are joined in order, then each placeholder is filled in
# with the pattern for that component.
URL_REGEX = re.compile(
    "".join(
        [
            r"(?:(?P<scheme>{scheme}):)?",
            r"(?://(?P<authority>{authority}))?",
            r"(?P<path>{path})",
            r"(?:\?(?P<query>{query}))?",
            r"(?:#(?P<fragment>{fragment}))?",
        ]
    ).format(
        **{
            "scheme": "([a-zA-Z][a-zA-Z0-9+.-]*)?",
            "authority": "[^/?#]*",
            "path": "[^?#]*",
            "query": "[^#]*",
            "fragment": ".*",
        }
    )
)
# Splits an authority into its three components:
#
#   {userinfo}@    (optional)
#   {host}
#   :{port}        (optional)
AUTHORITY_REGEX = re.compile(
    "".join(
        [
            r"(?:(?P<userinfo>{userinfo})@)?",
            r"(?P<host>{host})",
            r":?(?P<port>{port})?",
        ]
    ).format(
        **{
            # Any character sequence not including '@'.
            "userinfo": "[^@]*",
            # Either an IPv6 address enclosed within square brackets,
            # or any character sequence not including ':'.
            "host": "(\\[.*\\]|[^:]*)",
            # Any character sequence.
            "port": ".*",
        }
    )
)
# When urlparse is called with individual components, each one is validated
# on its own against the pattern for that component.
# The pattern strings deliberately duplicate the ones used to build the
# URL and authority regexes.
COMPONENT_REGEX = {
    name: re.compile(pattern)
    for name, pattern in (
        ("scheme", "([a-zA-Z][a-zA-Z0-9+.-]*)?"),
        ("authority", "[^/?#]*"),
        ("path", "[^?#]*"),
        ("query", "[^#]*"),
        ("fragment", ".*"),
        ("userinfo", "[^@]*"),
        ("host", "(\\[.*\\]|[^:]*)"),
        ("port", ".*"),
    )
}
# We use these simple regexes as a first pass before handing off to
# the stdlib 'ipaddress' module for IP address validation.
#
# The dots in the IPv4 pattern are escaped so that only literal "." separators
# match. With a bare "." (any character) an all-digit or mixed name such as
# "1234567" would be classified as an IPv4 address.
IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$")
# Anything enclosed in square brackets is treated as an IPv6-style host.
IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")
class ParseResult(typing.NamedTuple):
    """
    The parsed components of a URL, as produced by `urlparse`.
    """

    scheme: str
    userinfo: str
    # Stored without enclosing square brackets; they are added back by
    # `netloc` and `authority` whenever the host contains a ':'.
    host: str
    port: typing.Optional[int]
    path: str
    # 'None' means the component is absent. An empty string is still rendered
    # with its leading '?' or '#' delimiter by `__str__`.
    query: typing.Optional[str]
    fragment: typing.Optional[str]

    @property
    def authority(self) -> str:
        """
        The "userinfo@host:port" portion, omitting userinfo if it is empty.
        """
        prefix = f"{self.userinfo}@" if self.userinfo else ""
        return prefix + self.netloc

    @property
    def netloc(self) -> str:
        """
        The "host:port" portion, omitting the port if it is 'None'.
        """
        host = self.host
        if ":" in host:
            host = f"[{host}]"
        if self.port is None:
            return host
        return f"{host}:{self.port}"

    def copy_with(self, **kwargs: typing.Optional[str]) -> "ParseResult":
        """
        Return a new result with the given components replaced.

        With no keyword arguments the instance itself is returned. Otherwise
        the current components, overridden by `kwargs`, are passed back
        through `urlparse`.
        """
        if not kwargs:
            return self

        components = {
            **{
                "scheme": self.scheme,
                "authority": self.authority,
                "path": self.path,
                "query": self.query,
                "fragment": self.fragment,
            },
            **kwargs,
        }
        return urlparse("", **components)

    def __str__(self) -> str:
        pieces = []
        if self.scheme:
            pieces.append(f"{self.scheme}:")
        authority = self.authority
        if authority:
            pieces.append(f"//{authority}")
        pieces.append(self.path)
        if self.query is not None:
            pieces.append(f"?{self.query}")
        if self.fragment is not None:
            pieces.append(f"#{self.fragment}")
        return "".join(pieces)
def urlparse(url: str = "", **kwargs: typing.Optional[str]) -> ParseResult:
    """
    Parse, validate, and normalize a URL, returning a `ParseResult`.

    The URL may be given as a complete string, as individual components
    passed as keyword arguments, or as both, in which case a keyword
    argument takes precedence over the corresponding part of `url`.

    Supported keyword arguments are the components "scheme", "authority",
    "path", "query", "fragment", "userinfo", "host" and "port", plus the
    convenience forms "netloc" (split into "host" and "port"), "username" and
    "password" (quoted and combined into "userinfo"), and "raw_path" (split
    into "path" and "query"). Any other keyword with a non-None value fails
    the component lookup with a `KeyError`.

    Raises `InvalidURL` if the URL or any component is too long, contains a
    non-printable ASCII character, does not match the pattern for its
    component, or fails host, port, or path validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid.
    if any(char.isascii() and not char.isprintable() for char in url):
        raise InvalidURL("Invalid non-printable ASCII character in URL")

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        # Only a ":" that follows any closing "]" separates host from port.
        # Splitting on the first ":" would cut a bracketed IPv6 literal such
        # as "[::1]:8080" in the middle of the address.
        port_separator = netloc.find(":", netloc.rfind("]") + 1)
        if port_separator == -1:
            kwargs["host"], kwargs["port"] = netloc, ""
        else:
            kwargs["host"] = netloc[:port_separator]
            kwargs["port"] = netloc[port_separator + 1 :]

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "")
        password = quote(kwargs.pop("password", "") or "")
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No "?" at all means the query is absent, rather than empty.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is not None:
            if len(value) > MAX_URL_LENGTH:
                raise InvalidURL(f"URL component '{key}' too long")

            # If a component includes any ASCII control characters including
            # \t, \r, \n, then treat it as invalid.
            if any(char.isascii() and not char.isprintable() for char in value):
                raise InvalidURL(
                    f"Invalid non-printable ASCII character in URL component '{key}'"
                )

            # Ensure that keyword arguments match as a valid regex.
            if not COMPONENT_REGEX[key].fullmatch(value):
                raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    fragment = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
    parsed_host: str = encode_host(host)
    # Use the lower-cased scheme, so that a default port is also dropped for
    # URLs written with an upper or mixed case scheme such as "HTTP://".
    parsed_port: typing.Optional[int] = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_authority:
        path = normalize_path(path)

    # The GEN_DELIMS set is... : / ? # [ ] @
    # These do not need to be percent-quoted unless they serve as delimiters
    # for the specific component.

    # For 'path' we need to drop ? and # from the GEN_DELIMS set.
    parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
    # For 'query' we need to drop '#' from the GEN_DELIMS set.
    parsed_query: typing.Optional[str] = (
        None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
    )
    # For 'fragment' we can include all of the GEN_DELIMS set.
    parsed_fragment: typing.Optional[str] = (
        None if fragment is None else quote(fragment, safe=SUB_DELIMS + ":/?#[]@")
    )

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_fragment,
    )
def encode_host(host: str) -> str:
    """
    Validate a host and return its normalized form.

    * An empty host is returned as an empty string.
    * IPv4-style hosts are validated and returned unchanged.
    * Bracketed IPv6-style hosts are validated and returned without brackets.
    * Other ASCII hosts are lower-cased and percent-quoted.
    * Non-ASCII hosts are lower-cased and IDNA encoded.

    Raises `InvalidURL` if IP address validation or IDNA encoding fails.
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # Validate IPv4 hostnames like #.#.#.#
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # Validate IPv6 hostnames like [...]
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        unbracketed = host[1:-1]
        try:
            ipaddress.IPv6Address(unbracketed)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return unbracketed

    lowered = host.lower()

    if host.isascii():
        # Regular ASCII hostnames
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # reg-name = *( unreserved / pct-encoded / sub-delims )
        return quote(lowered, safe=SUB_DELIMS)

    # IDNA hostnames
    try:
        return idna.encode(lowered).decode("ascii")
    except idna.IDNAError:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")
def normalize_port(
    port: typing.Optional[typing.Union[str, int]], scheme: str
) -> typing.Optional[int]:
    """
    Convert a port to an integer, dropping it if it is the scheme's default.

    Returns 'None' if `port` is 'None', an empty string, or equal to the
    default port for `scheme`. The scheme is compared case-insensitively.

    Raises `InvalidURL` if `port` cannot be converted to an integer.
    """
    # From https://tools.ietf.org/html/rfc3986#section-3.2.3
    #
    # "A scheme may define a default port.  For example, the "http" scheme
    # defines a default port of "80", corresponding to its reserved TCP
    # port number.  The type of port designated by the port number (e.g.,
    # TCP, UDP, SCTP) is defined by the URI scheme.  URI producers and
    # normalizers should omit the port component and its ":" delimiter if
    # port is empty or if its value would be the same as that of the
    # scheme's default."
    if port is None or port == "":
        return None

    try:
        port_as_int = int(port)
    except ValueError:
        raise InvalidURL(f"Invalid port: {port!r}")

    # See https://url.spec.whatwg.org/#url-miscellaneous
    #
    # Schemes are case-insensitive, so lower-case before the lookup. Otherwise
    # "HTTP" with port 80 would keep its redundant default port.
    default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get(
        scheme.lower()
    )
    if port_as_int == default_port:
        return None
    return port_as_int
def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Check a path against the rules that depend on whether the URL has a
    scheme or an authority component, raising `InvalidURL` on a violation.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    if has_authority:
        # > If a URI contains an authority component, then the path component
        # > must either be empty or begin with a slash ("/") character."
        if path[:1] not in ("", "/"):
            raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
        return

    # > If a URI does not contain an authority component, then the path cannot
    # > begin with two slash characters ("//").
    if path.startswith("//"):
        raise InvalidURL(
            "URLs with no authority component cannot have a path starting with '//'"
        )

    # > In addition, a URI reference (Section 4.1) may be a relative-path
    # > reference, in which case the first path segment cannot contain a
    # > colon (":") character.
    if not has_scheme and path.startswith(":"):
        raise InvalidURL(
            "URLs with no scheme component cannot have a path starting with ':'"
        )
def normalize_path(path: str) -> str:
    """
    Remove "." and ".." segments from a URL path.

    A ".." segment drops the segment before it, but never the leading empty
    segment that stands for the initial "/".

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: typing.List[str] = []
    for segment in path.split("/"):
        if segment == ".":
            continue
        if segment != "..":
            kept.append(segment)
        elif kept and kept != [""]:
            kept.pop()
    return "/".join(kept)
def percent_encode(char: str) -> str:
    """
    Return the percent-encoded representation of a single character.

    The character is encoded as UTF-8, and each resulting byte is written as
    '%' followed by two upper-case hexadecimal digits.

    For example:

        percent_encode(" ") == "%20"
    """
    utf8_bytes = char.encode("utf-8")
    return "".join(f"%{byte:02X}" for byte in utf8_bytes)
def is_safe(string: str, safe: str = "/") -> bool:
    """
    Return True if the string can be used as-is, without any quoting.

    That requires every character to be unreserved, listed in `safe`, or '%',
    and every '%' to begin a valid '%xx' escape sequence.
    """
    allowed = UNRESERVED_CHARACTERS + safe + "%"

    # Any character outside the allowed set means quoting is required.
    if any(char not in allowed for char in string):
        return False

    # Each '%' must be accounted for by a valid '%xx' escape sequence.
    escapes = PERCENT_ENCODED_REGEX.findall(string)
    return len(escapes) == string.count("%")
def quote(string: str, safe: str = "/") -> str:
    """
    Percent-encode a string, unless it is already quote-safe.

    A string that passes `is_safe` is returned unchanged. Otherwise every
    character that is neither unreserved nor listed in `safe` is replaced
    with its percent-encoded form.
    """
    if is_safe(string, safe=safe):
        return string

    unescaped = UNRESERVED_CHARACTERS + safe
    pieces = []
    for char in string:
        if char in unescaped:
            pieces.append(char)
        else:
            pieces.append(percent_encode(char))
    return "".join(pieces)
def urlencode(items: typing.List[typing.Tuple[str, str]]) -> str:
    """
    Encode a list of (key, value) string pairs as a URL query string.
    """
    # This is a much simpler version of the stdlib urlencode, since there is
    # no need to handle different typing cases, such as bytes vs str.
    #
    # https://github.com/python/cpython/blob/b2f7b2ef0b5421e01efb8c7bee2ef95d3bab77eb/Lib/urllib/parse.py#L926
    #
    # Spaces are encoded as '%20', and '/' is treated as a safe character, so
    # query params get the same escaping as other characters in the URL path.
    # This is slightly different to `requests`, but is the behaviour that
    # browsers use.
    #
    # See https://github.com/encode/httpx/issues/2536 and
    # https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
    pairs = (f"{quote(key)}={quote(value)}" for key, value in items)
    return "&".join(pairs)