1"""URL parsing utilities."""
3import re
4import unicodedata
5from functools import lru_cache
6from typing import Union
7from urllib.parse import scheme_chars, uses_netloc
9from ._quoters import QUOTER, UNQUOTER_PLUS
11# Leading and trailing C0 control and space to be stripped per WHATWG spec.
12# == "".join([chr(i) for i in range(0, 0x20 + 1)])
13WHATWG_C0_CONTROL_OR_SPACE = (
14 "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10"
15 "\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f "
16)
18# Unsafe bytes to be removed per WHATWG spec
19UNSAFE_URL_BYTES_TO_REMOVE = ["\t", "\r", "\n"]
20USES_AUTHORITY = frozenset(uses_netloc)
22SplitURLType = tuple[str, str, str, str, str]


def split_url(url: str) -> SplitURLType:
    """Split URL into parts."""
    # Adapted from urllib.parse.urlsplit
    # Only lstrip url as some applications rely on preserving trailing space.
    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
    url = url.lstrip(WHATWG_C0_CONTROL_OR_SPACE)
    for b in UNSAFE_URL_BYTES_TO_REMOVE:
        if b in url:
            url = url.replace(b, "")

    scheme = netloc = query = fragment = ""
    i = url.find(":")
    if i > 0 and url[0] in scheme_chars:
        for c in url[1:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i + 1 :]
    has_hash = "#" in url
    has_question_mark = "?" in url
    if url[:2] == "//":
        delim = len(url)  # position of end of domain part of url, default is end
        if has_hash and has_question_mark:
            delim_chars = "/?#"
        elif has_question_mark:
            delim_chars = "/?"
        elif has_hash:
            delim_chars = "/#"
        else:
            delim_chars = "/"
        for c in delim_chars:  # look for delimiters; the order is NOT important
            wdelim = url.find(c, 2)  # find first of this delim
            if wdelim >= 0 and wdelim < delim:  # if found
                delim = wdelim  # use earliest delim position
        netloc = url[2:delim]
        url = url[delim:]
        has_left_bracket = "[" in netloc
        has_right_bracket = "]" in netloc
        if (has_left_bracket and not has_right_bracket) or (
            has_right_bracket and not has_left_bracket
        ):
            raise ValueError("Invalid IPv6 URL")
        if has_left_bracket:
            bracketed_host = netloc.partition("[")[2].partition("]")[0]
            # Valid bracketed hosts are defined in
            # https://www.rfc-editor.org/rfc/rfc3986#page-49
            # https://url.spec.whatwg.org/
            if bracketed_host and bracketed_host[0] == "v":
                if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", bracketed_host):
                    raise ValueError("IPvFuture address is invalid")
            elif ":" not in bracketed_host:
                raise ValueError("The IPv6 content between brackets is not valid")
    if has_hash:
        url, _, fragment = url.partition("#")
    if has_question_mark:
        url, _, query = url.partition("?")
    if netloc and not netloc.isascii():
        _check_netloc(netloc)
    return scheme, netloc, url, query, fragment
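
# Illustrative usage (a sketch for clarity, not part of the original module):
# split_url separates a URL into (scheme, netloc, path, query, fragment)
# without decoding or normalizing any component.
#
#     >>> split_url("https://user:pw@example.com:8080/path?x=1#frag")
#     ('https', 'user:pw@example.com:8080', '/path', 'x=1', 'frag')
#     >>> split_url("mailto:someone@example.com")
#     ('mailto', '', 'someone@example.com', '', '')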


def _check_netloc(netloc: str) -> None:
    # Adapted from urllib.parse._checknetloc
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check

    # ignore characters already included
    # but not the surrounding text
    n = netloc.replace("@", "").replace(":", "").replace("#", "").replace("?", "")
    normalized_netloc = unicodedata.normalize("NFKC", n)
    if n == normalized_netloc:
        return
    # Note that there are no unicode decompositions for the character '@', so
    # it's currently impossible to have test coverage for this branch; however,
    # if one should be added in the future we want to make sure it's still checked.
    for c in "/?#@:":  # pragma: no branch
        if c in normalized_netloc:
            raise ValueError(
                f"netloc '{netloc}' contains invalid "
                "characters under NFKC normalization"
            )
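
# Illustrative behavior (a sketch, not part of the original module): a hostname
# containing U+2100 (which NFKC-normalizes to "a/c") smuggles in a "/" and is
# therefore rejected.
#
#     >>> _check_netloc("exa\u2100mple.com")
#     Traceback (most recent call last):
#       ...
#     ValueError: netloc 'exa℀mple.com' contains invalid characters under NFKC normalization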


@lru_cache  # match the same size as urlsplit
def split_netloc(
    netloc: str,
) -> tuple[Union[str, None], Union[str, None], Union[str, None], Union[int, None]]:
    """Split netloc into username, password, host and port."""
    if "@" not in netloc:
        username: Union[str, None] = None
        password: Union[str, None] = None
        hostinfo = netloc
    else:
        userinfo, _, hostinfo = netloc.rpartition("@")
        username, have_password, password = userinfo.partition(":")
        if not have_password:
            password = None

    if "[" in hostinfo:
        _, _, bracketed = hostinfo.partition("[")
        hostname, _, port_str = bracketed.partition("]")
        _, _, port_str = port_str.partition(":")
    else:
        hostname, _, port_str = hostinfo.partition(":")

    if not port_str:
        return username or None, password, hostname or None, None

    try:
        port = int(port_str)
    except ValueError:
        raise ValueError("Invalid URL: port can't be converted to integer")
    if not (0 <= port <= 65535):
        raise ValueError("Port out of range 0-65535")
    return username or None, password, hostname or None, port
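
# Illustrative usage (a sketch, not part of the original module): split_netloc
# breaks an authority string into user, password, host and port, returning None
# for any missing piece; bracketed IPv6 hosts lose their brackets.
#
#     >>> split_netloc("user:pw@example.com:8080")
#     ('user', 'pw', 'example.com', 8080)
#     >>> split_netloc("[::1]:8080")
#     (None, None, '::1', 8080)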


def unsplit_result(
    scheme: str, netloc: str, url: str, query: str, fragment: str
) -> str:
    """Unsplit a URL without any normalization."""
    if netloc or (scheme and scheme in USES_AUTHORITY) or url[:2] == "//":
        if url and url[:1] != "/":
            url = f"{scheme}://{netloc}/{url}" if scheme else f"{scheme}:{url}"
        else:
            url = f"{scheme}://{netloc}{url}" if scheme else f"//{netloc}{url}"
    elif scheme:
        url = f"{scheme}:{url}"
    if query:
        url = f"{url}?{query}"
    return f"{url}#{fragment}" if fragment else url
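
# Illustrative usage (a sketch, not part of the original module): unsplit_result
# is the inverse of split_url, reassembling the five parts verbatim.
#
#     >>> unsplit_result("https", "example.com", "/path", "x=1", "frag")
#     'https://example.com/path?x=1#frag'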


@lru_cache  # match the same size as urlsplit
def make_netloc(
    user: Union[str, None],
    password: Union[str, None],
    host: Union[str, None],
    port: Union[int, None],
    encode: bool = False,
) -> str:
    """Make netloc from parts.

    The user and password are encoded if encode is True.

    The host must already be encoded with _encode_host.
    """
    if host is None:
        return ""
    ret = host
    if port is not None:
        ret = f"{ret}:{port}"
    if user is None and password is None:
        return ret
    if password is not None:
        if not user:
            user = ""
        elif encode:
            user = QUOTER(user)
        if encode:
            password = QUOTER(password)
        user = f"{user}:{password}"
    elif user and encode:
        user = QUOTER(user)
    return f"{user}@{ret}" if user else ret
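
# Illustrative usage (a sketch, not part of the original module): make_netloc
# reassembles an authority string from already-encoded parts; with encode=True
# the userinfo is additionally percent-encoded via QUOTER.
#
#     >>> make_netloc("user", "pw", "example.com", 8080)
#     'user:pw@example.com:8080'
#     >>> make_netloc(None, None, "example.com", None)
#     'example.com'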


def query_to_pairs(query_string: str) -> list[tuple[str, str]]:
    """Parse a query given as a string argument.

    Works like urllib.parse.parse_qsl, but keeps empty values.
    """
    pairs: list[tuple[str, str]] = []
    if not query_string:
        return pairs
    for k_v in query_string.split("&"):
        k, _, v = k_v.partition("=")
        pairs.append((UNQUOTER_PLUS(k), UNQUOTER_PLUS(v)))
    return pairs
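
# Illustrative usage (a sketch, not part of the original module; assumes
# UNQUOTER_PLUS leaves plain ASCII unchanged): keys with empty or missing
# values are preserved, unlike parse_qsl's defaults.
#
#     >>> query_to_pairs("a=1&b=&c")
#     [('a', '1'), ('b', ''), ('c', '')]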