Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/yarl/

1"""URL parsing utilities."""

3import re

4import unicodedata

5from functools import lru_cache

6from urllib.parse import scheme_chars, uses_netloc

8from ._quoters import QUOTER, UNQUOTER_PLUS

# Characters stripped from the start of a URL per the WHATWG spec: every
# C0 control character (U+0000 - U+001F) followed by the space (U+0020).
WHATWG_C0_CONTROL_OR_SPACE = "".join(map(chr, range(0x20 + 1)))

# Characters the WHATWG spec removes from anywhere inside a URL.
UNSAFE_URL_BYTES_TO_REMOVE = list("\t\r\n")
# Immutable copy of urllib.parse.uses_netloc for fast membership tests.
USES_AUTHORITY = frozenset(uses_netloc)

# (scheme, netloc, path, query, fragment) as returned by split_url().
SplitURLType = tuple[str, str, str, str, str]

def split_url(url: str) -> SplitURLType:
    """Split URL into parts.

    Adapted from urllib.parse.urlsplit.

    Leading C0 control characters and spaces are stripped, and tab, CR and
    LF are removed from the whole string, before parsing. Trailing
    whitespace is kept on purpose.

    Returns a ``(scheme, netloc, path, query, fragment)`` tuple. The scheme
    is lower-cased; a missing component is the empty string.

    Raises ValueError if the authority contains a backslash, has
    unbalanced or misplaced square brackets, holds an invalid bracketed
    host, or contains non-ASCII characters that turn into URL delimiters
    under NFKC normalization.
    """
    # Only lstrip url as some applications rely on preserving trailing space.
    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
    url = url.lstrip(WHATWG_C0_CONTROL_OR_SPACE)
    for b in UNSAFE_URL_BYTES_TO_REMOVE:
        if b in url:
            url = url.replace(b, "")

    scheme = netloc = query = fragment = ""
    i = url.find(":")
    # Everything before the first ":" is a scheme only if it is non-empty
    # and made solely of scheme characters.
    if i > 0 and all(c in scheme_chars for c in url[:i]):
        scheme, url = url[:i].lower(), url[i + 1 :]
    has_hash = "#" in url
    has_question_mark = "?" in url
    if url[:2] == "//":
        delim = len(url)  # position of end of domain part of url, default is end
        # Only search for the delimiters that are known to be present.
        delim_chars = "/"
        if has_question_mark:
            delim_chars += "?"
        if has_hash:
            delim_chars += "#"
        for c in delim_chars:  # look for delimiters; the order is NOT important
            wdelim = url.find(c, 2)  # find first of this delim
            if 0 <= wdelim < delim:  # if found
                delim = wdelim  # use earliest delim position
        netloc = url[2:delim]
        url = url[delim:]
        # Backslash is not valid in the authority component per RFC 3986.
        # WHATWG parsers treat \ as a path separator for special schemes, so
        # accepting it in the authority can cause host parsing ambiguity.
        if "\\" in netloc:
            raise ValueError(
                "Invalid URL: backslash ('\\') is not allowed in the authority "
                "component per RFC 3986."
            )
        has_left_bracket = "[" in netloc
        has_right_bracket = "]" in netloc
        # Exactly one kind of bracket present means they are unbalanced.
        if has_left_bracket != has_right_bracket:
            raise ValueError("Invalid IPv6 URL")
        if has_left_bracket:
            # Per RFC 3986, brackets are only valid at the START of the host
            # for IP-literal addresses. Text before '[' (e.g. '127.0.0.1[::1]')
            # is invalid and must be rejected to prevent SSRF bypasses. The
            # count checks reject URLs with more than one bracket pair in the
            # host subcomponent (e.g. 'http://[:localhost[]].google:80'),
            # which would otherwise resolve to an unintended host.
            hostinfo = netloc.rpartition("@")[2]
            # startswith() instead of hostinfo[0]: the host part is empty
            # when the brackets sit entirely in the userinfo (e.g. '[::1]@'),
            # and that must be reported as ValueError, not IndexError.
            if (
                not hostinfo.startswith("[")
                or hostinfo.count("[") > 1
                or hostinfo.count("]") > 1
            ):
                raise ValueError("Invalid IPv6 URL")
            bracketed_host, _, after_bracket = hostinfo[1:].partition("]")
            # Per RFC 3986 §3.2.2, after the closing ']' of an IP-literal
            # only ":" <port> or end-of-authority is valid. Any other text
            # (e.g. '[::1]allowed.example:1') must be rejected to prevent
            # host-confusion where the suffix is silently dropped.
            if after_bracket and after_bracket[0] != ":":
                raise ValueError("Invalid IPv6 URL")
            # Valid bracketed hosts are defined in
            # https://www.rfc-editor.org/rfc/rfc3986#page-49
            # https://url.spec.whatwg.org/
            if bracketed_host and bracketed_host[0] == "v":
                if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", bracketed_host):
                    raise ValueError("IPvFuture address is invalid")
            elif ":" not in bracketed_host:
                raise ValueError("The IPv6 content between brackets is not valid")
    if has_hash:
        url, _, fragment = url.partition("#")
    if has_question_mark:
        url, _, query = url.partition("?")
    # Pure-ASCII netlocs cannot change under NFKC, so skip the check for them.
    if netloc and not netloc.isascii():
        _check_netloc(netloc)
    return scheme, netloc, url, query, fragment

106

107

def _check_netloc(netloc: str) -> None:
    """Reject a netloc that gains URL delimiters under NFKC normalization.

    Adapted from urllib.parse._checknetloc. Some characters (for example
    U+2100, which expands to 'a/c') normalize into text containing a
    delimiter. IDNA uses NFKC equivalence, so the check normalizes the
    same way.

    Raises ValueError if the normalized netloc contains any of '/?#@:'.
    """
    # Drop the delimiters that are already present so that only characters
    # introduced by normalization can trigger the error.
    stripped = netloc
    for delimiter in "@:#?":
        stripped = stripped.replace(delimiter, "")
    folded = unicodedata.normalize("NFKC", stripped)
    # No Unicode character currently decomposes to '@', but it stays in the
    # list so the check still holds if one is ever added.
    if folded != stripped and any(ch in folded for ch in "/?#@:"):
        raise ValueError(
            f"netloc '{netloc}' contains invalid "
            "characters under NFKC normalization"
        )

128

129

130@lru_cache # match the same size as urlsplit

131def split_netloc(

132 netloc: str,

133) -> tuple[str | None, str | None, str | None, int | None]:

134 """Split netloc into username, password, host and port."""

135 if "@" not in netloc:

136 username: str | None = None

137 password: str | None = None

138 hostinfo = netloc

139 else:

140 userinfo, _, hostinfo = netloc.rpartition("@")

141 username, have_password, password = userinfo.partition(":")

142 if not have_password:

143 password = None

144

145 if "[" in hostinfo:

146 if hostinfo[0] != "[" or hostinfo.count("[") > 1 or hostinfo.count("]") > 1:

147 raise ValueError("Invalid IPv6 URL")

148 _, _, bracketed = hostinfo.partition("[")

149 hostname, _, port_str = bracketed.partition("]")

150 # Defense-in-depth: after ']' only ':port' or empty is valid.

151 # split_url() should have already rejected invalid suffixes,

152 # but guard here too for callers that use split_netloc() directly.

153 if port_str and port_str[0] != ":":

154 raise ValueError("Invalid IPv6 URL")

155 _, _, port_str = port_str.partition(":")

156 else:

157 hostname, _, port_str = hostinfo.partition(":")

158

159 if not port_str:

160 return username or None, password, hostname or None, None

161

162 try:

163 port = int(port_str)

164 except ValueError:

165 raise ValueError("Invalid URL: port can't be converted to integer")

166 if not (0 <= port <= 65535):

167 raise ValueError("Port out of range 0-65535")

168 return username or None, password, hostname or None, port

169

170

def unsplit_result(
    scheme: str, netloc: str, url: str, query: str, fragment: str
) -> str:
    """Unsplit a URL without any normalization.

    Joins the five components back into a single string. The "//netloc"
    authority prefix is written when a netloc is given, when the scheme
    is one of USES_AUTHORITY, or when the path itself starts with "//".
    The query and fragment are appended only when non-empty.
    """
    if netloc or (scheme and scheme in USES_AUTHORITY) or url[:2] == "//":
        # A relative path needs a "/" to separate it from the authority.
        if url and url[:1] != "/":
            url = f"/{url}"
        # Without a scheme the authority must still be written; the result
        # starts with "//" rather than with a stray ":".
        url = f"{scheme}://{netloc}{url}" if scheme else f"//{netloc}{url}"
    elif scheme:
        url = f"{scheme}:{url}"
    if query:
        url = f"{url}?{query}"
    return f"{url}#{fragment}" if fragment else url

185

186

187@lru_cache # match the same size as urlsplit

188def make_netloc(

189 user: str | None,

190 password: str | None,

191 host: str | None,

192 port: int | None,

193 encode: bool = False,

194) -> str:

195 """Make netloc from parts.

196

197 The user and password are encoded if encode is True.

198

199 The host must already be encoded with _encode_host.

200 """

201 if host is None:

202 return ""

203 ret = host

204 if port is not None:

205 ret = f"{ret}:{port}"

206 if user is None and password is None:

207 return ret

208 if password is not None:

209 if not user:

210 user = ""

211 elif encode:

212 user = QUOTER(user)

213 if encode:

214 password = QUOTER(password)

215 user = f"{user}:{password}"

216 elif user and encode:

217 user = QUOTER(user)

218 return f"{user}@{ret}" if user else ret

219

220

def query_to_pairs(query_string: str) -> list[tuple[str, str]]:
    """Turn a raw query string into a list of decoded (key, value) pairs.

    Works like urllib.parse.parse_qsl with keep empty values: the string
    is split on "&", each piece is split on its first "=", and both halves
    are decoded with UNQUOTER_PLUS. An empty query string gives an empty
    list.
    """
    if not query_string:
        return []
    return [
        (UNQUOTER_PLUS(key), UNQUOTER_PLUS(value))
        for key, _, value in (
            piece.partition("=") for piece in query_string.split("&")
        )
    ]

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/yarl/_parse.py: 83%

138 statements