Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/yarl/

1"""URL parsing utilities."""

3import re

4import unicodedata

5from functools import lru_cache

6from urllib.parse import scheme_chars, uses_netloc

8from ._quoters import QUOTER, UNQUOTER_PLUS

# Leading and trailing C0 control characters and space, which the WHATWG
# spec says to strip: every code point from U+0000 through U+0020 inclusive.
WHATWG_C0_CONTROL_OR_SPACE = "".join(map(chr, range(0x20 + 1)))

# Unsafe characters removed from anywhere in the URL per the WHATWG spec.
UNSAFE_URL_BYTES_TO_REMOVE = ["\t", "\r", "\n"]

# Frozen copy of urllib.parse.uses_netloc.
USES_AUTHORITY = frozenset(uses_netloc)

# (scheme, netloc, path, query, fragment) as returned by split_url().
SplitURLType = tuple[str, str, str, str, str]

def split_url(url: str) -> SplitURLType:
    """Split URL into parts.

    Leading C0 control characters and spaces are stripped and every tab,
    carriage return and newline is removed before parsing.

    Returns:
        A ``(scheme, netloc, path, query, fragment)`` tuple of strings.
        Components that are absent are empty strings and the scheme is
        lower-cased.

    Raises:
        ValueError: If the netloc has unbalanced brackets, has a bracket
            that does not open the host part, holds an invalid IPvFuture
            or IPv6 literal between brackets, or (for a non-ASCII netloc)
            is rejected by ``_check_netloc``.
    """
    # Adapted from urllib.parse.urlsplit
    # Only lstrip url as some applications rely on preserving trailing space.
    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
    url = url.lstrip(WHATWG_C0_CONTROL_OR_SPACE)
    for b in UNSAFE_URL_BYTES_TO_REMOVE:
        if b in url:
            url = url.replace(b, "")

    scheme = netloc = query = fragment = ""
    i = url.find(":")
    if i > 0 and url[0] in scheme_chars:
        for c in url[1:i]:
            if c not in scheme_chars:
                break
        else:
            # Every character before the first ":" is a scheme character.
            scheme, url = url[:i].lower(), url[i + 1 :]
    has_hash = "#" in url
    has_question_mark = "?" in url
    if url[:2] == "//":
        delim = len(url)  # position of end of domain part of url, default is end
        if has_hash and has_question_mark:
            delim_chars = "/?#"
        elif has_question_mark:
            delim_chars = "/?"
        elif has_hash:
            delim_chars = "/#"
        else:
            delim_chars = "/"
        for c in delim_chars:  # look for delimiters; the order is NOT important
            wdelim = url.find(c, 2)  # find first of this delim
            if wdelim >= 0 and wdelim < delim:  # if found
                delim = wdelim  # use earliest delim position
        netloc = url[2:delim]
        url = url[delim:]
        has_left_bracket = "[" in netloc
        has_right_bracket = "]" in netloc
        if (has_left_bracket and not has_right_bracket) or (
            has_right_bracket and not has_left_bracket
        ):
            raise ValueError("Invalid IPv6 URL")
        if has_left_bracket:
            # Per RFC 3986, brackets are only valid at the START of the host
            # for IP-literal addresses. Text before '[' (e.g. '127.0.0.1[::1]')
            # is invalid and must be rejected to prevent SSRF bypasses.
            hostinfo = netloc.rpartition("@")[2]
            # startswith() rather than hostinfo[0]: hostinfo is empty when
            # the brackets sit in the userinfo (e.g. '[::1]@'), and indexing
            # would raise IndexError instead of the intended ValueError.
            if not hostinfo.startswith("["):
                raise ValueError("Invalid IPv6 URL")
            bracketed_host = netloc.partition("[")[2].partition("]")[0]
            # Valid bracketed hosts are defined in
            # https://www.rfc-editor.org/rfc/rfc3986#page-49
            # https://url.spec.whatwg.org/
            if bracketed_host and bracketed_host[0] == "v":
                if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", bracketed_host):
                    raise ValueError("IPvFuture address is invalid")
            elif ":" not in bracketed_host:
                raise ValueError("The IPv6 content between brackets is not valid")
    if has_hash:
        url, _, fragment = url.partition("#")
    if has_question_mark:
        url, _, query = url.partition("?")
    if netloc and not netloc.isascii():
        _check_netloc(netloc)
    return scheme, netloc, url, query, fragment

91def _check_netloc(netloc: str) -> None:

92 # Adapted from urllib.parse._checknetloc

93 # looking for characters like \u2100 that expand to 'a/c'

94 # IDNA uses NFKC equivalence, so normalize for this check

96 # ignore characters already included

97 # but not the surrounding text

98 n = netloc.replace("@", "").replace(":", "").replace("#", "").replace("?", "")

99 normalized_netloc = unicodedata.normalize("NFKC", n)

100 if n == normalized_netloc:

101 return

102 # Note that there are no unicode decompositions for the character '@' so

103 # its currently impossible to have test coverage for this branch, however if the

104 # one should be added in the future we want to make sure its still checked.

105 for c in "/?#@:": # pragma: no branch

106 if c in normalized_netloc:

107 raise ValueError(

108 f"netloc '{netloc}' contains invalid "

109 "characters under NFKC normalization"

110 )

111

112

113@lru_cache # match the same size as urlsplit

114def split_netloc(

115 netloc: str,

116) -> tuple[str | None, str | None, str | None, int | None]:

117 """Split netloc into username, password, host and port."""

118 if "@" not in netloc:

119 username: str | None = None

120 password: str | None = None

121 hostinfo = netloc

122 else:

123 userinfo, _, hostinfo = netloc.rpartition("@")

124 username, have_password, password = userinfo.partition(":")

125 if not have_password:

126 password = None

127

128 if "[" in hostinfo:

129 _, _, bracketed = hostinfo.partition("[")

130 hostname, _, port_str = bracketed.partition("]")

131 _, _, port_str = port_str.partition(":")

132 else:

133 hostname, _, port_str = hostinfo.partition(":")

134

135 if not port_str:

136 return username or None, password, hostname or None, None

137

138 try:

139 port = int(port_str)

140 except ValueError:

141 raise ValueError("Invalid URL: port can't be converted to integer")

142 if not (0 <= port <= 65535):

143 raise ValueError("Port out of range 0-65535")

144 return username or None, password, hostname or None, port

145

146

def unsplit_result(
    scheme: str, netloc: str, url: str, query: str, fragment: str
) -> str:
    """Unsplit a URL without any normalization.

    Args:
        scheme: URL scheme, or ``""`` for none.
        netloc: Authority component, or ``""`` for none.
        url: Path component.
        query: Query string without the leading ``?``; omitted when empty.
        fragment: Fragment without the leading ``#``; omitted when empty.

    Returns:
        The joined URL string.
    """
    if netloc or (scheme and scheme in USES_AUTHORITY) or url[:2] == "//":
        # A relative path needs a "/" between the authority and the path.
        if url and url[:1] != "/":
            url = f"/{url}"
        # Without a scheme the authority is still emitted as "//netloc";
        # previously a relative path here produced ":path" and dropped the
        # netloc entirely.
        url = f"{scheme}://{netloc}{url}" if scheme else f"//{netloc}{url}"
    elif scheme:
        url = f"{scheme}:{url}"
    if query:
        url = f"{url}?{query}"
    return f"{url}#{fragment}" if fragment else url

161

162

163@lru_cache # match the same size as urlsplit

164def make_netloc(

165 user: str | None,

166 password: str | None,

167 host: str | None,

168 port: int | None,

169 encode: bool = False,

170) -> str:

171 """Make netloc from parts.

172

173 The user and password are encoded if encode is True.

174

175 The host must already be encoded with _encode_host.

176 """

177 if host is None:

178 return ""

179 ret = host

180 if port is not None:

181 ret = f"{ret}:{port}"

182 if user is None and password is None:

183 return ret

184 if password is not None:

185 if not user:

186 user = ""

187 elif encode:

188 user = QUOTER(user)

189 if encode:

190 password = QUOTER(password)

191 user = f"{user}:{password}"

192 elif user and encode:

193 user = QUOTER(user)

194 return f"{user}@{ret}" if user else ret

195

196

def query_to_pairs(query_string: str) -> list[tuple[str, str]]:
    """Parse a query string into a list of ``(key, value)`` pairs.

    Works like urllib.parse.parse_qsl with keep empty values: the string is
    split on ``&``, each field is split at its first ``=``, and both halves
    are decoded with ``UNQUOTER_PLUS``. An empty query string yields an
    empty list.
    """
    if not query_string:
        return []
    fields = (field.partition("=") for field in query_string.split("&"))
    return [(UNQUOTER_PLUS(key), UNQUOTER_PLUS(value)) for key, _, value in fields]

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/yarl/_parse.py: 84%

130 statements