Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/yarl/_parse.py: 48%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

128 statements  

1"""URL parsing utilities.""" 

2 

3import re 

4import unicodedata 

5from functools import lru_cache 

6from typing import Union 

7from urllib.parse import scheme_chars, uses_netloc 

8 

9from ._quoters import QUOTER, UNQUOTER_PLUS 

10 

# Leading and trailing C0 control and space to be stripped per WHATWG spec:
# every code point from U+0000 up to and including U+0020 (space).
WHATWG_C0_CONTROL_OR_SPACE = "".join(map(chr, range(0x20 + 1)))

# Unsafe bytes to be removed per WHATWG spec
UNSAFE_URL_BYTES_TO_REMOVE = ["\t", "\r", "\n"]

# Schemes that urllib.parse lists in ``uses_netloc``, frozen for membership tests.
USES_AUTHORITY = frozenset(uses_netloc)

# (scheme, netloc, path, query, fragment) as returned by split_url().
SplitURLType = tuple[str, str, str, str, str]

23 

24 

def split_url(url: str) -> SplitURLType:
    """Split URL into parts.

    Returns a ``(scheme, netloc, path, query, fragment)`` tuple of strings.
    A component that is absent from ``url`` is returned as the empty string,
    and the scheme is lower-cased.

    Raises ValueError when the netloc contains only one of ``[`` and ``]``,
    when the text between the brackets is neither a well-formed IPvFuture
    literal nor contains a ``:``, or when ``_check_netloc`` rejects a
    non-ASCII netloc.
    """
    # Adapted from urllib.parse.urlsplit
    # Leading C0 control/space characters are dropped, but trailing ones are
    # kept because some applications rely on preserving trailing space.
    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
    url = url.lstrip(WHATWG_C0_CONTROL_OR_SPACE)
    for unsafe in UNSAFE_URL_BYTES_TO_REMOVE:
        if unsafe in url:
            url = url.replace(unsafe, "")

    scheme = netloc = query = fragment = ""

    # The scheme is the text before the first ":" when that text is non-empty
    # and consists solely of scheme characters.
    colon = url.find(":")
    if colon > 0 and all(ch in scheme_chars for ch in url[:colon]):
        scheme = url[:colon].lower()
        url = url[colon + 1 :]

    has_hash = "#" in url
    has_question_mark = "?" in url

    if url.startswith("//"):
        # The netloc ends at the earliest "/", "?" or "#" found after the
        # leading "//"; when none is present it runs to the end of the string.
        stops = "/"
        if has_question_mark:
            stops += "?"
        if has_hash:
            stops += "#"
        hits = [pos for pos in (url.find(ch, 2) for ch in stops) if pos >= 0]
        end = min(hits, default=len(url))
        netloc, url = url[2:end], url[end:]

        has_left_bracket = "[" in netloc
        has_right_bracket = "]" in netloc
        if has_left_bracket != has_right_bracket:
            raise ValueError("Invalid IPv6 URL")
        if has_left_bracket:
            after_open = netloc.partition("[")[2]
            bracketed_host = after_open.partition("]")[0]
            # Valid bracketed hosts are defined in
            # https://www.rfc-editor.org/rfc/rfc3986#page-49
            # https://url.spec.whatwg.org/
            if bracketed_host.startswith("v"):
                if re.match(r"\Av[a-fA-F0-9]+\..+\Z", bracketed_host) is None:
                    raise ValueError("IPvFuture address is invalid")
            elif ":" not in bracketed_host:
                raise ValueError("The IPv6 content between brackets is not valid")

    if has_hash:
        url, _, fragment = url.partition("#")
    if has_question_mark:
        url, _, query = url.partition("?")
    if netloc and not netloc.isascii():
        _check_netloc(netloc)
    return scheme, netloc, url, query, fragment

84 

85 

86def _check_netloc(netloc: str) -> None: 

87 # Adapted from urllib.parse._checknetloc 

88 # looking for characters like \u2100 that expand to 'a/c' 

89 # IDNA uses NFKC equivalence, so normalize for this check 

90 

91 # ignore characters already included 

92 # but not the surrounding text 

93 n = netloc.replace("@", "").replace(":", "").replace("#", "").replace("?", "") 

94 normalized_netloc = unicodedata.normalize("NFKC", n) 

95 if n == normalized_netloc: 

96 return 

97 # Note that there are no unicode decompositions for the character '@' so 

98 # its currently impossible to have test coverage for this branch, however if the 

99 # one should be added in the future we want to make sure its still checked. 

100 for c in "/?#@:": # pragma: no branch 

101 if c in normalized_netloc: 

102 raise ValueError( 

103 f"netloc '{netloc}' contains invalid " 

104 "characters under NFKC normalization" 

105 ) 

106 

107 

@lru_cache  # match the same size as urlsplit
def split_netloc(
    netloc: str,
) -> tuple[Union[str, None], Union[str, None], Union[str, None], Union[int, None]]:
    """Split netloc into username, password, host and port.

    Returns a ``(username, password, hostname, port)`` tuple. An empty
    username or hostname is reported as None. The password is None when the
    userinfo contains no ``:`` and may be the empty string otherwise. The
    port is None when no port text follows the host.

    Raises ValueError when the port is not a run of ASCII decimal digits or
    lies outside 0-65535.
    """
    if "@" not in netloc:
        username: Union[str, None] = None
        password: Union[str, None] = None
        hostinfo = netloc
    else:
        # Everything before the last "@" is userinfo.
        userinfo, _, hostinfo = netloc.rpartition("@")
        username, have_password, password = userinfo.partition(":")
        if not have_password:
            password = None

    if "[" in hostinfo:
        # Bracketed host: the hostname is the text between "[" and "]" and
        # the port is whatever follows the first ":" after the "]".
        _, _, bracketed = hostinfo.partition("[")
        hostname, _, port_str = bracketed.partition("]")
        _, _, port_str = port_str.partition(":")
    else:
        hostname, _, port_str = hostinfo.partition(":")

    if not port_str:
        return username or None, password, hostname or None, None

    try:
        port = int(port_str)
    except ValueError:
        raise ValueError("Invalid URL: port can't be converted to integer") from None
    if not (0 <= port <= 65535):
        raise ValueError("Port out of range 0-65535")
    # int() also accepts a sign, surrounding whitespace, underscores and
    # non-ASCII digits ("+80", " 80", "8_0"); none of those is a valid port.
    if not (port_str.isascii() and port_str.isdigit()):
        raise ValueError("Invalid URL: port can't be converted to integer")
    return username or None, password, hostname or None, port

140 

141 

def unsplit_result(
    scheme: str, netloc: str, url: str, query: str, fragment: str
) -> str:
    """Unsplit a URL without any normalization.

    ``url`` is the path component. The authority form (``//netloc``) is used
    when a netloc is given, when the scheme is listed in USES_AUTHORITY, or
    when the path itself starts with ``//``. The query and fragment are
    appended only when non-empty.
    """
    if netloc or (scheme and scheme in USES_AUTHORITY) or url[:2] == "//":
        # A non-empty relative path needs a "/" to separate it from the netloc.
        if url and url[:1] != "/":
            url = f"/{url}"
        # Without a scheme the result is still "//netloc/path"; the netloc
        # must not be dropped.
        url = f"{scheme}://{netloc}{url}" if scheme else f"//{netloc}{url}"
    elif scheme:
        url = f"{scheme}:{url}"
    if query:
        url = f"{url}?{query}"
    return f"{url}#{fragment}" if fragment else url

156 

157 

@lru_cache  # match the same size as urlsplit
def make_netloc(
    user: Union[str, None],
    password: Union[str, None],
    host: Union[str, None],
    port: Union[int, None],
    encode: bool = False,
) -> str:
    """Assemble a netloc string from its components.

    When ``encode`` is True the user and password are passed through QUOTER;
    the host is used as given, so it must already be encoded with
    _encode_host.

    Returns the empty string when ``host`` is None, otherwise
    ``[user[:password]@]host[:port]``.
    """
    if host is None:
        return ""

    hostport = host if port is None else f"{host}:{port}"
    if user is None and password is None:
        return hostport

    if password is None:
        # User only; an empty user contributes no "@" prefix.
        if user and encode:
            user = QUOTER(user)
        return f"{user}@{hostport}" if user else hostport

    # A password is present, so "user:password@" is always emitted, with an
    # empty user part when no user was supplied.
    name = user or ""
    if encode:
        if name:
            name = QUOTER(name)
        password = QUOTER(password)
    return f"{name}:{password}@{hostport}"

190 

191 

def query_to_pairs(query_string: str) -> list[tuple[str, str]]:
    """Parse a query string into a list of ``(key, value)`` pairs.

    Works like urllib.parse.parse_qsl with keep empty values.

    The string is split on ``&`` and each item on its first ``=``; both
    halves are passed through UNQUOTER_PLUS. An empty query string yields
    an empty list.
    """
    if not query_string:
        return []
    split_items = (item.partition("=") for item in query_string.split("&"))
    return [
        (UNQUOTER_PLUS(key), UNQUOTER_PLUS(value)) for key, _, value in split_items
    ]