Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/httpx/_urlparse.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

178 statements  

1""" 

2An implementation of `urlparse` that provides URL validation and normalization 

3as described by RFC3986. 

4 

5We rely on this implementation rather than the one in Python's stdlib, because: 

6 

7* It provides more complete URL validation. 

8* It properly differentiates between an empty querystring and an absent querystring, 

9 to distinguish URLs with a trailing '?'. 

10* It handles scheme, hostname, port, and path normalization. 

11* It supports IDNA hostnames, normalizing them to their encoded form. 

12* The API supports passing individual components, as well as the complete URL string. 

13 

14Previously we relied on the excellent `rfc3986` package to handle URL parsing and 

15validation, but this module provides a simpler alternative, with less indirection 

16required. 

17""" 

18from __future__ import annotations 

19 

20import ipaddress 

21import re 

22import typing 

23 

24import idna 

25 

26from ._exceptions import InvalidURL 

27 

# Hard upper bound on the length of a URL, or of any single URL component.
MAX_URL_LENGTH = 65536

# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
)
SUB_DELIMS = "!$&'()*+,;="

# Matches an existing '%xx' escape sequence.
PERCENT_ENCODED_REGEX = re.compile(r"%[A-Fa-f0-9]{2}")


# Pattern text for each top-level URL component. These are defined once and
# shared by URL_REGEX (whole-URL parsing) and COMPONENT_REGEX (validation of
# individually supplied components), so the two can never drift apart.
_SCHEME_PATTERN = r"([a-zA-Z][a-zA-Z0-9+.-]*)?"
_AUTHORITY_PATTERN = r"[^/?#]*"
_PATH_PATTERN = r"[^?#]*"
_QUERY_PATTERN = r"[^#]*"
_FRAGMENT_PATTERN = r".*"

# {scheme}:           (optional)
# //{authority}       (optional)
# {path}
# ?{query}            (optional)
# #{fragment}         (optional)
URL_REGEX = re.compile(
    rf"(?:(?P<scheme>{_SCHEME_PATTERN}):)?"
    rf"(?://(?P<authority>{_AUTHORITY_PATTERN}))?"
    rf"(?P<path>{_PATH_PATTERN})"
    rf"(?:\?(?P<query>{_QUERY_PATTERN}))?"
    rf"(?:#(?P<fragment>{_FRAGMENT_PATTERN}))?"
)

# {userinfo}@         (optional)
# {host}
# :{port}             (optional)
AUTHORITY_REGEX = re.compile(
    # userinfo: any character sequence.
    r"(?:(?P<userinfo>.*)@)?"
    # host: either an IPv6 address enclosed within square brackets,
    # or any character sequence excluding ':' or '@'.
    r"(?P<host>(\[.*\]|[^:@]*))"
    # port: any character sequence.
    r":?(?P<port>.*)?"
)


# If we call urlparse with an individual component, then we need to regex
# validate that component individually.
# Note that 'userinfo' and 'host' are intentionally not the same patterns as
# the ones embedded in AUTHORITY_REGEX.
COMPONENT_REGEX = {
    "scheme": re.compile(_SCHEME_PATTERN),
    "authority": re.compile(_AUTHORITY_PATTERN),
    "path": re.compile(_PATH_PATTERN),
    "query": re.compile(_QUERY_PATTERN),
    "fragment": re.compile(_FRAGMENT_PATTERN),
    "userinfo": re.compile(r"[^@]*"),
    "host": re.compile(r"(\[.*\]|[^:]*)"),
    "port": re.compile(r".*"),
}


# We use these simple regexs as a first pass before handing off to
# the stdlib 'ipaddress' module for IP address validation.
IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$")
IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")

94 

95 

class ParseResult(typing.NamedTuple):
    """
    The components of a parsed URL, as returned by `urlparse`.

    `query` and `fragment` are `None` when the component is absent, which is
    distinct from the empty string (a trailing '?' or '#').
    """

    scheme: str
    userinfo: str
    host: str
    port: int | None
    path: str
    query: str | None
    fragment: str | None

    @property
    def authority(self) -> str:
        """The `netloc`, prefixed with `{userinfo}@` when userinfo is non-empty."""
        prefix = f"{self.userinfo}@" if self.userinfo else ""
        return prefix + self.netloc

    @property
    def netloc(self) -> str:
        """The host plus optional `:{port}`; hosts containing ':' are bracketed."""
        host = self.host
        if ":" in host:
            host = f"[{host}]"
        if self.port is None:
            return host
        return f"{host}:{self.port}"

    def copy_with(self, **kwargs: str | None) -> ParseResult:
        """
        Return a new result with the given components replaced.

        With no keyword arguments the instance itself is returned. Otherwise
        the current components, overlaid with `kwargs`, are passed back
        through `urlparse`.
        """
        if not kwargs:
            return self

        components = {
            "scheme": self.scheme,
            "authority": self.authority,
            "path": self.path,
            "query": self.query,
            "fragment": self.fragment,
            **kwargs,
        }
        return urlparse("", **components)

    def __str__(self) -> str:
        pieces = []
        if self.scheme:
            pieces.append(f"{self.scheme}:")
        authority = self.authority
        if authority:
            pieces.append(f"//{authority}")
        pieces.append(self.path)
        # An empty-but-present query or fragment still emits its delimiter.
        if self.query is not None:
            pieces.append(f"?{self.query}")
        if self.fragment is not None:
            pieces.append(f"#{self.fragment}")
        return "".join(pieces)

149 

150 

def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse and normalize a URL, returning a `ParseResult`.

    The URL may be supplied as a complete string, as individual components
    passed as keyword arguments, or both. A keyword argument takes precedence
    over the matching component parsed out of `url`.

    The components handled directly are `scheme`, `authority`, `path`,
    `query`, `fragment`, `userinfo`, `host` and `port`. The following
    convenience arguments are rewritten in terms of those first:

    * `netloc` is split into `host` and `port`.
    * `username` and/or `password` are quoted and joined into `userinfo`.
    * `raw_path` is split into `path` and `query`.

    Raises `InvalidURL` if the URL or a component is too long, contains a
    non-printable ASCII character, fails its component regex, or fails host,
    port, or path validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid.
    if any(char.isascii() and not char.isprintable() for char in url):
        raise InvalidURL("Invalid non-printable ASCII character in URL")

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        if netloc.startswith("["):
            # Bracketed IPv6 literal, e.g. "[::1]:8080". The colons inside the
            # brackets belong to the host, so split after the closing bracket.
            literal, bracket, remainder = netloc.partition("]")
            kwargs["host"] = literal + bracket
            kwargs["port"] = remainder[1:] if remainder.startswith(":") else remainder
        else:
            kwargs["host"], _, kwargs["port"] = netloc.partition(":")

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "")
        password = quote(kwargs.pop("password", "") or "")
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No "?" at all: the query is absent rather than empty.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is not None:
            if len(value) > MAX_URL_LENGTH:
                raise InvalidURL(f"URL component '{key}' too long")

            # If a component includes any ASCII control characters including
            # \t, \r, \n, then treat it as invalid.
            if any(char.isascii() and not char.isprintable() for char in value):
                raise InvalidURL(
                    f"Invalid non-printable ASCII character in URL component '{key}'"
                )

            # Ensure that keyword arguments match as a valid regex.
            if not COMPONENT_REGEX[key].fullmatch(value):
                raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    fragment = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
    parsed_host: str = encode_host(host)
    # Compare against the lower-cased scheme, so that the default port is
    # dropped regardless of how the scheme was capitalised in the input.
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_authority:
        path = normalize_path(path)

    # The GEN_DELIMS set is... : / ? # [ ] @
    # These do not need to be percent-quoted unless they serve as delimiters
    # for the specific component.

    # For 'path' we need to drop ? and # from the GEN_DELIMS set.
    parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
    # For 'query' we need to drop '#' from the GEN_DELIMS set.
    parsed_query: str | None = (
        None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
    )
    # For 'fragment' we can include all of the GEN_DELIMS set.
    parsed_fragment: str | None = (
        None if fragment is None else quote(fragment, safe=SUB_DELIMS + ":/?#[]@")
    )

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_fragment,
    )

284 

285 

def encode_host(host: str) -> str:
    """
    Validate and normalize the host component of a URL.

    * An empty host is returned as the empty string.
    * A host shaped like `#.#.#.#` must be a valid IPv4 address, and is
      returned unchanged.
    * A host enclosed in `[...]` must be a valid IPv6 address, and is
      returned without the enclosing brackets.
    * Any other ASCII host is lower-cased and percent-quoted.
    * A non-ASCII host is lower-cased and IDNA encoded.

    Raises `InvalidURL` if IPv4, IPv6, or IDNA validation fails. The
    underlying error is chained as the cause.
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # Validate IPv4 hostnames like #.#.#.#
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError as exc:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}") from exc
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # Validate IPv6 hostnames like [...]
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        address = host[1:-1]
        try:
            ipaddress.IPv6Address(address)
        except ipaddress.AddressValueError as exc:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}") from exc
        return address

    if host.isascii():
        # Regular ASCII hostnames
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # reg-name = *( unreserved / pct-encoded / sub-delims )
        return quote(host.lower(), safe=SUB_DELIMS)

    # IDNA hostnames
    try:
        return idna.encode(host.lower()).decode("ascii")
    except idna.IDNAError as exc:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}") from exc

330 

331 

def normalize_port(port: str | int | None, scheme: str) -> int | None:
    """
    Convert a port to an integer, dropping it if it is empty or the default.

    * `port`: The port as a string or integer, or `None`.
    * `scheme`: The URL scheme, used to look up the default port. Matched
      case-insensitively, since URL schemes are case-insensitive.

    Returns `None` if the port is `None`, the empty string, or equal to the
    default port for the scheme. Otherwise returns the port as an integer.

    Raises `InvalidURL` if the port cannot be converted to an integer.
    """
    # From https://tools.ietf.org/html/rfc3986#section-3.2.3
    #
    # "A scheme may define a default port.  For example, the "http" scheme
    # defines a default port of "80", corresponding to its reserved TCP
    # port number.  The type of port designated by the port number (e.g.,
    # TCP, UDP, SCTP) is defined by the URI scheme.  URI producers and
    # normalizers should omit the port component and its ":" delimiter if
    # port is empty or if its value would be the same as that of the
    # scheme's default."
    if port is None or port == "":
        return None

    try:
        port_as_int = int(port)
    except ValueError as exc:
        raise InvalidURL(f"Invalid port: {port!r}") from exc

    # See https://url.spec.whatwg.org/#url-miscellaneous
    default_ports = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}
    # Lower-case the scheme so that e.g. "HTTP" still maps to port 80.
    if port_as_int == default_ports.get(scheme.lower()):
        return None
    return port_as_int

357 

358 

def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Check the path against the rules that depend on whether the URL has
    a scheme or an authority component.

    Raises `InvalidURL` if a rule is violated, otherwise returns `None`.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    if has_authority:
        # "If a URI contains an authority component, then the path component
        # must either be empty or begin with a slash ("/") character."
        if path and not path.startswith("/"):
            raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
        return

    # "If a URI does not contain an authority component, then the path cannot
    # begin with two slash characters ("//")."
    if path.startswith("//"):
        raise InvalidURL(
            "URLs with no authority component cannot have a path starting with '//'"
        )

    # With neither a scheme nor an authority, a path starting with ':' is
    # rejected.
    if not has_scheme and path.startswith(":"):
        raise InvalidURL(
            "URLs with no scheme component cannot have a path starting with ':'"
        )

384 

385 

def normalize_path(path: str) -> str:
    """
    Drop "." and ".." segments from a URL path.

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # Fast return: without a '.' character there can be no "." or ".."
    # segments, so the path is already normalized.
    if "." not in path:
        return path

    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    output: list[str] = []
    for component in path.split("/"):
        if component == ".":
            continue
        if component == "..":
            # Step up one segment, but never pop the leading empty segment
            # that represents the root "/".
            if output and output != [""]:
                output.pop()
            continue
        output.append(component)
    return "/".join(output)

406 

407 

def percent_encode(char: str) -> str:
    """
    Return the percent-encoded form of a single character.

    The character is encoded as UTF-8, and each resulting byte is written as
    '%' followed by two upper-case hex digits, so characters outside the ASCII
    range produce several escape sequences.

    For example:

        percent_encode(" ") == "%20"
    """
    utf8_bytes = char.encode("utf-8")
    return "".join(f"%{byte:02X}" for byte in utf8_bytes)

420 

421 

def is_safe(string: str, safe: str = "/") -> bool:
    """
    Return True if every character in `string` is an unreserved character,
    one of the `safe` characters, or '%'.
    """
    allowed = UNRESERVED_CHARACTERS + safe + "%"
    return all(char in allowed for char in string)

434 

435 

def percent_encoded(string: str, safe: str = "/") -> str:
    """
    Use percent-encoding to quote a string.

    * `string`: The string to be percent-escaped.
    * `safe`: Characters that are left as-is, in addition to the unreserved
      characters.

    A literal '%' is always escaped to '%25' unless it is listed in `safe`.
    It must not be treated as implicitly safe here, or the result would depend
    on whether some other character in the string happened to need escaping.
    """
    non_escaped_chars = UNRESERVED_CHARACTERS + safe

    # Fast path for strings that don't need any escaping.
    if not string.rstrip(non_escaped_chars):
        return string

    return "".join(
        [char if char in non_escaped_chars else percent_encode(char) for char in string]
    )

447 

448 

def quote(string: str, safe: str = "/") -> str:
    """
    Percent-encode a string, leaving any existing '%xx' escape sequences intact.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: A string containing characters that may be treated as safe, and do not
        need to be escaped. Unreserved characters are always treated as safe.
        See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    pieces = []
    cursor = 0
    for escape in PERCENT_ENCODED_REGEX.finditer(string):
        # Quote the plain text that sits before this '%xx' escape sequence.
        if escape.start() != cursor:
            pieces.append(percent_encoded(string[cursor : escape.start()], safe=safe))

        # Carry the '%xx' escape sequence across unchanged.
        pieces.append(escape.group(0))
        cursor = escape.end()

    # Quote whatever follows the final '%xx' escape sequence.
    if cursor != len(string):
        pieces.append(percent_encoded(string[cursor:], safe=safe))

    return "".join(pieces)

480 

481 

def urlencode(items: list[tuple[str, str]]) -> str:
    """
    Encode a list of (key, value) string pairs as a query string.

    We can use a much simpler version of the stdlib urlencode here because
    we don't need to handle a bunch of different typing cases, such as bytes vs str.

    https://github.com/python/cpython/blob/b2f7b2ef0b5421e01efb8c7bee2ef95d3bab77eb/Lib/urllib/parse.py#L926

    Note that we use '%20' encoding for spaces, and '%2F' for '/'.
    This is slightly different than `requests`, but is the behaviour that browsers use.

    See
    - https://github.com/encode/httpx/issues/2536
    - https://github.com/encode/httpx/issues/2721
    - https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
    """
    pairs = []
    for key, value in items:
        # No extra characters are treated as safe.
        encoded_key = percent_encoded(key, safe="")
        encoded_value = percent_encoded(value, safe="")
        pairs.append(encoded_key + "=" + encoded_value)
    return "&".join(pairs)