Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/httpx/_urlparse.py: 21%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

185 statements  

1""" 

2An implementation of `urlparse` that provides URL validation and normalization 

3as described by RFC3986. 

4 

5We rely on this implementation rather than the one in Python's stdlib, because: 

6 

7* It provides more complete URL validation. 

8* It properly differentiates between an empty querystring and an absent querystring, 

9 to distinguish URLs with a trailing '?'. 

10* It handles scheme, hostname, port, and path normalization. 

11* It supports IDNA hostnames, normalizing them to their encoded form. 

12* The API supports passing individual components, as well as the complete URL string. 

13 

14Previously we relied on the excellent `rfc3986` package to handle URL parsing and 

15validation, but this module provides a simpler alternative, with less indirection 

16required. 

17""" 

18 

19from __future__ import annotations 

20 

21import ipaddress 

22import re 

23import typing 

24 

25import idna 

26 

27from ._exceptions import InvalidURL 

28 

# Upper bound applied to the full URL string and to each keyword component.
MAX_URL_LENGTH = 65536

# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
)
SUB_DELIMS = "!$&'()*+,;="

# Matches an existing '%xx' escape sequence.
PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")

# Sub-patterns shared by the whole-URL regexes and the per-component regexes,
# so that each one is only spelled out once.
_SCHEME_PATTERN = "([a-zA-Z][a-zA-Z0-9+.-]*)?"
_AUTHORITY_PATTERN = "[^/?#]*"
_PATH_PATTERN = "[^?#]*"
_QUERY_PATTERN = "[^#]*"
_FRAGMENT_PATTERN = ".*"
_PORT_PATTERN = ".*"  # Any character sequence.

# {scheme}:           (optional)
# //{authority}       (optional)
# {path}
# ?{query}            (optional)
# #{fragment}         (optional)
URL_REGEX = re.compile(
    rf"(?:(?P<scheme>{_SCHEME_PATTERN}):)?"
    rf"(?://(?P<authority>{_AUTHORITY_PATTERN}))?"
    rf"(?P<path>{_PATH_PATTERN})"
    rf"(?:\?(?P<query>{_QUERY_PATTERN}))?"
    rf"(?:#(?P<fragment>{_FRAGMENT_PATTERN}))?"
)

# {userinfo}@         (optional)
# {host}
# :{port}             (optional)
#
# Within an authority the userinfo is any character sequence, and the host is
# either an IPv6 address enclosed within square brackets, or any character
# sequence excluding ':' or '@'.
AUTHORITY_REGEX = re.compile(
    r"(?:(?P<userinfo>.*)@)?"
    r"(?P<host>(\[.*\]|[^:@]*))"
    rf":?(?P<port>{_PORT_PATTERN})?"
)


# If we call urlparse with an individual component, then we need to regex
# validate that component individually.
# Note that the standalone "userinfo" and "host" patterns differ from the ones
# embedded in AUTHORITY_REGEX: userinfo excludes '@', and host excludes only ':'.
COMPONENT_REGEX = {
    "scheme": re.compile(_SCHEME_PATTERN),
    "authority": re.compile(_AUTHORITY_PATTERN),
    "path": re.compile(_PATH_PATTERN),
    "query": re.compile(_QUERY_PATTERN),
    "fragment": re.compile(_FRAGMENT_PATTERN),
    "userinfo": re.compile("[^@]*"),
    "host": re.compile(r"(\[.*\]|[^:]*)"),
    "port": re.compile(_PORT_PATTERN),
}


# We use these simple regexs as a first pass before handing off to
# the stdlib 'ipaddress' module for IP address validation.
IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$")
IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")

95 

96 

97class ParseResult(typing.NamedTuple): 

98 scheme: str 

99 userinfo: str 

100 host: str 

101 port: int | None 

102 path: str 

103 query: str | None 

104 fragment: str | None 

105 

106 @property 

107 def authority(self) -> str: 

108 return "".join( 

109 [ 

110 f"{self.userinfo}@" if self.userinfo else "", 

111 f"[{self.host}]" if ":" in self.host else self.host, 

112 f":{self.port}" if self.port is not None else "", 

113 ] 

114 ) 

115 

116 @property 

117 def netloc(self) -> str: 

118 return "".join( 

119 [ 

120 f"[{self.host}]" if ":" in self.host else self.host, 

121 f":{self.port}" if self.port is not None else "", 

122 ] 

123 ) 

124 

125 def copy_with(self, **kwargs: str | None) -> ParseResult: 

126 if not kwargs: 

127 return self 

128 

129 defaults = { 

130 "scheme": self.scheme, 

131 "authority": self.authority, 

132 "path": self.path, 

133 "query": self.query, 

134 "fragment": self.fragment, 

135 } 

136 defaults.update(kwargs) 

137 return urlparse("", **defaults) 

138 

139 def __str__(self) -> str: 

140 authority = self.authority 

141 return "".join( 

142 [ 

143 f"{self.scheme}:" if self.scheme else "", 

144 f"//{authority}" if authority else "", 

145 self.path, 

146 f"?{self.query}" if self.query is not None else "", 

147 f"#{self.fragment}" if self.fragment is not None else "", 

148 ] 

149 ) 

150 

151 

def _first_control_character(text: str) -> tuple[int, str] | None:
    """Return `(index, char)` of the first non-printable ASCII character, if any."""
    for index, char in enumerate(text):
        if char.isascii() and not char.isprintable():
            return index, char
    return None


def _split_netloc(netloc: str) -> tuple[str, str]:
    """
    Split a 'host[:port]' string into `(host, port)`.

    A bracketed IPv6 host such as '[::1]:8080' is kept intact, rather than
    being split on the first ':' inside the address.
    """
    if netloc.startswith("["):
        host, closing, rest = netloc.partition("]")
        if closing:
            port = rest[1:] if rest.startswith(":") else rest
            return host + closing, port
    host, _, port = netloc.partition(":")
    return host, port


def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse, validate, and normalize a URL, returning a `ParseResult`.

    * `url`: The complete URL string. May be empty when the URL is built
      from keyword components.
    * `kwargs`: Individual components, which take precedence over whatever was
      parsed from `url`. The validated components are "scheme", "authority",
      "path", "query", "fragment", "userinfo", "host" and "port". The
      convenience keys "netloc", "username", "password" and "raw_path" are
      rewritten into those components first.

    Raises `InvalidURL` if the URL or a component is too long, contains an
    ASCII control character, fails its component regex, or fails host, port
    or path validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid.
    found = _first_control_character(url)
    if found is not None:
        idx, char = found
        error = (
            f"Invalid non-printable ASCII character in URL, {char!r} at position {idx}."
        )
        raise InvalidURL(error)

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        kwargs["host"], kwargs["port"] = _split_netloc(netloc)

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "")
        password = quote(kwargs.pop("password", "") or "")
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No '?' at all means an absent query, not an empty one.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is None:
            continue

        if len(value) > MAX_URL_LENGTH:
            raise InvalidURL(f"URL component '{key}' too long")

        # If a component includes any ASCII control characters including \t, \r, \n,
        # then treat it as invalid.
        found = _first_control_character(value)
        if found is not None:
            idx, char = found
            error = (
                f"Invalid non-printable ASCII character in URL {key} component, "
                f"{char!r} at position {idx}."
            )
            raise InvalidURL(error)

        # Ensure that keyword arguments match as a valid regex.
        if not COMPONENT_REGEX[key].fullmatch(value):
            raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    fragment = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII strings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
    parsed_host: str = encode_host(host)
    # Use the lowercased scheme, so that the default port is also dropped
    # for URLs such as "HTTP://example.com:80".
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_scheme or has_authority:
        path = normalize_path(path)

    # The GEN_DELIMS set is... : / ? # [ ] @
    # These do not need to be percent-quoted unless they serve as delimiters for the
    # specific component.
    WHATWG_SAFE = '`{}%|^\\"'

    # For 'path' we need to drop ? and # from the GEN_DELIMS set.
    parsed_path: str = quote(path, safe=SUB_DELIMS + WHATWG_SAFE + ":/[]@")
    # For 'query' we need to drop '#' from the GEN_DELIMS set.
    parsed_query: str | None = (
        None
        if query is None
        else quote(query, safe=SUB_DELIMS + WHATWG_SAFE + ":/?[]@")
    )
    # For 'fragment' we can include all of the GEN_DELIMS set.
    parsed_fragment: str | None = (
        None
        if fragment is None
        else quote(fragment, safe=SUB_DELIMS + WHATWG_SAFE + ":/?#[]@")
    )

    # The parsed ASCII strings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_fragment,
    )

301 

302 

def encode_host(host: str) -> str:
    """
    Validate and normalize the host portion of a URL.

    * An empty host is returned unchanged.
    * '#.#.#.#' style hosts are validated as IPv4 and returned unchanged.
    * '[...]' style hosts are validated as IPv6 and returned without brackets.
    * Other ASCII hosts are lowercased and percent-quoted.
    * Anything else is lowercased and IDNA encoded.

    Raises `InvalidURL` for an invalid IPv4 address, IPv6 address, or
    IDNA hostname.
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        unbracketed = host[1:-1]
        try:
            ipaddress.IPv6Address(unbracketed)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return unbracketed

    lowered = host.lower()

    if host.isascii():
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # reg-name = *( unreserved / pct-encoded / sub-delims )
        whatwg_safe = '"`{}%|\\'
        return quote(lowered, safe=SUB_DELIMS + whatwg_safe)

    # Non-ASCII hostnames are IDNA encoded.
    try:
        return idna.encode(lowered).decode("ascii")
    except idna.IDNAError:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")

348 

349 

350def normalize_port(port: str | int | None, scheme: str) -> int | None: 

351 # From https://tools.ietf.org/html/rfc3986#section-3.2.3 

352 # 

353 # "A scheme may define a default port. For example, the "http" scheme 

354 # defines a default port of "80", corresponding to its reserved TCP 

355 # port number. The type of port designated by the port number (e.g., 

356 # TCP, UDP, SCTP) is defined by the URI scheme. URI producers and 

357 # normalizers should omit the port component and its ":" delimiter if 

358 # port is empty or if its value would be the same as that of the 

359 # scheme's default." 

360 if port is None or port == "": 

361 return None 

362 

363 try: 

364 port_as_int = int(port) 

365 except ValueError: 

366 raise InvalidURL(f"Invalid port: {port!r}") 

367 

368 # See https://url.spec.whatwg.org/#url-miscellaneous 

369 default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get( 

370 scheme 

371 ) 

372 if port_as_int == default_port: 

373 return None 

374 return port_as_int 

375 

376 

def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Check the path rules that depend on whether the URL has a scheme or
    an authority component. Raises `InvalidURL` when a rule is broken.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    if has_authority:
        # "If a URI contains an authority component, then the path component
        # must either be empty or begin with a slash ("/") character."
        if path[:1] not in ("", "/"):
            raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
        return

    if has_scheme:
        # The remaining checks only apply to relative URLs.
        return

    # "If a URI does not contain an authority component, then the path cannot
    # begin with two slash characters ("//")."
    if path.startswith("//"):
        raise InvalidURL("Relative URLs cannot have a path starting with '//'")

    # "In addition, a URI reference (Section 4.1) may be a relative-path reference,
    # in which case the first path segment cannot contain a colon (":") character."
    # Only a leading ':' is rejected here.
    if path.startswith(":"):
        raise InvalidURL("Relative URLs cannot have a path starting with ':'")

400 

401 

def normalize_path(path: str) -> str:
    """
    Remove "." and ".." segments from a URL path.

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    segments = path.split("/")

    # Nothing to do unless some segment is exactly "." or "..".
    if not any(segment in (".", "..") for segment in segments):
        return path

    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: list[str] = []
    for segment in segments:
        if segment == ".":
            continue
        if segment != "..":
            kept.append(segment)
        elif kept and kept != [""]:
            # Step up one level, but never above the root of the path.
            kept.pop()
    return "/".join(kept)

431 

432 

def PERCENT(string: str) -> str:
    """Percent-encode every UTF-8 byte of `string` as an uppercase '%XX' escape."""
    return "".join(map("%{:02X}".format, string.encode("utf-8")))

435 

436 

def percent_encoded(string: str, safe: str = "/") -> str:
    """
    Use percent-encoding to quote a string.

    Unreserved characters and the characters in `safe` are kept as they are.
    Every other character is replaced by its '%XX' escape.
    """
    allowed = UNRESERVED_CHARACTERS + safe

    # Fast path: stripping allowed characters leaves nothing to escape.
    if string.rstrip(allowed) == "":
        return string

    pieces = []
    for char in string:
        if char in allowed:
            pieces.append(char)
        else:
            pieces.append(PERCENT(char))
    return "".join(pieces)

450 

451 

def quote(string: str, safe: str = "/") -> str:
    """
    Use percent-encoding to quote a string, omitting existing '%xx' escape sequences.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: A string containing characters that may be treated as safe, and do not
        need to be escaped. Unreserved characters are always treated as safe.
        See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    pieces = []
    cursor = 0
    for escape in PERCENT_ENCODED_REGEX.finditer(string):
        # Quote any text between the previous escape and this one.
        if escape.start() > cursor:
            pieces.append(percent_encoded(string[cursor : escape.start()], safe=safe))

        # Existing '%xx' escape sequences are passed through untouched.
        pieces.append(escape.group(0))
        cursor = escape.end()

    # Quote any text after the final '%xx' escape sequence.
    if cursor < len(string):
        pieces.append(percent_encoded(string[cursor:], safe=safe))

    return "".join(pieces)

483 

484 

def urlencode(items: list[tuple[str, str]]) -> str:
    """
    Encode `(key, value)` pairs as a query string of the form 'k=v&k=v'.

    We can use a much simpler version of the stdlib urlencode here because
    we don't need to handle a bunch of different typing cases, such as bytes vs str.

    https://github.com/python/cpython/blob/b2f7b2ef0b5421e01efb8c7bee2ef95d3bab77eb/Lib/urllib/parse.py#L926

    Note that we use '%20' encoding for spaces, and '%2F' for '/'.
    This is slightly different than `requests`, but is the behaviour that browsers use.

    See
    - https://github.com/encode/httpx/issues/2536
    - https://github.com/encode/httpx/issues/2721
    - https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
    """
    pairs = []
    for key, value in items:
        # No characters beyond the unreserved set are treated as safe here.
        encoded_key = percent_encoded(key, safe="")
        encoded_value = percent_encoded(value, safe="")
        pairs.append(encoded_key + "=" + encoded_value)
    return "&".join(pairs)