Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/httpx/_urlparse.py: 23%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

189 statements  

1""" 

2An implementation of `urlparse` that provides URL validation and normalization 

3as described by RFC3986. 

4 

5We rely on this implementation rather than the one in Python's stdlib, because: 

6 

7* It provides more complete URL validation. 

8* It properly differentiates between an empty querystring and an absent querystring, 

9 to distinguish URLs with a trailing '?'. 

10* It handles scheme, hostname, port, and path normalization. 

11* It supports IDNA hostnames, normalizing them to their encoded form. 

12* The API supports passing individual components, as well as the complete URL string. 

13 

14Previously we relied on the excellent `rfc3986` package to handle URL parsing and 

15validation, but this module provides a simpler alternative, with less indirection 

16required. 

17""" 

18 

19from __future__ import annotations 

20 

21import ipaddress 

22import re 

23import typing 

24 

25import idna 

26 

27from ._exceptions import InvalidURL 

28 

# Hard upper bound on the number of characters accepted in a URL string.
MAX_URL_LENGTH = 65536

# Characters that never need percent-encoding.
# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._~"
)
SUB_DELIMS = "!$&'()*+,;="

# Matches one existing '%xx' escape sequence.
PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")

# https://url.spec.whatwg.org/#percent-encoded-bytes
#
# Each "*_SAFE" string below holds the printable ASCII characters
# (U+0020 to U+007E) that are *not* members of the matching percent-encode
# set, i.e. the characters that may be left unescaped in that component.

# The fragment percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+003C (<), U+003E (>), and U+0060 (`).
FRAG_SAFE = "".join(
    char for char in map(chr, range(0x20, 0x7F)) if char not in ' "<>`'
)

# The query percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+0023 (#), U+003C (<), and U+003E (>).
QUERY_SAFE = "".join(
    char for char in map(chr, range(0x20, 0x7F)) if char not in ' "#<>'
)

# The path percent-encode set is the query percent-encode set
# and U+003F (?), U+0060 (`), U+007B ({), and U+007D (}).
PATH_SAFE = "".join(
    char for char in map(chr, range(0x20, 0x7F)) if char not in ' "#<>?`{}'
)

# The userinfo percent-encode set is the path percent-encode set
# and U+002F (/), U+003A (:), U+003B (;), U+003D (=), U+0040 (@),
# U+005B ([) to U+005E (^), inclusive, and U+007C (|).
USERNAME_SAFE = "".join(
    char
    for char in map(chr, range(0x20, 0x7F))
    if char not in ' "#<>?`{}/:;=@[\\]^|'
)
PASSWORD_SAFE = "".join(
    char
    for char in map(chr, range(0x20, 0x7F))
    if char not in ' "#<>?`{}/:;=@[\\]^|'
)
# Note... The terminology 'userinfo' percent-encode set in the WHATWG document
# is used for the username and password quoting. For the joint
# "username:password" userinfo component the U+003A (:) separator must be
# left unescaped, so here ':' is kept as a safe character.
USERINFO_SAFE = "".join(
    char
    for char in map(chr, range(0x20, 0x7F))
    if char not in ' "#<>?`{}/;=@[\\]^|'
)

99 

100 

# A URL reference is split into the following parts:
#
# {scheme}:      (optional)
# //{authority}  (optional)
# {path}
# ?{query}       (optional)
# #{fragment}    (optional)
URL_REGEX = re.compile(
    r"(?:(?P<scheme>([a-zA-Z][a-zA-Z0-9+.-]*)?):)?"
    r"(?://(?P<authority>[^/?#]*))?"
    r"(?P<path>[^?#]*)"
    r"(?:\?(?P<query>[^#]*))?"
    r"(?:#(?P<fragment>.*))?"
)

# An authority is split into the following parts:
#
# {userinfo}@    (optional)
# {host}
# :{port}        (optional)
AUTHORITY_REGEX = re.compile(
    # userinfo: any character sequence, up to the last '@'.
    r"(?:(?P<userinfo>.*)@)?"
    # host: either an IPv6 address enclosed within square brackets,
    # or any character sequence excluding ':' and '@'.
    r"(?P<host>(\[.*\]|[^:@]*))"
    # port: any character sequence.
    r":?(?P<port>.*)?"
)


# If we call urlparse with an individual component, then we need to regex
# validate that component individually, so the per-component patterns are
# repeated here and applied with `fullmatch`.
COMPONENT_REGEX = {
    name: re.compile(pattern)
    for name, pattern in (
        ("scheme", r"([a-zA-Z][a-zA-Z0-9+.-]*)?"),
        ("authority", r"[^/?#]*"),
        ("path", r"[^?#]*"),
        ("query", r"[^#]*"),
        ("fragment", r".*"),
        ("userinfo", r"[^@]*"),
        ("host", r"(\[.*\]|[^:]*)"),
        ("port", r".*"),
    )
}


# We use these simple regexs as a first pass before handing off to
# the stdlib 'ipaddress' module for IP address validation.
IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$")
IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")

156 

157 

class ParseResult(typing.NamedTuple):
    """
    The individual components of a parsed URL.

    `query` and `fragment` are `None` when the URL has no '?' or '#' portion,
    and a (possibly empty) string when it does. `host` is stored without the
    square brackets of an IPv6 literal; they are re-added when rendering.
    """

    scheme: str
    userinfo: str
    host: str
    port: int | None
    path: str
    query: str | None
    fragment: str | None

    @property
    def authority(self) -> str:
        """The `[userinfo@]host[:port]` portion of the URL."""
        prefix = f"{self.userinfo}@" if self.userinfo else ""
        return prefix + self.netloc

    @property
    def netloc(self) -> str:
        """The `host[:port]` portion of the URL."""
        host = self.host
        # A ':' in the host means an IPv6 address, which needs "[...]" escaping.
        if ":" in host:
            host = f"[{host}]"
        if self.port is None:
            return host
        return f"{host}:{self.port}"

    def copy_with(self, **kwargs: str | None) -> ParseResult:
        """
        Return a new result with the given components replaced.

        Accepts the same keyword arguments as `urlparse`. With no keyword
        arguments the instance itself is returned.
        """
        if not kwargs:
            return self

        components = {
            "scheme": self.scheme,
            "authority": self.authority,
            "path": self.path,
            "query": self.query,
            "fragment": self.fragment,
            **kwargs,
        }
        return urlparse("", **components)

    def __str__(self) -> str:
        """Render the components back into a URL string."""
        pieces = []
        if self.scheme:
            pieces.append(f"{self.scheme}:")
        authority = self.authority
        if authority:
            pieces.append(f"//{authority}")
        pieces.append(self.path)
        # `None` means "absent"; an empty string still renders its delimiter.
        if self.query is not None:
            pieces.append(f"?{self.query}")
        if self.fragment is not None:
            pieces.append(f"#{self.fragment}")
        return "".join(pieces)

211 

212 

def _find_non_printable_ascii(text: str) -> tuple[int, str] | None:
    """Return `(index, char)` of the first non-printable ASCII character, if any."""
    for idx, char in enumerate(text):
        if char.isascii() and not char.isprintable():
            return idx, char
    return None


def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse, validate and normalize a URL into a `ParseResult`.

    * `url`: The URL string to parse. May be empty when the URL is being
      built purely from keyword components.
    * `kwargs`: Individual components that override whatever was parsed from
      `url`. The directly supported keys are "scheme", "authority", "path",
      "query", "fragment", "userinfo", "host" and "port" (which may also be
      given as an integer). The convenience keys "netloc", "username",
      "password" and "raw_path" are translated into those components first.

    Raises `InvalidURL` if the URL or a component is too long, contains a
    non-printable ASCII character, or fails validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid.
    offender = _find_non_printable_ascii(url)
    if offender is not None:
        idx, char = offender
        error = (
            f"Invalid non-printable ASCII character in URL, {char!r} at position {idx}."
        )
        raise InvalidURL(error)

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        if netloc.startswith("["):
            # Bracketed IPv6 literal, e.g. "[::1]:8080". The address itself
            # contains ':' characters, so the port delimiter is the ':' that
            # follows the closing bracket rather than the first ':'.
            address, bracket, remainder = netloc.partition("]")
            kwargs["host"] = address + bracket
            kwargs["port"] = remainder[1:] if remainder.startswith(":") else remainder
        else:
            kwargs["host"], _, kwargs["port"] = netloc.partition(":")

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "", safe=USERNAME_SAFE)
        password = quote(kwargs.pop("password", "") or "", safe=PASSWORD_SAFE)
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No '?' at all: the query is absent rather than empty.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is None:
            continue

        if len(value) > MAX_URL_LENGTH:
            raise InvalidURL(f"URL component '{key}' too long")

        # If a component includes any ASCII control characters including \t, \r, \n,
        # then treat it as invalid.
        offender = _find_non_printable_ascii(value)
        if offender is not None:
            idx, char = offender
            error = (
                f"Invalid non-printable ASCII character in URL {key} component, "
                f"{char!r} at position {idx}."
            )
            raise InvalidURL(error)

        # Ensure that keyword arguments match as a valid regex.
        if not COMPONENT_REGEX[key].fullmatch(value):
            raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    frag = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=USERINFO_SAFE)
    parsed_host: str = encode_host(host)
    # Use the lowercased scheme so that e.g. "HTTP://host:80" also has its
    # default port dropped.
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    # Dot segments are only resolved for absolute URLs.
    if has_scheme or has_authority:
        path = normalize_path(path)

    parsed_path: str = quote(path, safe=PATH_SAFE)
    parsed_query: str | None = None if query is None else quote(query, safe=QUERY_SAFE)
    parsed_frag: str | None = None if frag is None else quote(frag, safe=FRAG_SAFE)

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_frag,
    )

346 

347 

def encode_host(host: str) -> str:
    """
    Validate and normalize the host component of a URL.

    * An empty host is returned as the empty string.
    * Hosts shaped like `#.#.#.#` must be valid IPv4 addresses.
    * Hosts shaped like `[...]` must be valid IPv6 addresses, and are
      returned without the enclosing square brackets.
    * Other ASCII hosts are lowercased and percent-quoted.
    * Anything else is lowercased and IDNA encoded.

    Raises `InvalidURL` for an invalid IPv4 address, IPv6 address,
    or IDNA hostname.
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        address = host[1:-1]
        try:
            ipaddress.IPv6Address(address)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return address

    lowered = host.lower()

    if host.isascii():
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # reg-name = *( unreserved / pct-encoded / sub-delims )
        #
        # A few further characters are also left unescaped.
        extra_safe = '"`{}%|\\'
        return quote(lowered, safe=SUB_DELIMS + extra_safe)

    # Non-ASCII hostnames are IDNA encoded.
    try:
        return idna.encode(lowered).decode("ascii")
    except idna.IDNAError:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")

393 

394 

# Default ports of the special schemes.
# See https://url.spec.whatwg.org/#url-miscellaneous
_DEFAULT_PORTS = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}


def normalize_port(port: str | int | None, scheme: str) -> int | None:
    """
    Convert a port to an integer, dropping it when it is redundant.

    * `port`: The port as a string or integer. `None` and `""` mean "no port".
    * `scheme`: The URL scheme, used to look up the default port. The lookup
      is case-insensitive, since schemes are case-insensitive.

    Returns `None` when no port is given or when it equals the default port
    of the scheme, and the integer port otherwise.

    Raises `InvalidURL` if the port cannot be converted to an integer.
    """
    # From https://tools.ietf.org/html/rfc3986#section-3.2.3
    #
    # "A scheme may define a default port.  For example, the "http" scheme
    # defines a default port of "80", corresponding to its reserved TCP
    # port number.  The type of port designated by the port number (e.g.,
    # TCP, UDP, SCTP) is defined by the URI scheme.  URI producers and
    # normalizers should omit the port component and its ":" delimiter if
    # port is empty or if its value would be the same as that of the
    # scheme's default."
    if port is None or port == "":
        return None

    try:
        port_as_int = int(port)
    except ValueError:
        raise InvalidURL(f"Invalid port: {port!r}")

    # Lowercase before the lookup so that "HTTP" and "http" behave the same.
    if port_as_int == _DEFAULT_PORTS.get(scheme.lower()):
        return None
    return port_as_int

420 

421 

def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Apply the path rules that depend on whether the URL has a scheme
    or an authority component.

    Raises `InvalidURL` when a rule is violated, and returns `None` otherwise.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    # "If a URI contains an authority component, then the path component
    # must either be empty or begin with a slash ("/") character."
    if has_authority and path and not path.startswith("/"):
        raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")

    # The remaining rules only apply when there is neither a scheme
    # nor an authority.
    if has_scheme or has_authority:
        return

    # "If a URI does not contain an authority component, then the path cannot
    # begin with two slash characters ("//")."
    if path.startswith("//"):
        raise InvalidURL("Relative URLs cannot have a path starting with '//'")

    # A relative-path reference may not have a colon in its first path segment.
    # Only a leading ':' is checked here.
    if path.startswith(":"):
        raise InvalidURL("Relative URLs cannot have a path starting with ':'")

445 

446 

def normalize_path(path: str) -> str:
    """
    Resolve "." and ".." segments in a URL path.

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"

    A ".." never removes the empty leading segment of an absolute path,
    and is ignored when there is nothing left to remove.
    """
    # Without any '.' character there cannot be any dot segments.
    if "." not in path:
        return path

    segments = path.split("/")

    # Dots may still only appear inside ordinary segments, such as "a.b".
    if not any(segment in (".", "..") for segment in segments):
        return path

    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: list[str] = []
    for segment in segments:
        if segment == ".":
            continue
        if segment != "..":
            kept.append(segment)
        elif kept and kept != [""]:
            kept.pop()
    return "/".join(kept)

476 

477 

def PERCENT(string: str) -> str:
    """Encode every UTF-8 byte of `string` as an uppercase '%XX' escape."""
    return "".join("%{:02X}".format(octet) for octet in string.encode("utf-8"))

480 

481 

def percent_encoded(string: str, safe: str) -> str:
    """
    Percent-encode every character of `string` that is neither an
    unreserved character nor listed in `safe`.

    Existing '%' characters are not treated specially here.
    """
    allowed = UNRESERVED_CHARACTERS + safe

    # Fast path: stripping the allowed characters leaves nothing behind
    # exactly when every character is allowed, so nothing needs escaping.
    if string.rstrip(allowed) == "":
        return string

    pieces = []
    for char in string:
        if char in allowed:
            pieces.append(char)
        else:
            pieces.append(PERCENT(char))
    return "".join(pieces)

495 

496 

def quote(string: str, safe: str) -> str:
    """
    Percent-encode a string, leaving existing '%xx' escape sequences untouched.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: Characters that are treated as safe and are not escaped.
      Unreserved characters are always treated as safe.
      See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    pieces: list[str] = []
    cursor = 0

    for escape in PERCENT_ENCODED_REGEX.finditer(string):
        begin = escape.start()
        # Encode the plain text between the previous escape and this one.
        if begin > cursor:
            pieces.append(percent_encoded(string[cursor:begin], safe=safe))

        # Copy the '%xx' escape sequence through unchanged.
        pieces.append(escape.group(0))
        cursor = escape.end()

    # Encode whatever follows the final '%xx' escape sequence.
    if cursor < len(string):
        pieces.append(percent_encoded(string[cursor:], safe=safe))

    return "".join(pieces)