Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/httpx/

1"""

2An implementation of `urlparse` that provides URL validation and normalization

3as described by RFC3986.

5We rely on this implementation rather than the one in Python's stdlib, because:

7* It provides more complete URL validation.

8* It properly differentiates between an empty querystring and an absent querystring,

9 to distinguish URLs with a trailing '?'.

10* It handles scheme, hostname, port, and path normalization.

11* It supports IDNA hostnames, normalizing them to their encoded form.

12* The API supports passing individual components, as well as the complete URL string.

14Previously we relied on the excellent `rfc3986` package to handle URL parsing and

15validation, but this module provides a simpler alternative, with less indirection

16required.

17"""

19from __future__ import annotations

21import ipaddress

22import re

23import typing

25import idna

27from ._exceptions import InvalidURL

# Upper bound on the length of a URL string, and of each individually
# supplied URL component.
MAX_URL_LENGTH = 64 * 1024

# Characters that are always left unescaped when quoting.
# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._~"
)
SUB_DELIMS = "!$&'()*+,;="

# Matches a single existing '%xx' escape sequence.
PERCENT_ENCODED_REGEX = re.compile(r"%[A-Fa-f0-9]{2}")

# https://url.spec.whatwg.org/#percent-encoded-bytes
#
# Each "safe" string below is the printable ASCII range (U+0020 to U+007E)
# with the characters of the matching WHATWG percent-encode set taken out.
# Control characters fall outside that range, so they are never safe.

# Fragment percent-encode set: SPACE, ("), (<), (>), and (`).
FRAG_SAFE = "".join(
    chr(code) for code in range(0x20, 0x7F) if chr(code) not in ' "<>`'
)

# Query percent-encode set: SPACE, ("), (#), (<), and (>).
QUERY_SAFE = "".join(
    chr(code) for code in range(0x20, 0x7F) if chr(code) not in ' "#<>'
)

# Path percent-encode set: the query set, plus (?), (`), ({), and (}).
PATH_SAFE = "".join(
    chr(code) for code in range(0x20, 0x7F) if chr(code) not in ' "#<>' + "?`{}"
)

# Userinfo percent-encode set: the path set, plus (/), (:), (;), (=), (@),
# ([) to (^) inclusive, and (|). Used for username and password individually.
USERNAME_SAFE = "".join(
    chr(code)
    for code in range(0x20, 0x7F)
    if chr(code) not in ' "#<>' + "?`{}" + "/:;=@[\\]^|"
)
PASSWORD_SAFE = "".join(
    chr(code)
    for code in range(0x20, 0x7F)
    if chr(code) not in ' "#<>' + "?`{}" + "/:;=@[\\]^|"
)

# Note... The WHATWG 'userinfo' percent-encode set applies to the username and
# the password separately. For the joint "username:password" userinfo component
# the (:) delimiter must stay unescaped, so it is kept in the safe set here.
USERINFO_SAFE = "".join(
    chr(code)
    for code in range(0x20, 0x7F)
    if chr(code) not in ' "#<>' + "?`{}" + "/;=@[\\]^|"
)

100

# A URL reference is split into, in order:
#
#   {scheme}:       (optional)
#   //{authority}   (optional)
#   {path}
#   ?{query}        (optional)
#   #{fragment}     (optional)
URL_REGEX = re.compile(
    r"(?:(?P<scheme>([a-zA-Z][a-zA-Z0-9+.-]*)?):)?"
    r"(?://(?P<authority>[^/?#]*))?"
    r"(?P<path>[^?#]*)"
    r"(?:\?(?P<query>[^#]*))?"
    r"(?:#(?P<fragment>.*))?"
)

121

# An authority is split into, in order:
#
#   {userinfo}@   (optional)
#   {host}
#   :{port}       (optional)
AUTHORITY_REGEX = re.compile(
    # userinfo: any character sequence, followed by '@'.
    r"(?:(?P<userinfo>.*)@)?"
    # host: either an IPv6 address enclosed within square brackets,
    # or any character sequence excluding ':' and '@'.
    r"(?P<host>(\[.*\]|[^:@]*))"
    # port: any character sequence.
    r":?(?P<port>.*)?"
)

135

136

# When urlparse is called with an individual component, that component has
# to be regex validated on its own.
# The patterns deliberately repeat the strings used in the regexes above.
COMPONENT_REGEX = {
    name: re.compile(pattern)
    for name, pattern in (
        ("scheme", "([a-zA-Z][a-zA-Z0-9+.-]*)?"),
        ("authority", "[^/?#]*"),
        ("path", "[^?#]*"),
        ("query", "[^#]*"),
        ("fragment", ".*"),
        ("userinfo", "[^@]*"),
        ("host", "(\\[.*\\]|[^:]*)"),
        ("port", ".*"),
    )
}

150

151

# Cheap first-pass checks on the *shape* of a hostname. Anything that matches
# is then handed to the stdlib 'ipaddress' module for real validation.
IPv4_STYLE_HOSTNAME = re.compile(r"^" + r"\.".join(["[0-9]+"] * 4) + r"$")
IPv6_STYLE_HOSTNAME = re.compile("^\\[.*\\]$")

156

157

class ParseResult(typing.NamedTuple):
    """
    The parsed components of a URL, as produced by `urlparse`.

    `query` and `fragment` are `None` when the URL has no "?" / "#" portion,
    and a string (possibly empty) when it does. `port` is `None` when absent.
    """

    scheme: str
    userinfo: str
    host: str
    port: int | None
    path: str
    query: str | None
    fragment: str | None

    @property
    def authority(self) -> str:
        """The "userinfo@host:port" form, omitting any empty parts."""
        prefix = f"{self.userinfo}@" if self.userinfo else ""
        return prefix + self.netloc

    @property
    def netloc(self) -> str:
        """The "host:port" form. Hosts containing ':' are wrapped in "[...]"."""
        host = self.host
        if ":" in host:
            host = f"[{host}]"
        if self.port is None:
            return host
        return f"{host}:{self.port}"

    def copy_with(self, **kwargs: str | None) -> ParseResult:
        """
        Return a new `ParseResult` with the given components replaced.

        With no keyword arguments the instance itself is returned. Otherwise
        the current components, overridden by `kwargs`, are re-parsed.
        """
        if not kwargs:
            return self

        components = {
            "scheme": self.scheme,
            "authority": self.authority,
            "path": self.path,
            "query": self.query,
            "fragment": self.fragment,
            **kwargs,
        }
        return urlparse("", **components)

    def __str__(self) -> str:
        """Reassemble the components into a URL string."""
        pieces = []
        if self.scheme:
            pieces.append(f"{self.scheme}:")
        authority = self.authority
        if authority:
            pieces.append(f"//{authority}")
        pieces.append(self.path)
        # 'None' means "no delimiter at all"; an empty string still emits it.
        if self.query is not None:
            pieces.append(f"?{self.query}")
        if self.fragment is not None:
            pieces.append(f"#{self.fragment}")
        return "".join(pieces)

211

212

def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse, validate and normalize a URL, returning a `ParseResult`.

    * `url`: The URL string to parse. May be left empty when the URL is built
      from keyword arguments alone.
    * `kwargs`: Individual URL components. A component passed here takes
      precedence over the one parsed out of `url`. Besides "scheme",
      "authority", "userinfo", "host", "port", "path", "query" and "fragment",
      the convenience keys "netloc" (host and port), "username" / "password"
      (userinfo) and "raw_path" (path and query) are accepted.

    Raises `InvalidURL` if the URL or a component is too long, contains a
    non-printable ASCII character, or fails component validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid. A single pass reports the first offender.
    for idx, char in enumerate(url):
        if char.isascii() and not char.isprintable():
            error = (
                f"Invalid non-printable ASCII character in URL, {char!r} at position {idx}."
            )
            raise InvalidURL(error)

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        if netloc.startswith("[") and "]" in netloc:
            # Bracketed IPv6 literal, eg. "[::1]:8080". The colons inside the
            # brackets belong to the host, so split after the closing bracket
            # rather than at the first ':'.
            host_end = netloc.index("]") + 1
            remainder = netloc[host_end:]
            kwargs["host"] = netloc[:host_end]
            kwargs["port"] = remainder[1:] if remainder.startswith(":") else remainder
        else:
            kwargs["host"], _, kwargs["port"] = netloc.partition(":")

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "", safe=USERNAME_SAFE)
        password = quote(kwargs.pop("password", "") or "", safe=PASSWORD_SAFE)
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No "?" at all means an absent query, not an empty one.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is None:
            continue

        if len(value) > MAX_URL_LENGTH:
            raise InvalidURL(f"URL component '{key}' too long")

        # If a component includes any ASCII control characters including
        # \t, \r, \n, then treat it as invalid.
        for idx, char in enumerate(value):
            if char.isascii() and not char.isprintable():
                error = (
                    f"Invalid non-printable ASCII character in URL {key} component, "
                    f"{char!r} at position {idx}."
                )
                raise InvalidURL(error)

        # Ensure that keyword arguments match as a valid regex.
        if not COMPONENT_REGEX[key].fullmatch(value):
            raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    frag = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=USERINFO_SAFE)
    parsed_host: str = encode_host(host)
    # Use the normalized scheme, so that the default port is dropped
    # regardless of how the scheme was cased in the input ("HTTP://...:80").
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_scheme or has_authority:
        # Dot segments are only removed for absolute URLs.
        path = normalize_path(path)

    parsed_path: str = quote(path, safe=PATH_SAFE)
    parsed_query: str | None = None if query is None else quote(query, safe=QUERY_SAFE)
    parsed_frag: str | None = None if frag is None else quote(frag, safe=FRAG_SAFE)

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_frag,
    )

346

347

def encode_host(host: str) -> str:
    """
    Validate a URL host and return its normalized ASCII form.

    * An empty host is returned as the empty string.
    * Hosts shaped like `#.#.#.#` must be valid IPv4 addresses.
    * Hosts shaped like `[...]` must be valid IPv6 addresses, and are returned
      *without* the enclosing square brackets.
    * Other ASCII hosts are lowercased and percent-quoted.
    * Anything else is lowercased and IDNA encoded.

    Raises `InvalidURL` for an invalid IPv4, IPv6 or IDNA host.

    See https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        unbracketed = host[1:-1]
        try:
            ipaddress.IPv6Address(unbracketed)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return unbracketed

    if host.isascii():
        # Regular ASCII hostnames.
        #
        # reg-name = *( unreserved / pct-encoded / sub-delims )
        WHATWG_SAFE = '"`{}%|\\'
        return quote(host.lower(), safe=SUB_DELIMS + WHATWG_SAFE)

    # IDNA hostnames.
    try:
        encoded = idna.encode(host.lower())
    except idna.IDNAError:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")
    return encoded.decode("ascii")

393

394

def normalize_port(port: str | int | None, scheme: str) -> int | None:
    """
    Convert a URL port to an integer, dropping it when it is redundant.

    * `port`: The port as a string or integer, or `None` / `""` if absent.
    * `scheme`: The URL scheme, used to look up the default port. Matched
      case-insensitively, since URL schemes are case-insensitive.

    Returns `None` if the port is absent or equals the scheme's default port,
    otherwise the port as an integer.

    Raises `InvalidURL` if the port cannot be converted to an integer.
    """
    # From https://tools.ietf.org/html/rfc3986#section-3.2.3
    #
    # "A scheme may define a default port.  For example, the "http" scheme
    # defines a default port of "80", corresponding to its reserved TCP
    # port number.  The type of port designated by the port number (e.g.,
    # TCP, UDP, SCTP) is defined by the URI scheme.  URI producers and
    # normalizers should omit the port component and its ":" delimiter if
    # port is empty or if its value would be the same as that of the
    # scheme's default."
    if port is None or port == "":
        return None

    try:
        port_as_int = int(port)
    except ValueError as exc:
        raise InvalidURL(f"Invalid port: {port!r}") from exc

    # See https://url.spec.whatwg.org/#url-miscellaneous
    default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get(
        scheme.lower()
    )
    if port_as_int == default_port:
        return None
    return port_as_int

420

421

def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Apply the path rules that depend on whether the URL has a scheme
    and/or an authority component.

    Raises `InvalidURL` if the path breaks one of those rules,
    and returns `None` otherwise.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    if has_authority:
        # "If a URI contains an authority component, then the path component
        # must either be empty or begin with a slash ("/") character."
        if path != "" and path[0] != "/":
            raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
        return

    if has_scheme:
        # The remaining rules are only applied to relative references.
        return

    # Without an authority component, the path cannot begin with two slash
    # characters ("//").
    if path[:2] == "//":
        raise InvalidURL("Relative URLs cannot have a path starting with '//'")

    # A relative-path reference cannot have a colon (":") in its first path
    # segment. Only a leading colon is checked for here.
    if path[:1] == ":":
        raise InvalidURL("Relative URLs cannot have a path starting with ':'")

445

446

def normalize_path(path: str) -> str:
    """
    Remove "." and ".." segments from a URL path.

    A "." segment is dropped. A ".." segment is dropped along with the segment
    before it, except that the leading empty segment of an absolute path is
    never removed.

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # Nothing to do if the path has no '.' characters at all.
    if "." not in path:
        return path

    segments = path.split("/")

    # Nothing to do if '.' only appears inside ordinary segments.
    if not any(segment in (".", "..") for segment in segments):
        return path

    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: list[str] = []
    for segment in segments:
        if segment == ".":
            continue
        if segment != "..":
            kept.append(segment)
        elif kept and kept != [""]:
            kept.pop()
    return "/".join(kept)

476

477

def PERCENT(string: str) -> str:
    """Percent-encode every UTF-8 byte of `string` as an uppercase '%XX' escape."""
    return "".join(map("%{:02X}".format, string.encode("utf-8")))

480

481

def percent_encoded(string: str, safe: str) -> str:
    """
    Percent-encode `string`, leaving unreserved characters and any
    characters listed in `safe` as they are.

    Every other character is escaped, including any '%' that is not in `safe`.
    """
    allowed = UNRESERVED_CHARACTERS + safe

    # Fast path: stripping the allowed characters leaves nothing behind,
    # so the string needs no escaping at all.
    if string.rstrip(allowed) == "":
        return string

    return "".join(
        char if char in allowed else PERCENT(char) for char in string
    )

495

496

def quote(string: str, safe: str) -> str:
    """
    Percent-encode a string, leaving existing '%xx' escape sequences untouched.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: A string containing characters that may be treated as safe, and do not
        need to be escaped. Unreserved characters are always treated as safe.
        See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    pieces: list[str] = []
    cursor = 0
    for escape in PERCENT_ENCODED_REGEX.finditer(string):
        # Escape whatever plain text sits between the previous '%xx'
        # sequence (or the start of the string) and this one.
        if escape.start() != cursor:
            pieces.append(percent_encoded(string[cursor : escape.start()], safe=safe))

        # The '%xx' sequence itself is passed through verbatim.
        pieces.append(escape.group(0))
        cursor = escape.end()

    # Escape any text left after the final '%xx' sequence.
    if cursor != len(string):
        pieces.append(percent_encoded(string[cursor:], safe=safe))

    return "".join(pieces)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/httpx/_urlparse.py: 23%

189 statements