Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/httpx/

1"""

2An implementation of `urlparse` that provides URL validation and normalization

3as described by RFC3986.

5We rely on this implementation rather than the one in Python's stdlib, because:

7* It provides more complete URL validation.

8* It properly differentiates between an empty querystring and an absent querystring,

9 to distinguish URLs with a trailing '?'.

10* It handles scheme, hostname, port, and path normalization.

11* It supports IDNA hostnames, normalizing them to their encoded form.

12* The API supports passing individual components, as well as the complete URL string.

14Previously we relied on the excellent `rfc3986` package to handle URL parsing and

15validation, but this module provides a simpler alternative, with less indirection

16required.

17"""

19from __future__ import annotations

21import ipaddress

22import re

23import typing

25import idna

27from ._exceptions import InvalidURL

# Upper bound on the length of a URL string, and of each individually
# supplied component, that `urlparse` accepts before raising `InvalidURL`.
MAX_URL_LENGTH = 65536

# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
# Characters that `percent_encoded` never escapes, whatever `safe` set is given.
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
)
# Sub-delimiter characters. These are added to the `safe` set whenever a
# URL component is quoted in this module.
SUB_DELIMS = "!$&'()*+,;="

# Matches one existing '%xx' escape sequence ('%' followed by two hex digits),
# so that `quote` can leave already-escaped text untouched.
PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")

# Splits a URL reference into its five top-level components:
#
# {scheme}:        (optional)
# //{authority}    (optional)
# {path}
# ?{query}         (optional)
# #{fragment}      (optional)
#
# Every sub-pattern is able to match the empty string, so the pattern matches
# at the start of any input. The 'query' and 'fragment' groups are `None`
# when their '?' / '#' delimiter is absent, which is how an absent component
# is told apart from an empty one.
URL_REGEX = re.compile(
    (
        r"(?:(?P<scheme>{scheme}):)?"
        r"(?://(?P<authority>{authority}))?"
        r"(?P<path>{path})"
        r"(?:\?(?P<query>{query}))?"
        r"(?:#(?P<fragment>{fragment}))?"
    ).format(
        scheme="([a-zA-Z][a-zA-Z0-9+.-]*)?",
        authority="[^/?#]*",
        path="[^?#]*",
        query="[^#]*",
        fragment=".*",
    )
)

# Splits the authority component into its parts:
#
# {userinfo}@    (optional)
# {host}
# :{port}        (optional)
#
# The patterns are deliberately permissive; the individual parts are
# validated afterwards (see `encode_host` and `normalize_port`).
AUTHORITY_REGEX = re.compile(
    (
        r"(?:(?P<userinfo>{userinfo})@)?" r"(?P<host>{host})" r":?(?P<port>{port})?"
    ).format(
        userinfo=".*",  # Any character sequence.
        host="(\\[.*\\]|[^:@]*)",  # Either any character sequence excluding ':' or '@',
        # or an IPv6 address enclosed within square brackets.
        port=".*",  # Any character sequence.
    )
)

# If we call urlparse with an individual component, then we need to regex
# validate that component individually. `urlparse` looks each keyword
# argument up here by name and requires a full match.
#
# The top-level patterns repeat the ones used in URL_REGEX. Note that the
# "userinfo" and "host" patterns are NOT identical to the ones used inside
# AUTHORITY_REGEX: here "userinfo" excludes '@' and "host" excludes only ':'.
COMPONENT_REGEX = {
    "scheme": re.compile("([a-zA-Z][a-zA-Z0-9+.-]*)?"),
    "authority": re.compile("[^/?#]*"),
    "path": re.compile("[^?#]*"),
    "query": re.compile("[^#]*"),
    "fragment": re.compile(".*"),
    "userinfo": re.compile("[^@]*"),
    "host": re.compile("(\\[.*\\]|[^:]*)"),
    "port": re.compile(".*"),
}

# We use these simple regexes as a first pass before handing off to
# the stdlib 'ipaddress' module for IP address validation.
#
# IPv4 style: four dot-separated runs of digits (octet ranges are not checked here).
IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$")
# IPv6 style: anything enclosed in square brackets.
IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")

class ParseResult(typing.NamedTuple):
    """
    The normalized components of a parsed URL, as produced by `urlparse`.

    `query` and `fragment` are `None` when the component is absent, which is
    distinct from the empty string (a trailing '?' or '#').
    """

    scheme: str
    userinfo: str
    # IPv6 hosts are held without their enclosing square brackets; the
    # brackets are re-added by `authority` and `netloc`.
    host: str
    port: int | None
    path: str
    query: str | None
    fragment: str | None

    @property
    def authority(self) -> str:
        """The '[userinfo@]host[:port]' portion of the URL."""
        prefix = f"{self.userinfo}@" if self.userinfo else ""
        return prefix + self.netloc

    @property
    def netloc(self) -> str:
        """The 'host[:port]' portion of the URL, without any userinfo."""
        location = self.host
        if ":" in location:
            # A colon in the host means an IPv6 literal, which needs brackets.
            location = f"[{location}]"
        if self.port is None:
            return location
        return f"{location}:{self.port}"

    def copy_with(self, **kwargs: str | None) -> ParseResult:
        """
        Return a new `ParseResult` with the given components replaced.

        The current components are used as defaults and the whole set is
        passed back through `urlparse`. With no arguments, `self` is returned.
        """
        if not kwargs:
            return self

        components = {
            "scheme": self.scheme,
            "authority": self.authority,
            "path": self.path,
            "query": self.query,
            "fragment": self.fragment,
            **kwargs,
        }
        return urlparse("", **components)

    def __str__(self) -> str:
        """Reassemble the components into a URL string."""
        pieces = []
        if self.scheme:
            pieces.append(f"{self.scheme}:")
        authority = self.authority
        if authority:
            pieces.append(f"//{authority}")
        pieces.append(self.path)
        # 'None' means "absent"; an empty string still emits its delimiter.
        if self.query is not None:
            pieces.append(f"?{self.query}")
        if self.fragment is not None:
            pieces.append(f"#{self.fragment}")
        return "".join(pieces)

150

151

def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse, validate and normalize a URL, returning a `ParseResult`.

    * `url`: The URL string to parse. May be left empty when the URL is
      described entirely through keyword arguments.
    * `kwargs`: Individual components which take precedence over whatever was
      parsed out of `url`. The directly validated keys are "scheme",
      "authority", "path", "query", "fragment", "userinfo", "host" and "port".
      The convenience keys "netloc", "username", "password" and "raw_path"
      are rewritten into those components before validation.

    Raises `InvalidURL` if the URL or a component is too long, contains a
    non-printable ASCII character, does not match its component regex, or
    fails host, port or path validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid.
    if any(char.isascii() and not char.isprintable() for char in url):
        char = next(char for char in url if char.isascii() and not char.isprintable())
        idx = url.find(char)
        error = (
            f"Invalid non-printable ASCII character in URL, {char!r} at position {idx}."
        )
        raise InvalidURL(error)

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        kwargs["host"], _, kwargs["port"] = netloc.partition(":")

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "")
        password = quote(kwargs.pop("password", "") or "")
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No '?' at all: the query is absent, rather than empty.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is not None:
            if len(value) > MAX_URL_LENGTH:
                raise InvalidURL(f"URL component '{key}' too long")

            # If a component includes any ASCII control characters including \t, \r, \n,
            # then treat it as invalid.
            if any(char.isascii() and not char.isprintable() for char in value):
                char = next(
                    char for char in value if char.isascii() and not char.isprintable()
                )
                idx = value.find(char)
                error = (
                    f"Invalid non-printable ASCII character in URL {key} component, "
                    f"{char!r} at position {idx}."
                )
                raise InvalidURL(error)

            # Ensure that keyword arguments match as a valid regex.
            # NOTE: a key with no entry in COMPONENT_REGEX raises KeyError here.
            if not COMPONENT_REGEX[key].fullmatch(value):
                raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    fragment = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
    parsed_host: str = encode_host(host)
    # Use the lowercased scheme, so that the default port is also dropped for
    # URLs written with an upper or mixed case scheme such as "HTTP://...".
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_scheme or has_authority:
        # Dot segments are only removed for absolute URLs.
        path = normalize_path(path)

    # The GEN_DELIMS set is... : / ? # [ ] @
    # These do not need to be percent-quoted unless they serve as delimiters for the
    # specific component.
    WHATWG_SAFE = '`{}%|^\\"'

    # For 'path' we need to drop ? and # from the GEN_DELIMS set.
    parsed_path: str = quote(path, safe=SUB_DELIMS + WHATWG_SAFE + ":/[]@")
    # For 'query' we need to drop '#' from the GEN_DELIMS set.
    parsed_query: str | None = (
        None
        if query is None
        else quote(query, safe=SUB_DELIMS + WHATWG_SAFE + ":/?[]@")
    )
    # For 'fragment' we can include all of the GEN_DELIMS set.
    parsed_fragment: str | None = (
        None
        if fragment is None
        else quote(fragment, safe=SUB_DELIMS + WHATWG_SAFE + ":/?#[]@")
    )

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_fragment,
    )

301

302

def encode_host(host: str) -> str:
    """
    Validate the host portion of a URL and return its normalized form.

    * An empty host is returned as an empty string.
    * IPv4 style hosts ('#.#.#.#') are validated and returned unchanged.
    * IPv6 style hosts ('[...]') are validated and returned without the
      enclosing square brackets.
    * Other ASCII hosts are lowercased and percent-quoted.
    * Non-ASCII hosts are lowercased and IDNA encoded.

    Raises `InvalidURL` for an invalid IPv4 address, IPv6 address or IDNA
    hostname.
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        address = host[1:-1]
        try:
            ipaddress.IPv6Address(address)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return address

    if not host.isascii():
        # IDNA hostnames
        try:
            return idna.encode(host.lower()).decode("ascii")
        except idna.IDNAError:
            raise InvalidURL(f"Invalid IDNA hostname: {host!r}")

    # Regular ASCII hostnames
    #
    # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
    #
    # reg-name = *( unreserved / pct-encoded / sub-delims )
    whatwg_safe = '"`{}%|\\'
    return quote(host.lower(), safe=SUB_DELIMS + whatwg_safe)

348

349

def normalize_port(port: str | int | None, scheme: str) -> int | None:
    """
    Convert a port to an integer, dropping it when it is the scheme's default.

    * `port`: The port as a string or integer. `None` and the empty string
      both mean "no port".
    * `scheme`: The URL scheme, used to look up the default port. The lookup
      is case-insensitive.

    Returns `None` when there is no port, or when the port equals the default
    for the scheme; otherwise returns the port as an integer.

    Raises `InvalidURL` if the port cannot be converted to an integer.
    """
    # From https://tools.ietf.org/html/rfc3986#section-3.2.3
    #
    # "A scheme may define a default port.  For example, the "http" scheme
    # defines a default port of "80", corresponding to its reserved TCP
    # port number.  The type of port designated by the port number (e.g.,
    # TCP, UDP, SCTP) is defined by the URI scheme.  URI producers and
    # normalizers should omit the port component and its ":" delimiter if
    # port is empty or if its value would be the same as that of the
    # scheme's default."
    if port is None or port == "":
        return None

    try:
        port_as_int = int(port)
    except ValueError:
        raise InvalidURL(f"Invalid port: {port!r}")

    # See https://url.spec.whatwg.org/#url-miscellaneous
    #
    # Schemes are case-insensitive, so compare against the lowercased form.
    # Otherwise "HTTP" + 80 would keep a port that "http" + 80 drops.
    default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get(
        scheme.lower()
    )
    if port_as_int == default_port:
        return None
    return port_as_int

375

376

def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Check the path rules that depend on whether the URL has a scheme
    or an authority component. Raises `InvalidURL` on a violation.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    # If a URI contains an authority component, then the path component
    # must either be empty or begin with a slash ("/") character.
    if has_authority and path and not path.startswith("/"):
        raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")

    # The remaining rules are only applied to URLs with neither a scheme
    # nor an authority.
    if has_scheme or has_authority:
        return

    # If a URI does not contain an authority component, then the path cannot begin
    # with two slash characters ("//").
    if path.startswith("//"):
        raise InvalidURL("Relative URLs cannot have a path starting with '//'")

    # In addition, a URI reference (Section 4.1) may be a relative-path reference,
    # in which case the first path segment cannot contain a colon (":") character.
    if path.startswith(":"):
        raise InvalidURL("Relative URLs cannot have a path starting with ':'")

400

401

def normalize_path(path: str) -> str:
    """
    Remove "." and ".." segments from a URL path.

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # Without any '.' characters there can be no dot segments at all.
    if "." not in path:
        return path

    segments = path.split("/")

    # Dots may still appear only inside ordinary segments, such as "file.txt".
    if not any(segment in (".", "..") for segment in segments):
        return path

    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: list[str] = []
    for segment in segments:
        if segment == "..":
            # Step back one segment, but never above the root: the leading
            # empty segment of an absolute path is always preserved.
            if kept not in ([], [""]):
                kept.pop()
        elif segment != ".":
            kept.append(segment)
    return "/".join(kept)

431

432

def PERCENT(string: str) -> str:
    """
    Percent-escape every UTF-8 byte of `string` as an uppercase '%XX' sequence.
    """
    utf8_bytes = string.encode("utf-8")
    return "".join(f"%{octet:02X}" for octet in utf8_bytes)

435

436

def percent_encoded(string: str, safe: str = "/") -> str:
    """
    Percent-encode every character of `string` that is neither an
    unreserved character nor listed in `safe`.
    """
    allowed = UNRESERVED_CHARACTERS + safe

    # Fast path: stripping allowed characters leaves nothing behind, so the
    # string contains no character that needs escaping.
    if string.rstrip(allowed) == "":
        return string

    return "".join(
        character if character in allowed else PERCENT(character)
        for character in string
    )

450

451

def quote(string: str, safe: str = "/") -> str:
    """
    Percent-encode a string, leaving any existing '%xx' escape sequences as-is.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: Characters that are treated as safe and left unescaped.
      Unreserved characters are always treated as safe.
      See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    pieces = []
    cursor = 0
    for escape in PERCENT_ENCODED_REGEX.finditer(string):
        # Encode the plain text lying between the previous escape and this one.
        if escape.start() > cursor:
            pieces.append(percent_encoded(string[cursor : escape.start()], safe=safe))

        # Keep the existing '%xx' escape sequence verbatim.
        pieces.append(escape.group(0))
        cursor = escape.end()

    # Encode whatever follows the final '%xx' escape sequence.
    if cursor < len(string):
        pieces.append(percent_encoded(string[cursor:], safe=safe))

    return "".join(pieces)

483

484

def urlencode(items: list[tuple[str, str]]) -> str:
    """
    Encode a list of (key, value) string pairs as a URL query string.

    This is a much simpler version of the stdlib `urlencode`, because there
    is no need to handle different typing cases such as bytes vs str.

    https://github.com/python/cpython/blob/b2f7b2ef0b5421e01efb8c7bee2ef95d3bab77eb/Lib/urllib/parse.py#L926

    Keys and values are percent-encoded with an empty `safe` set, so spaces
    become '%20' and '/' becomes '%2F'. This is slightly different from
    `requests`, but is the behaviour that browsers use.

    See
    - https://github.com/encode/httpx/issues/2536
    - https://github.com/encode/httpx/issues/2721
    - https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
    """
    encoded_pairs = (
        percent_encoded(key, safe="") + "=" + percent_encoded(value, safe="")
        for key, value in items
    )
    return "&".join(encoded_pairs)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/httpx/_urlparse.py: 21%

185 statements