Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/httpx/

1"""

2An implementation of `urlparse` that provides URL validation and normalization

3as described by RFC3986.

5We rely on this implementation rather than the one in Python's stdlib, because:

7* It provides more complete URL validation.

8* It properly differentiates between an empty querystring and an absent querystring,

9 to distinguish URLs with a trailing '?'.

10* It handles scheme, hostname, port, and path normalization.

11* It supports IDNA hostnames, normalizing them to their encoded form.

12* The API supports passing individual components, as well as the complete URL string.

14Previously we relied on the excellent `rfc3986` package to handle URL parsing and

15validation, but this module provides a simpler alternative, with less indirection

16required.

17"""

18from __future__ import annotations

20import ipaddress

21import re

22import typing

24import idna

26from ._exceptions import InvalidURL

# Maximum allowable length, in characters, of a URL string.
# The same limit is applied to each individually supplied URL component.
MAX_URL_LENGTH = 65_536

# The "unreserved" character set. These never require percent-encoding.
# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._~"
)

# The "sub-delims" character set.
# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.2
SUB_DELIMS = "!$&'()*+,;="

# Matches a single existing '%xx' escape sequence (two hex digits).
PERCENT_ENCODED_REGEX = re.compile(r"%[A-Fa-f0-9]{2}")

# Regex fragments for individual URL components. Each fragment is used both
# inside the composite regexes and for validating a component on its own.
_SCHEME_PATTERN = "([a-zA-Z][a-zA-Z0-9+.-]*)?"
_AUTHORITY_PATTERN = "[^/?#]*"
_PATH_PATTERN = "[^?#]*"
_QUERY_PATTERN = "[^#]*"
_FRAGMENT_PATTERN = ".*"
_PORT_PATTERN = ".*"  # Any character sequence.

# A complete URL is structured as...
#
# {scheme}:      (optional)
# //{authority}  (optional)
# {path}
# ?{query}       (optional)
# #{fragment}    (optional)
URL_REGEX = re.compile(
    f"(?:(?P<scheme>{_SCHEME_PATTERN}):)?"
    f"(?://(?P<authority>{_AUTHORITY_PATTERN}))?"
    f"(?P<path>{_PATH_PATTERN})"
    rf"(?:\?(?P<query>{_QUERY_PATTERN}))?"
    f"(?:#(?P<fragment>{_FRAGMENT_PATTERN}))?"
)

# The authority component is structured as...
#
# {userinfo}@  (optional)
# {host}
# :{port}      (optional)
AUTHORITY_REGEX = re.compile(
    # userinfo: Any character sequence.
    r"(?:(?P<userinfo>.*)@)?"
    # host: Either an IPv6 address enclosed within square brackets,
    # or any character sequence excluding ':' or '@'.
    r"(?P<host>(\[.*\]|[^:@]*))"
    # port: Any character sequence.
    f":?(?P<port>{_PORT_PATTERN})?"
)


# If we call urlparse with an individual component, then we need to regex
# validate that component individually.
COMPONENT_REGEX = {
    name: re.compile(pattern)
    for name, pattern in (
        ("scheme", _SCHEME_PATTERN),
        ("authority", _AUTHORITY_PATTERN),
        ("path", _PATH_PATTERN),
        ("query", _QUERY_PATTERN),
        ("fragment", _FRAGMENT_PATTERN),
        ("userinfo", "[^@]*"),
        ("host", r"(\[.*\]|[^:]*)"),
        ("port", _PORT_PATTERN),
    )
}

# Cheap first-pass checks for hosts that look like IP address literals.
# Hosts matching these are then validated properly using the stdlib
# 'ipaddress' module.

# Four dot-separated runs of decimal digits, e.g. "#.#.#.#"
IPv4_STYLE_HOSTNAME = re.compile("^" + r"\.".join(["[0-9]+"] * 4) + "$")
# Anything enclosed within square brackets, e.g. "[...]"
IPv6_STYLE_HOSTNAME = re.compile(r"^\[" r".*" r"\]$")

class ParseResult(typing.NamedTuple):
    """
    The individual components of a parsed URL.

    Instances are produced by `urlparse()`. The `authority` and `netloc`
    properties, and the string form of the URL, are all derived from
    these fields.
    """

    scheme: str
    userinfo: str
    # Bracketed as "[host]" when rendered, if it contains a ':' character.
    host: str
    # `None` indicates that no port is rendered.
    port: int | None
    path: str
    # `None` indicates no "?" portion is rendered. An empty string still
    # renders the "?" delimiter.
    query: str | None
    # `None` indicates no "#" portion is rendered. An empty string still
    # renders the "#" delimiter.
    fragment: str | None

    @property
    def authority(self) -> str:
        """
        The "{userinfo}@{host}:{port}" portion of the URL, with the
        userinfo and port sections omitted when they are not present.
        """
        prefix = f"{self.userinfo}@" if self.userinfo else ""
        return prefix + self.netloc

    @property
    def netloc(self) -> str:
        """
        The "{host}:{port}" portion of the URL, with the port section
        omitted when there is no port.
        """
        host = self.host
        if ":" in host:
            # Hosts containing a colon are enclosed in square brackets.
            host = f"[{host}]"
        if self.port is None:
            return host
        return f"{host}:{self.port}"

    def copy_with(self, **kwargs: str | None) -> ParseResult:
        """
        Return a new `ParseResult` with the given components replaced.

        The current scheme, authority, path, query and fragment are used as
        defaults, and the result is passed back through `urlparse()`.
        When no keyword arguments are given, the instance itself is returned.
        """
        if not kwargs:
            return self

        components = {
            "scheme": self.scheme,
            "authority": self.authority,
            "path": self.path,
            "query": self.query,
            "fragment": self.fragment,
            **kwargs,
        }
        return urlparse("", **components)

    def __str__(self) -> str:
        """
        Render the components back into a complete URL string.
        """
        pieces = []
        if self.scheme:
            pieces.append(f"{self.scheme}:")

        authority = self.authority
        if authority:
            pieces.append(f"//{authority}")

        pieces.append(self.path)

        # Empty-but-present query and fragment components keep their delimiter.
        if self.query is not None:
            pieces.append(f"?{self.query}")
        if self.fragment is not None:
            pieces.append(f"#{self.fragment}")

        return "".join(pieces)

149

150

def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse, validate and normalize a URL, returning a `ParseResult`.

    * `url`: The URL string to parse. May be left empty when the URL is
      described entirely by keyword arguments.
    * `kwargs`: Individual URL components, which take precedence over the
      corresponding portion of `url`. The directly supported components are
      "scheme", "authority", "path", "query", "fragment", "userinfo", "host"
      and "port". The following are also accepted, and are rewritten in terms
      of the components above:
        - "netloc" is split into "host" and "port".
        - "username" and/or "password" are quoted and combined as "userinfo".
        - "raw_path" is split into "path" and "query".

    Raises `InvalidURL` if the URL or a component is too long, includes a
    non-printable ASCII character, does not match the regex for that
    component, or if the host, port or path fail validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid.
    if any(char.isascii() and not char.isprintable() for char in url):
        raise InvalidURL("Invalid non-printable ASCII character in URL")

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        if netloc.startswith("[") and "]" in netloc:
            # A bracketed IPv6 literal such as "[::1]:8080" contains ':'
            # characters of its own, so only a ':' that follows the closing
            # ']' can delimit the port.
            literal, _, remainder = netloc.partition("]")
            if remainder.startswith(":"):
                remainder = remainder[1:]
            kwargs["host"], kwargs["port"] = f"{literal}]", remainder
        else:
            kwargs["host"], _, kwargs["port"] = netloc.partition(":")

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "")
        password = quote(kwargs.pop("password", "") or "")
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No "?" present, so the query is absent rather than empty.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is not None:
            if len(value) > MAX_URL_LENGTH:
                raise InvalidURL(f"URL component '{key}' too long")

            # If a component includes any ASCII control characters including \t, \r, \n,
            # then treat it as invalid.
            if any(char.isascii() and not char.isprintable() for char in value):
                raise InvalidURL(
                    f"Invalid non-printable ASCII character in URL component '{key}'"
                )

            # Ensure that keyword arguments match as a valid regex.
            if not COMPONENT_REGEX[key].fullmatch(value):
                raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    fragment = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
    parsed_host: str = encode_host(host)
    # The default port is determined by the normalized (lowercased) scheme,
    # so that "HTTP://example.com:80" drops the port just as
    # "http://example.com:80" does.
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_authority:
        path = normalize_path(path)

    # The GEN_DELIMS set is... : / ? # [ ] @
    # These do not need to be percent-quoted unless they serve as delimiters for the
    # specific component.

    # For 'path' we need to drop ? and # from the GEN_DELIMS set.
    parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
    # For 'query' we need to drop '#' from the GEN_DELIMS set.
    parsed_query: str | None = (
        None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
    )
    # For 'fragment' we can include all of the GEN_DELIMS set.
    parsed_fragment: str | None = (
        None if fragment is None else quote(fragment, safe=SUB_DELIMS + ":/?#[]@")
    )

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_fragment,
    )

284

285

def encode_host(host: str) -> str:
    """
    Validate and normalize the host portion of a URL.

    * An empty host is returned as an empty string.
    * IPv4-style hosts like "#.#.#.#" are validated and returned unchanged.
    * IPv6-style hosts like "[...]" are validated and returned without the
      enclosing square brackets.
    * Any other ASCII host is lowercased and percent-quoted.
    * Any other host is lowercased and IDNA encoded.

    Raises `InvalidURL` for an invalid IPv4 address, IPv6 address,
    or IDNA hostname.
    """
    if not host:
        return ""

    # See https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
    # for each of the host forms handled below.

    if IPv4_STYLE_HOSTNAME.match(host):
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        unbracketed = host[1:-1]
        try:
            ipaddress.IPv6Address(unbracketed)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return unbracketed

    lowercased = host.lower()

    if host.isascii():
        # Regular ASCII hostnames.
        #
        # reg-name = *( unreserved / pct-encoded / sub-delims )
        return quote(lowercased, safe=SUB_DELIMS)

    # Anything else is treated as an IDNA hostname.
    try:
        return idna.encode(lowercased).decode("ascii")
    except idna.IDNAError:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")

330

331

def normalize_port(port: str | int | None, scheme: str) -> int | None:
    """
    Convert a port to an integer, omitting it if it is the scheme's default.

    * `port`: The port as a string or integer. `None` or an empty string
      indicate that no port is present.
    * `scheme`: The URL scheme, used to determine the default port.
      Matched case-insensitively.

    Returns `None` if there is no port or if the port is the default for the
    scheme, and the integer port otherwise.

    Raises `InvalidURL` if the port cannot be converted to an integer.
    """
    # From https://tools.ietf.org/html/rfc3986#section-3.2.3
    #
    # "A scheme may define a default port.  For example, the "http" scheme
    # defines a default port of "80", corresponding to its reserved TCP
    # port number.  The type of port designated by the port number (e.g.,
    # TCP, UDP, SCTP) is defined by the URI scheme.  URI producers and
    # normalizers should omit the port component and its ":" delimiter if
    # port is empty or if its value would be the same as that of the
    # scheme's default."
    if port is None or port == "":
        return None

    try:
        port_as_int = int(port)
    except ValueError:
        raise InvalidURL(f"Invalid port: {port!r}")

    # See https://url.spec.whatwg.org/#url-miscellaneous
    #
    # Schemes are case-insensitive, so the lookup uses the lowercased form.
    # Otherwise "HTTP" with port 80 would not be recognised as the default.
    default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get(
        scheme.lower()
    )
    if port_as_int == default_port:
        return None
    return port_as_int

357

358

def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Check the path against the rules that depend on whether the URL
    includes a scheme or an authority component.

    Returns `None` if the path is acceptable, and raises `InvalidURL` if not.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    if has_authority:
        # "If a URI contains an authority component, then the path component
        # must either be empty or begin with a slash ("/") character."
        if path != "" and path[0] != "/":
            raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
        return

    # "If a URI does not contain an authority component, then the path cannot
    # begin with two slash characters ("//")."
    if path[:2] == "//":
        raise InvalidURL(
            "URLs with no authority component cannot have a path starting with '//'"
        )

    # With neither an authority nor a scheme, a path with a leading colon
    # (":") is rejected.
    if not has_scheme and path[:1] == ":":
        raise InvalidURL(
            "URLs with no scheme component cannot have a path starting with ':'"
        )

384

385

def normalize_path(path: str) -> str:
    """
    Remove "." and ".." segments from a URL path.

    A "." segment is discarded. A ".." segment discards the segment before
    it, except that the leading empty segment of an absolute path is
    always kept.

    For example:

    normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: list[str] = []
    for segment in path.split("/"):
        if segment == ".":
            continue
        if segment != "..":
            kept.append(segment)
        elif kept and kept != [""]:
            # Step back up one level, but never past the root.
            kept.pop()
    return "/".join(kept)

406

407

def percent_encode(char: str) -> str:
    """
    Return the percent-encoded representation of a single character.

    Each byte of the character's UTF-8 encoding is written as '%XX', using
    uppercase hex digits. Characters outside the ASCII range therefore
    produce more than one '%XX' sequence.

    For example:

        percent_encode(" ") == "%20"
    """
    return "".join(f"%{octet:02X}" for octet in char.encode("utf-8"))

420

421

def is_safe(string: str, safe: str = "/") -> bool:
    """
    Return `True` if the string can be used as-is, without any quoting.

    That is the case when every character is either an unreserved
    character, one of the `safe` characters, or '%'.
    """
    allowed = UNRESERVED_CHARACTERS + safe + "%"
    return all(char in allowed for char in string)

434

435

def percent_encoded(string: str, safe: str = "/") -> str:
    """
    Quote a string using percent-encoding.

    A string that `is_safe()` accepts is returned unchanged. Otherwise every
    character that is neither unreserved nor in `safe` is percent-encoded.
    Note that this includes any '%' characters.
    """
    if is_safe(string, safe=safe):
        return string

    keep = UNRESERVED_CHARACTERS + safe
    encoded = []
    for char in string:
        encoded.append(char if char in keep else percent_encode(char))
    return "".join(encoded)

447

448

def quote(string: str, safe: str = "/") -> str:
    """
    Percent-encode a string, leaving any existing '%xx' escape sequences intact.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: A string containing characters that may be treated as safe, and do not
        need to be escaped. Unreserved characters are always treated as safe.
        See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    chunks = []
    cursor = 0
    for escape in PERCENT_ENCODED_REGEX.finditer(string):
        # Quote the plain text that sits in front of this '%xx' sequence.
        if escape.start() > cursor:
            chunks.append(percent_encoded(string[cursor : escape.start()], safe=safe))

        # The '%xx' sequence itself is carried over untouched.
        chunks.append(escape.group(0))
        cursor = escape.end()

    # Quote whatever follows the final '%xx' sequence.
    if cursor < len(string):
        chunks.append(percent_encoded(string[cursor:], safe=safe))

    return "".join(chunks)

480

481

def urlencode(items: list[tuple[str, str]]) -> str:
    """
    Encode a list of (key, value) string pairs as a URL query string.

    This is a much simpler version of the stdlib urlencode, because we don't
    need to handle a bunch of different typing cases, such as bytes vs str.

    https://github.com/python/cpython/blob/b2f7b2ef0b5421e01efb8c7bee2ef95d3bab77eb/Lib/urllib/parse.py#L926

    Keys and values are percent-encoded with no additional safe characters,
    so spaces are encoded as '%20' and '/' as '%2F'. This is slightly
    different than `requests`, but is the behaviour that browsers use.

    See
    - https://github.com/encode/httpx/issues/2536
    - https://github.com/encode/httpx/issues/2721
    - https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
    """

    def encode(text: str) -> str:
        return percent_encoded(text, safe="")

    return "&".join(f"{encode(key)}={encode(value)}" for key, value in items)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/httpx/_urlparse.py: 22%

178 statements