Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/httpx/

1"""

2An implementation of `urlparse` that provides URL validation and normalization

3as described by RFC3986.

5We rely on this implementation rather than the one in Python's stdlib, because:

7* It provides more complete URL validation.

8* It properly differentiates between an empty querystring and an absent querystring,

9 to distinguish URLs with a trailing '?'.

10* It handles scheme, hostname, port, and path normalization.

11* It supports IDNA hostnames, normalizing them to their encoded form.

12* The API supports passing individual components, as well as the complete URL string.

14Previously we relied on the excellent `rfc3986` package to handle URL parsing and

15validation, but this module provides a simpler alternative, with less indirection

16required.

17"""

18import ipaddress

19import re

20import typing

22import idna

24from ._exceptions import InvalidURL

# Upper bound on the length of a URL string, and of any individual
# component passed to `urlparse` as a keyword argument.
MAX_URL_LENGTH = 65536

# Characters that never need to be percent-encoded.
# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._~"
)
# The RFC 3986 "sub-delims" set, passed as part of `safe` when quoting components.
SUB_DELIMS = "!$&'()*+,;="

# Matches one well-formed '%xx' escape sequence.
PERCENT_ENCODED_REGEX = re.compile(r"%[A-Fa-f0-9]{2}")

# A URL is split into the following parts, in this order:
#
#   {scheme}:       (optional)
#   //{authority}   (optional)
#   {path}
#   ?{query}        (optional)
#   #{fragment}     (optional)
URL_REGEX = re.compile(
    "".join(
        [
            r"(?:(?P<scheme>{scheme}):)?",
            r"(?://(?P<authority>{authority}))?",
            r"(?P<path>{path})",
            r"(?:\?(?P<query>{query}))?",
            r"(?:#(?P<fragment>{fragment}))?",
        ]
    ).format(
        scheme="([a-zA-Z][a-zA-Z0-9+.-]*)?",
        authority="[^/?#]*",
        path="[^?#]*",
        query="[^#]*",
        fragment=".*",
    )
)

# The authority component is split into the following parts, in this order:
#
#   {userinfo}@   (optional)
#   {host}
#   :{port}       (optional)
AUTHORITY_REGEX = re.compile(
    "".join(
        [
            r"(?:(?P<userinfo>{userinfo})@)?",
            r"(?P<host>{host})",
            r":?(?P<port>{port})?",
        ]
    ).format(
        # Any character sequence not including '@'.
        userinfo="[^@]*",
        # Either an IPv6 address enclosed within square brackets,
        # or any character sequence not including ':'.
        host=r"(\[.*\]|[^:]*)",
        # Any character sequence.
        port=".*",
    )
)

# When `urlparse` is called with individual components as keyword arguments,
# each component has to be validated on its own against its regex.
# These are deliberately the same pattern strings used to build
# URL_REGEX and AUTHORITY_REGEX.
COMPONENT_REGEX = {
    component: re.compile(pattern)
    for component, pattern in (
        ("scheme", "([a-zA-Z][a-zA-Z0-9+.-]*)?"),
        ("authority", "[^/?#]*"),
        ("path", "[^?#]*"),
        ("query", "[^#]*"),
        ("fragment", ".*"),
        ("userinfo", "[^@]*"),
        ("host", "(\\[.*\\]|[^:]*)"),
        ("port", ".*"),
    )
}

# We use these simple regexes as a first pass before handing off to
# the stdlib 'ipaddress' module for IP address validation.
#
# The dots in the IPv4 pattern are escaped so that only hostnames shaped like
# '#.#.#.#' are routed to IPv4 validation. An unescaped '.' matches any
# character, which would misclassify valid registered names such as
# '1-2-3-4' or '1234567' as malformed IPv4 addresses.
IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$")
# Anything enclosed in square brackets is treated as an IPv6-style literal.
IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")

class ParseResult(typing.NamedTuple):
    """
    The parsed components of a URL, as returned by `urlparse`.

    `query` and `fragment` are `None` when absent, which is distinct from
    the empty string (a trailing '?' or '#'). `port` is `None` when absent.
    """

    scheme: str
    userinfo: str
    host: str
    port: typing.Optional[int]
    path: str
    query: typing.Optional[str]
    fragment: typing.Optional[str]

    @property
    def authority(self) -> str:
        """The '[userinfo@]host[:port]' portion of the URL."""
        prefix = f"{self.userinfo}@" if self.userinfo else ""
        return prefix + self.netloc

    @property
    def netloc(self) -> str:
        """The 'host[:port]' portion of the URL."""
        # A host containing ':' is rendered inside square brackets.
        host = f"[{self.host}]" if ":" in self.host else self.host
        if self.port is None:
            return host
        return f"{host}:{self.port}"

    def copy_with(self, **kwargs: typing.Optional[str]) -> "ParseResult":
        """
        Return a new `ParseResult` with the given components replaced.

        With no keyword arguments the instance itself is returned. Otherwise
        the current components are merged with the overrides and passed back
        through `urlparse`.
        """
        if not kwargs:
            return self

        components = {
            "scheme": self.scheme,
            "authority": self.authority,
            "path": self.path,
            "query": self.query,
            "fragment": self.fragment,
        }
        return urlparse("", **{**components, **kwargs})

    def __str__(self) -> str:
        authority = self.authority
        pieces = []
        if self.scheme:
            pieces.append(f"{self.scheme}:")
        if authority:
            pieces.append(f"//{authority}")
        pieces.append(self.path)
        # An empty-string query or fragment still emits its delimiter.
        if self.query is not None:
            pieces.append(f"?{self.query}")
        if self.fragment is not None:
            pieces.append(f"#{self.fragment}")
        return "".join(pieces)

147

148

def urlparse(url: str = "", **kwargs: typing.Optional[str]) -> ParseResult:
    """
    Parse, validate and normalize a URL, returning a `ParseResult`.

    The URL may be given as a complete string, as individual components
    passed as keyword arguments, or both, in which case the keyword
    arguments override the corresponding parts of `url`.

    Recognized keyword arguments are "scheme", "authority", "path", "query",
    "fragment", "userinfo", "host" and "port", plus the convenience forms
    "netloc" (host and port), "username" / "password" (userinfo) and
    "raw_path" (path and query).

    Raises `InvalidURL` if the URL or any component is too long, contains
    non-printable ASCII characters, or fails validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid.
    if any(char.isascii() and not char.isprintable() for char in url):
        raise InvalidURL("Invalid non-printable ASCII character in URL")

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        closing_bracket = netloc.find("]")
        if netloc.startswith("[") and closing_bracket != -1:
            # A bracketed IPv6 literal contains ':' characters itself, so the
            # host ends at the closing bracket rather than at the first ':'.
            kwargs["host"] = netloc[: closing_bracket + 1]
            remainder = netloc[closing_bracket + 1 :]
            kwargs["port"] = remainder[1:] if remainder.startswith(":") else remainder
        else:
            kwargs["host"], _, kwargs["port"] = netloc.partition(":")

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "")
        password = quote(kwargs.pop("password", "") or "")
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No '?' at all means an absent query, not an empty one.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is not None:
            if len(value) > MAX_URL_LENGTH:
                raise InvalidURL(f"URL component '{key}' too long")

            # If a component includes any ASCII control characters including \t, \r, \n,
            # then treat it as invalid.
            if any(char.isascii() and not char.isprintable() for char in value):
                raise InvalidURL(
                    f"Invalid non-printable ASCII character in URL component '{key}'"
                )

            # Ensure that keyword arguments match as a valid regex.
            if not COMPONENT_REGEX[key].fullmatch(value):
                raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    fragment = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
    parsed_host: str = encode_host(host)
    parsed_port: typing.Optional[int] = normalize_port(port, scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_authority:
        path = normalize_path(path)

    # The GEN_DELIMS set is... : / ? # [ ] @
    # These do not need to be percent-quoted unless they serve as delimiters for the
    # specific component.

    # For 'path' we need to drop ? and # from the GEN_DELIMS set.
    parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
    # For 'query' we need to drop '#' from the GEN_DELIMS set.
    parsed_query: typing.Optional[str] = (
        None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
    )
    # For 'fragment' we can include all of the GEN_DELIMS set.
    parsed_fragment: typing.Optional[str] = (
        None if fragment is None else quote(fragment, safe=SUB_DELIMS + ":/?#[]@")
    )

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_fragment,
    )

282

283

def encode_host(host: str) -> str:
    """
    Validate and normalize the host portion of a URL.

    * An empty host is returned as the empty string.
    * IPv4-style hosts like #.#.#.# are validated and returned unchanged.
    * IPv6-style hosts like [...] are validated and returned without
      the enclosing square brackets.
    * Other ASCII hosts are lowercased and percent-quoted.
    * Non-ASCII hosts are lowercased and IDNA encoded.

    Raises `InvalidURL` if IP address validation or IDNA encoding fails.
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        unbracketed = host[1:-1]
        try:
            ipaddress.IPv6Address(unbracketed)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return unbracketed

    if host.isascii():
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # reg-name = *( unreserved / pct-encoded / sub-delims )
        return quote(host.lower(), safe=SUB_DELIMS)

    # Anything left is a non-ASCII (IDNA) hostname.
    try:
        return idna.encode(host.lower()).decode("ascii")
    except idna.IDNAError:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")

328

329

def normalize_port(
    port: typing.Optional[typing.Union[str, int]], scheme: str
) -> typing.Optional[int]:
    """
    Convert a port to an integer, dropping it if it is the scheme's default.

    Returns `None` when `port` is `None`, the empty string, or equal to the
    default port for `scheme`. The scheme is compared case-insensitively, so
    that "HTTPS" with port 443 normalizes the same way as "https".

    Raises `InvalidURL` if `port` cannot be converted to an integer.
    """
    # From https://tools.ietf.org/html/rfc3986#section-3.2.3
    #
    # "A scheme may define a default port.  For example, the "http" scheme
    # defines a default port of "80", corresponding to its reserved TCP
    # port number.  The type of port designated by the port number (e.g.,
    # TCP, UDP, SCTP) is defined by the URI scheme.  URI producers and
    # normalizers should omit the port component and its ":" delimiter if
    # port is empty or if its value would be the same as that of the
    # scheme's default."
    if port is None or port == "":
        return None

    try:
        port_as_int = int(port)
    except ValueError:
        raise InvalidURL(f"Invalid port: {port!r}")

    # See https://url.spec.whatwg.org/#url-miscellaneous
    #
    # Schemes are case-insensitive, and callers may pass the scheme exactly
    # as it appeared in the URL, so lowercase it before the lookup.
    default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get(
        scheme.lower()
    )
    if port_as_int == default_port:
        return None
    return port_as_int

357

358

def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Apply the path rules that depend on whether the URL has a scheme or
    an authority component. Raises `InvalidURL` if a rule is violated.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    if has_authority:
        # > If a URI contains an authority component, then the path component
        # > must either be empty or begin with a slash ("/") character."
        if path != "" and path[0] != "/":
            raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
        return

    # > If a URI does not contain an authority component, then the path cannot begin
    # > with two slash characters ("//").
    if path.startswith("//"):
        raise InvalidURL(
            "URLs with no authority component cannot have a path starting with '//'"
        )

    # > In addition, a URI reference (Section 4.1) may be a relative-path reference, in which
    # > case the first path segment cannot contain a colon (":") character.
    if not has_scheme and path.startswith(":"):
        raise InvalidURL(
            "URLs with no scheme component cannot have a path starting with ':'"
        )

383

384

def normalize_path(path: str) -> str:
    """
    Remove "." and ".." segments from a URL path.

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: typing.List[str] = []
    for segment in path.split("/"):
        if segment == ".":
            continue
        if segment != "..":
            kept.append(segment)
        elif kept and kept != [""]:
            # Step up one level, but never above the leading root segment.
            kept.pop()
    return "/".join(kept)

405

406

def percent_encode(char: str) -> str:
    """
    Return the percent-encoded representation of a single character.

    Each byte of the character's UTF-8 encoding is written as an
    uppercase '%XX' escape, so non-ASCII characters produce several escapes.

    For example:

        percent_encode(" ") == "%20"
    """
    utf8_bytes = char.encode("utf-8")
    return "".join(f"%{byte:02X}" for byte in utf8_bytes)

419

420

def is_safe(string: str, safe: str = "/") -> bool:
    """
    Return True if the string needs no quoting at all.

    That is the case when it consists only of unreserved characters,
    characters in `safe`, and well-formed '%xx' escape sequences.
    """
    allowed = UNRESERVED_CHARACTERS + safe + "%"

    # Any character outside the allowed set means quoting is required.
    if any(char not in allowed for char in string):
        return False

    # Every '%' must be the start of a valid '%xx' escape sequence.
    valid_escapes = PERCENT_ENCODED_REGEX.findall(string)
    return len(valid_escapes) == string.count("%")

434

435

def quote(string: str, safe: str = "/") -> str:
    """
    Use percent-encoding to quote a string if required.

    Unreserved characters and characters in `safe` are left as they are.
    Existing well-formed '%xx' escape sequences are preserved, not
    encoded a second time. Everything else, including a stray '%' that does
    not start a valid escape, is percent-encoded.

    For example:

        quote("a b%20c") == "a%20b%20c"
    """
    # Fast path: the string is already fully quoted.
    if is_safe(string, safe=safe):
        return string

    NON_ESCAPED_CHARS = UNRESERVED_CHARACTERS + safe

    parts: typing.List[str] = []
    position = 0
    for match in PERCENT_ENCODED_REGEX.finditer(string):
        # Quote the text leading up to the '%xx' escape sequence...
        parts.extend(
            char if char in NON_ESCAPED_CHARS else percent_encode(char)
            for char in string[position : match.start()]
        )
        # ...and keep the escape sequence itself untouched, so that
        # already-encoded input is not double-encoded ("%20" -> "%2520").
        parts.append(match.group(0))
        position = match.end()

    # Quote any text after the final '%xx' escape sequence.
    parts.extend(
        char if char in NON_ESCAPED_CHARS else percent_encode(char)
        for char in string[position:]
    )
    return "".join(parts)

447

448

def urlencode(items: typing.List[typing.Tuple[str, str]]) -> str:
    """
    Encode a list of (key, value) string pairs as a query string.

    Each key and value is passed through `quote`, the pair is joined
    with '=', and the pairs are joined with '&'.
    """
    # We can use a much simpler version of the stdlib urlencode here because
    # we don't need to handle a bunch of different typing cases, such as bytes vs str.
    #
    # https://github.com/python/cpython/blob/b2f7b2ef0b5421e01efb8c7bee2ef95d3bab77eb/Lib/urllib/parse.py#L926
    #
    # Note that we use '%20' encoding for spaces, and treat '/' as a safe
    # character. This means our query params have the same escaping as other
    # characters in the URL path. This is slightly different to `requests`,
    # but is the behaviour that browsers use.
    #
    # See https://github.com/encode/httpx/issues/2536 and
    # https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
    encoded_pairs = (f"{quote(key)}={quote(value)}" for key, value in items)
    return "&".join(encoded_pairs)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/httpx/_urlparse.py: 80%

162 statements