Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/w3lib/url.py: 53%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

244 statements  

1""" 

2This module contains general purpose URL functions not found in the standard 

3library. 

4""" 

5 

6from __future__ import annotations 

7 

8import base64 

9import codecs 

10import os 

11import posixpath 

12import re 

13import string 

14from pathlib import Path 

15from typing import TYPE_CHECKING, NamedTuple, cast, overload 

16from urllib.parse import ( # type: ignore[attr-defined] 

17 ParseResult, 

18 _coerce_args, 

19 parse_qs, 

20 parse_qsl, 

21 quote, 

22 unquote, 

23 unquote_to_bytes, 

24 urldefrag, 

25 urlencode, 

26 urlparse, 

27 urlsplit, 

28 urlunparse, 

29 urlunsplit, 

30) 

31from urllib.request import pathname2url, url2pathname 

32 

33from ._infra import _ASCII_TAB_OR_NEWLINE, _C0_CONTROL_OR_SPACE 

34from ._url import _SPECIAL_SCHEMES 

35from .util import to_unicode 

36 

37if TYPE_CHECKING: 

38 from collections.abc import Callable, Sequence 

39 

40 from ._types import AnyUnicodeError 

41 

42 

# error handling function for bytes-to-Unicode decoding errors with URLs
def _quote_byte(error: UnicodeError) -> tuple[str, int]:
    """Percent-encode the offending byte range and resume decoding after it."""
    err = cast("AnyUnicodeError", error)
    replacement = to_unicode(quote(err.object[err.start : err.end]))
    return replacement, err.end


codecs.register_error("percentencode", _quote_byte)

50 

# constants from RFC 3986, Section 2.2 and 2.3
RFC3986_GEN_DELIMS = b":/?#[]@"
RFC3986_SUB_DELIMS = b"!$&'()*+,;="
RFC3986_RESERVED = RFC3986_GEN_DELIMS + RFC3986_SUB_DELIMS
RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode("ascii")
EXTRA_SAFE_CHARS = b"|"  # see https://github.com/scrapy/w3lib/pull/25

RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":"
_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b"%"
# "#" removed: inside a path it must stay escaped or it starts the fragment.
_path_safe_chars = _safe_chars.replace(b"#", b"")

# Characters that are safe in all of:
#
# - RFC 2396 + RFC 2732, as interpreted by Java 8’s java.net.URI class
# - RFC 3986
# - The URL living standard
#
# NOTE: % is currently excluded from these lists of characters, due to
# limitations of the current safe_url_string implementation, but it should also
# be escaped as %25 when it is not already being used as part of an escape
# character.
_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, delete=b":;=")
_PATH_SAFEST_CHARS = _safe_chars.translate(None, delete=b"#[]|")
_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS
# Used for queries of special schemes (see safe_url_string), which
# additionally escape "'".
_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, delete=b"'")
_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS

77 

78 

# Maps every ASCII tab/newline code point to None for str.translate() removal.
_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE = dict.fromkeys(
    ord(char) for char in _ASCII_TAB_OR_NEWLINE
)


def _strip(url: str) -> str:
    """Trim C0 controls and spaces from both ends of *url* and remove all
    ASCII tabs and newlines from its interior."""
    trimmed = url.strip(_C0_CONTROL_OR_SPACE)
    return trimmed.translate(_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE)

88 

89 

def safe_url_string(  # pylint: disable=too-many-locals
    url: str | bytes,
    encoding: str = "utf8",
    path_encoding: str = "utf8",
    quote_path: bool = True,
) -> str:
    """Return a URL equivalent to *url* that a wide range of web browsers and
    web servers consider valid.

    *url* is parsed according to the rules of the `URL living standard`_,
    and during serialization additional characters are percent-encoded to make
    the URL valid by additional URL standards.

    .. _URL living standard: https://url.spec.whatwg.org/

    The returned URL should be valid by *all* of the following URL standards
    known to be enforced by modern-day web browsers and web servers:

    -   `URL living standard`_

    -   `RFC 3986`_

    -   `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI
        class`_.

    .. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html
    .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt
    .. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt
    .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). If quote_path is True (default),
    path_encoding ('utf-8' by default) is used to encode URL path component
    which is then quoted. Otherwise, if quote_path is False, path component
    is not encoded or quoted. Given encoding is used for query string
    or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.
    """
    # urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes (see _quote_byte above)
    decoded = to_unicode(url, encoding=encoding, errors="percentencode")
    parts = urlsplit(_strip(decoded))

    username, password, hostname, port = (
        parts.username,
        parts.password,
        parts.hostname,
        parts.port,
    )
    # The netloc is rebuilt manually as bytes so each sub-component
    # (userinfo, host, port) can be escaped with its own rules.
    netloc_bytes = b""
    if username is not None or password is not None:
        if username is not None:
            # unquote() first so already-escaped input is not double-escaped.
            safe_username = quote(unquote(username), _USERINFO_SAFEST_CHARS)
            netloc_bytes += safe_username.encode(encoding)
        if password is not None:
            netloc_bytes += b":"
            safe_password = quote(unquote(password), _USERINFO_SAFEST_CHARS)
            netloc_bytes += safe_password.encode(encoding)
        netloc_bytes += b"@"
    if hostname is not None:
        if ":" in hostname:
            # IPv6 address: urlsplit() strips the brackets from the hostname,
            # but they are required in the netloc when rebuilding the URL.
            netloc_bytes += f"[{hostname}]".encode("ascii")
        else:
            try:
                netloc_bytes += hostname.encode("idna")
            except UnicodeError:
                # IDNA encoding can fail for too long labels (>63 characters)
                # or missing labels (e.g. http://.example.com)
                netloc_bytes += hostname.encode(encoding)
    if port is not None:
        netloc_bytes += b":"
        netloc_bytes += str(port).encode(encoding)

    netloc = netloc_bytes.decode()

    # default encoding for path component SHOULD be UTF-8
    if quote_path:
        path = quote(parts.path.encode(path_encoding), _PATH_SAFEST_CHARS)
    else:
        path = parts.path

    # Special schemes (http, https, ...) additionally escape "'" in queries.
    if parts.scheme in _SPECIAL_SCHEMES:
        query = quote(parts.query.encode(encoding), _SPECIAL_QUERY_SAFEST_CHARS)
    else:
        query = quote(parts.query.encode(encoding), _QUERY_SAFEST_CHARS)

    return urlunsplit(
        (
            parts.scheme,
            netloc,
            path,
            query,
            quote(parts.fragment.encode(encoding), _FRAGMENT_SAFEST_CHARS),
        )
    )

196 

197 

# Leading run of "../" segments (optionally preceded by "/").
_parent_dirs = re.compile(r"/?(\.\./)+")


def safe_download_url(
    url: str | bytes, encoding: str = "utf8", path_encoding: str = "utf8"
) -> str:
    """Make a url for download. This will call safe_url_string
    and then strip the fragment, if one exists. The path will
    be normalised.

    If the path is outside the document root, it will be changed
    to be within the document root.
    """
    safe_url = safe_url_string(url, encoding, path_encoding)
    scheme, netloc, path, query, _ = urlsplit(safe_url)
    if not path:
        path = "/"
    else:
        # Collapse "." / ".." segments, then drop any leading "../" runs so
        # the path cannot escape the document root.
        path = _parent_dirs.sub("", posixpath.normpath(path))
        # posixpath.normpath() strips a trailing slash; restore it.
        if safe_url.endswith("/") and not path.endswith("/"):
            path += "/"
    return urlunsplit((scheme, netloc, path, query, ""))

220 

221 

def is_url(text: str) -> bool:
    """Return ``True`` if *text* has a ``file``, ``http`` or ``https`` scheme."""
    scheme = text.partition("://")[0]
    return scheme in {"file", "http", "https"}

224 

225 

@overload
def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: None = None,
    keep_blank_values: bool | int = 0,
) -> str | None: ...


@overload
def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: str,
    keep_blank_values: bool | int = 0,
) -> str: ...


def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: str | None = None,
    keep_blank_values: bool | int = 0,
) -> str | None:
    """Return the value of a url parameter, given the url and parameter name

    General case:

    >>> import w3lib.url
    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "id")
    '200'
    >>>

    Return a default value if the parameter is not found:

    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
    'mydefault'
    >>>

    Returns None if `keep_blank_values` not set or 0 (default):

    >>> w3lib.url.url_query_parameter("product.html?id=", "id")
    >>>

    Returns an empty string if `keep_blank_values` set to 1:

    >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1)
    ''
    >>>

    """
    query_string = urlsplit(str(url))[3]
    parsed = parse_qs(query_string, keep_blank_values=bool(keep_blank_values))
    values = parsed.get(parameter)
    if values is None:
        return default
    return values[0]

284 

285 

def url_query_cleaner(
    url: str | bytes,
    parameterlist: str | bytes | Sequence[str | bytes] = (),
    sep: str = "&",
    kvsep: str = "=",
    remove: bool = False,
    unique: bool = True,
    keep_fragments: bool = False,
) -> str:
    """Clean URL arguments leaving only those passed in the parameterlist keeping order

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'

    """
    # A single parameter name may be passed bare; normalize to a list.
    if isinstance(parameterlist, (str, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    url = cast("str", url)
    fragment = cast("str", fragment)
    base, _, query = url.partition("?")
    kept: list[str] = []
    seen: set[str] = set()
    for pair in query.split(sep):
        if not pair:
            continue
        key = pair.partition(kvsep)[0]
        if unique and key in seen:
            continue
        # Keep the pair iff its membership in parameterlist disagrees with
        # `remove` (whitelist when remove=False, blacklist when remove=True).
        if (key in parameterlist) == remove:
            continue
        kept.append(pair)
        seen.add(key)
    result = f"{base}?{sep.join(kept)}" if kept else base
    if keep_fragments and fragment:
        result += "#" + fragment
    return result

350 

351 

352def _add_or_replace_parameters(url: str, params: dict[str, str]) -> str: 

353 parsed = urlsplit(url) 

354 current_args = parse_qsl(parsed.query, keep_blank_values=True) 

355 

356 new_args = [] 

357 seen_params = set() 

358 for name, value in current_args: 

359 if name not in params: 

360 new_args.append((name, value)) 

361 elif name not in seen_params: 

362 new_args.append((name, params[name])) 

363 seen_params.add(name) 

364 

365 not_modified_args = [ 

366 (name, value) for name, value in params.items() if name not in seen_params 

367 ] 

368 new_args += not_modified_args 

369 

370 query = urlencode(new_args) 

371 return urlunsplit(parsed._replace(query=query)) 

372 

373 

def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
    """Add or replace a query parameter in the given url.

    If *name* already appears in the query string its value is replaced,
    otherwise the parameter is appended.

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php', 'arg', 'v')
    'http://www.example.com/index.php?arg=v'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg4', 'v4')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg3', 'v3new')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new'
    >>>

    """
    return _add_or_replace_parameters(url, {name: new_value})

388 

389 

def add_or_replace_parameters(url: str, new_parameters: dict[str, str]) -> str:
    """Add or replace multiple query parameters in the given url.

    Each key of *new_parameters* replaces the value of an existing
    parameter of the same name, or is appended if absent.

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php', {'arg': 'v'})
    'http://www.example.com/index.php?arg=v'
    >>> args = {'arg4': 'v4', 'arg3': 'v3new'}
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', args)
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new&arg4=v4'
    >>>

    """
    return _add_or_replace_parameters(url, new_parameters)

403 

404 

def path_to_file_uri(path: str | os.PathLike[str]) -> str:
    """Convert local filesystem path to legal File URIs as described in:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    # absolute() resolves the path against the current working directory.
    url_path = pathname2url(str(Path(path).absolute()))
    return "file:///" + url_path.lstrip("/")

411 

412 

413def file_uri_to_path(uri: str) -> str: 

414 """Convert File URI to local filesystem path according to: 

415 http://en.wikipedia.org/wiki/File_URI_scheme 

416 """ 

417 uri_path = urlparse(uri).path 

418 return url2pathname(uri_path) 

419 

420 

def any_to_uri(uri_or_path: str) -> str:
    """If given a path name, return its File URI, otherwise return it
    unmodified
    """
    # A Windows drive prefix (e.g. "C:") means this is a path, not a URI.
    if os.path.splitdrive(uri_or_path)[0]:
        return path_to_file_uri(uri_or_path)
    has_scheme = bool(urlparse(uri_or_path).scheme)
    return uri_or_path if has_scheme else path_to_file_uri(uri_or_path)

429 

430 

# ASCII characters (code points 0–126).
_char = set(map(chr, range(127)))

# RFC 2045 token.
_token = r"[{}]+".format(
    re.escape(
        "".join(
            _char
            -
            # Control characters.
            set(map(chr, range(32)))
            -
            # tspecials and space.
            set('()<>@,;:\\"/[]?= ')
        )
    )
)

# RFC 822 quoted-string, without surrounding quotation marks.
_quoted_string = r"(?:[{}]|(?:\\[{}]))*".format(
    re.escape("".join(_char - {'"', "\\", "\r"})), re.escape("".join(_char))
)

# Encode the regular expression strings to make them into bytes, as Python 3
# bytes have no format() method, but bytes must be passed to re.compile() in
# order to make a pattern object that can be used to match on bytes.

# RFC 2397 mediatype.
_mediatype_pattern = re.compile(rf"{_token}/{_token}".encode())
_mediatype_parameter_pattern = re.compile(
    rf';({_token})=(?:({_token})|"({_quoted_string})")'.encode()
)


class ParseDataURIResult(NamedTuple):
    """Named tuple returned by :func:`parse_data_uri`."""

    #: MIME type type and subtype, separated by / (e.g. ``"text/plain"``).
    media_type: str
    #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``).
    media_type_parameters: dict[str, str]
    #: Data, decoded if it was encoded in base64 format.
    data: bytes


def parse_data_uri(uri: str | bytes) -> ParseDataURIResult:
    """Parse a data: URI into :class:`ParseDataURIResult`.

    :param uri: the data URI; a ``str`` is first made URL-safe and
        ASCII-encoded, ``bytes`` are parsed as-is
    :raises ValueError: if *uri* is not a well-formed data URI
    """
    if not isinstance(uri, bytes):
        uri = safe_url_string(uri).encode("ascii")

    try:
        scheme, uri = uri.split(b":", 1)
    except ValueError:
        # "from None": the bare split() error adds no context worth chaining.
        raise ValueError("invalid URI") from None
    if scheme.lower() != b"data":
        raise ValueError("not a data URI")

    # RFC 3986 section 2.1 allows percent encoding to escape characters that
    # would be interpreted as delimiters, implying that actual delimiters
    # should not be percent-encoded.
    # Decoding before parsing will allow malformed URIs with percent-encoded
    # delimiters, but it makes parsing easier and should not affect
    # well-formed URIs, as the delimiters used in this URI scheme are not
    # allowed, percent-encoded or not, in tokens.
    uri = unquote_to_bytes(uri)

    media_type = "text/plain"
    media_type_params = {}

    m = _mediatype_pattern.match(uri)
    if m:
        media_type = m.group().decode()
        uri = uri[m.end() :]
    else:
        # RFC 2397: without an explicit mediatype, charset is US-ASCII.
        media_type_params["charset"] = "US-ASCII"

    # Consume ";attribute=value" mediatype parameters, one per iteration.
    while True:
        m = _mediatype_parameter_pattern.match(uri)
        if not m:
            break
        attribute, value, value_quoted = m.groups()
        if value_quoted:
            # Unescape backslash escapes inside the quoted-string.
            value = re.sub(rb"\\(.)", rb"\1", value_quoted)
        media_type_params[attribute.decode()] = value.decode()
        uri = uri[m.end() :]

    try:
        is_base64, data = uri.split(b",", 1)
    except ValueError:
        raise ValueError("invalid data URI") from None
    if is_base64:
        if is_base64 != b";base64":
            raise ValueError("invalid data URI")
        data = base64.b64decode(data)

    return ParseDataURIResult(media_type, media_type_params, data)

528 

529 

# Explicit public API of this module; ``canonicalize_url`` and its helpers
# are defined further below in the file.
__all__ = [
    "add_or_replace_parameter",
    "add_or_replace_parameters",
    "any_to_uri",
    "canonicalize_url",
    "file_uri_to_path",
    "is_url",
    "parse_data_uri",
    "path_to_file_uri",
    "safe_download_url",
    "safe_url_string",
    "url_query_cleaner",
    "url_query_parameter",
]

544 

545 

def _safe_ParseResult(
    parts: ParseResult, encoding: str = "utf8", path_encoding: str = "utf8"
) -> tuple[str, str, str, str, str, str]:
    """Return the six URL components of *parts* with each one made safe:
    the netloc IDNA-encoded, every other component percent-encoded."""
    try:
        netloc = parts.netloc.encode("idna").decode()
    except UnicodeError:
        # IDNA encoding can fail for too long labels (>63 characters)
        # or missing labels (e.g. http://.example.com); keep netloc as-is.
        netloc = parts.netloc

    safe_path = quote(parts.path.encode(path_encoding), _path_safe_chars)
    safe_params = quote(parts.params.encode(path_encoding), _safe_chars)
    safe_query = quote(parts.query.encode(encoding), _safe_chars)
    safe_fragment = quote(parts.fragment.encode(encoding), _safe_chars)
    return parts.scheme, netloc, safe_path, safe_params, safe_query, safe_fragment

564 

565 

def canonicalize_url(
    url: str | bytes | ParseResult,
    keep_blank_values: bool = True,
    keep_fragments: bool = False,
    encoding: str | None = None,
) -> str:
    r"""Canonicalize the given url by applying the following procedures:

    - make the URL safe
    - sort query arguments, first by key, then by value
    - normalize all spaces (in query arguments) '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless `keep_blank_values` is True)
    - remove fragments (unless `keep_fragments` is True)

    The url passed can be bytes or unicode, while the url returned is
    always a native str (bytes in Python 2, unicode in Python 3).

    >>> import w3lib.url
    >>>
    >>> # sorting query arguments
    >>> w3lib.url.canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50')
    'http://www.example.com/do?a=50&b=2&b=5&c=3'
    >>>
    >>> # UTF-8 conversion + percent-encoding of non-ASCII characters
    >>> w3lib.url.canonicalize_url('http://www.example.com/r\u00e9sum\u00e9')
    'http://www.example.com/r%C3%A9sum%C3%A9'
    >>>

    For more examples, see the tests in `tests/test_url.py`.
    """
    # If supplied `encoding` is not compatible with all characters in `url`,
    # fallback to UTF-8 as safety net.
    # UTF-8 can handle all Unicode characters,
    # so we should be covered regarding URL normalization,
    # if not for proper URL expected by remote website.
    if isinstance(url, str):
        url = _strip(url)
    try:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding=encoding or "utf8"
        )
    except UnicodeEncodeError:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding="utf8"
        )

    # 1. decode query-string as UTF-8 (or keep raw bytes),
    #    sort values,
    #    and percent-encode them back

    # Python's urllib.parse.parse_qsl does not work as wanted
    # for percent-encoded characters that do not match passed encoding,
    # they get lost.
    #
    # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
    # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
    # instead of \xa3 that you get with Python2's parse_qsl)
    #
    # what we want here is to keep raw bytes, and percent encode them
    # so as to preserve whatever encoding what originally used.
    #
    # See https://tools.ietf.org/html/rfc3987#section-6.4:
    #
    # For example, it is possible to have a URI reference of
    # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
    # document name is encoded in iso-8859-1 based on server settings, but
    # where the fragment identifier is encoded in UTF-8 according to
    # [XPointer]. The IRI corresponding to the above URI would be (in XML
    # notation)
    # "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
    # Similar considerations apply to query parts. The functionality of
    # IRIs (namely, to be able to include non-ASCII characters) can only be
    # used if the query part is encoded in UTF-8.
    keyvals = parse_qsl_to_bytes(query, keep_blank_values)

    # Sorting (key, value) byte pairs gives the canonical argument order.
    keyvals.sort()
    query = urlencode(keyvals)

    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
    #    and percent-encode path again (this normalizes to upper-case %XX)
    uqp = _unquotepath(path)
    path = quote(uqp, _path_safe_chars) or "/"

    fragment = "" if not keep_fragments else fragment

    # Apply lowercase to the domain, but not to the userinfo.
    netloc_parts = netloc.split("@")
    netloc_parts[-1] = netloc_parts[-1].lower().rstrip(":")
    netloc = "@".join(netloc_parts)

    # every part should be safe already
    return urlunparse((scheme, netloc, path, params, query, fragment))

659 

660 

661def _unquotepath(path: str) -> bytes: 

662 for reserved in ("2f", "2F", "3f", "3F"): 

663 path = path.replace("%" + reserved, "%25" + reserved.upper()) 

664 

665 # standard lib's unquote() does not work for non-UTF-8 

666 # percent-escaped characters, they get lost. 

667 # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD) 

668 # 

669 # unquote_to_bytes() returns raw bytes instead 

670 return unquote_to_bytes(path) 

671 

672 

def parse_url(
    url: str | bytes | ParseResult, encoding: str | None = None
) -> ParseResult:
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        # Already parsed: pass through unchanged.
        return url
    return urlparse(to_unicode(url, encoding))

682 

683 

def parse_qsl_to_bytes(
    qs: str, keep_blank_values: bool = False
) -> list[tuple[bytes, bytes]]:
    """Parse a query given as a string argument.

    Data are returned as a list of name, value pairs as bytes.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings. A
        true value indicates that blanks should be retained as blank
        strings. The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    """
    # Mirrors Python 3's parse_qsl()
    # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
    # but calls unquote_to_bytes(s) where parse_qsl() calls
    # unquote(s, encoding, errors), so raw (possibly non-UTF-8) bytes
    # are preserved instead of being replaced with U+FFFD.
    coerce_args = cast("Callable[..., tuple[str, Callable[..., bytes]]]", _coerce_args)
    qs, _coerce_result = coerce_args(qs)
    result: list[tuple[bytes, bytes]] = []
    # Pairs may be separated by either "&" or ";".
    pairs = (piece for chunk in qs.split("&") for piece in chunk.split(";"))
    for pair in pairs:
        if not pair:
            continue
        name, eq, value = pair.partition("=")
        if not eq:
            # Control-name with no equal sign.
            if not keep_blank_values:
                continue
            value = ""
        if value or keep_blank_values:
            name_bytes = _coerce_result(unquote_to_bytes(name.replace("+", " ")))
            value_bytes = _coerce_result(unquote_to_bytes(value.replace("+", " ")))
            result.append((name_bytes, value_bytes))
    return result