1""" 

2This module contains general purpose URL functions not found in the standard 

3library. 

4""" 

5 

6from __future__ import annotations 

7 

8import base64 

9import codecs 

10import os 

11import posixpath 

12import re 

13import string 

14from pathlib import Path 

15from typing import TYPE_CHECKING, Callable, NamedTuple, cast, overload 

16from urllib.parse import ( # type: ignore[attr-defined] 

17 ParseResult, 

18 _coerce_args, 

19 parse_qs, 

20 parse_qsl, 

21 quote, 

22 unquote, 

23 unquote_to_bytes, 

24 urldefrag, 

25 urlencode, 

26 urlparse, 

27 urlsplit, 

28 urlunparse, 

29 urlunsplit, 

30) 

31from urllib.request import pathname2url, url2pathname 

32 

33from ._infra import _ASCII_TAB_OR_NEWLINE, _C0_CONTROL_OR_SPACE 

34from ._url import _SPECIAL_SCHEMES 

35from .util import to_unicode 

36 

37if TYPE_CHECKING: 

38 from collections.abc import Sequence 

39 

40 from ._types import AnyUnicodeError 

41 

42 

43# error handling function for bytes-to-Unicode decoding errors with URLs 

44def _quote_byte(error: UnicodeError) -> tuple[str, int]: 

45 error = cast("AnyUnicodeError", error) 

46 return (to_unicode(quote(error.object[error.start : error.end])), error.end) 

47 

48 

49codecs.register_error("percentencode", _quote_byte) 

50 

51# constants from RFC 3986, Section 2.2 and 2.3 

52RFC3986_GEN_DELIMS = b":/?#[]@" 

53RFC3986_SUB_DELIMS = b"!$&'()*+,;=" 

54RFC3986_RESERVED = RFC3986_GEN_DELIMS + RFC3986_SUB_DELIMS 

55RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode("ascii") 

56EXTRA_SAFE_CHARS = b"|" # see https://github.com/scrapy/w3lib/pull/25 

57 

58RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":" 

59_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b"%" 

60_path_safe_chars = _safe_chars.replace(b"#", b"") 

61 

62# Characters that are safe in all of: 

63# 

64# - RFC 2396 + RFC 2732, as interpreted by Java 8’s java.net.URI class 

65# - RFC 3986 

66# - The URL living standard 

67# 

68# NOTE: % is currently excluded from these lists of characters, due to 

69# limitations of the current safe_url_string implementation, but it should also 

70# be escaped as %25 when it is not already being used as part of an escape 

71# character. 

72_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, delete=b":;=") 

73_PATH_SAFEST_CHARS = _safe_chars.translate(None, delete=b"#[]|") 

74_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS 

75_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, delete=b"'") 

76_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS 

77 

78 

79_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE = { 

80 ord(char): None for char in _ASCII_TAB_OR_NEWLINE 

81} 

82 

83 

84def _strip(url: str) -> str: 

85 return url.strip(_C0_CONTROL_OR_SPACE).translate( 

86 _ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE 

87 ) 

88 
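
# Illustrative sketch (an addition, not part of the original module):
# _strip() implements the URL living standard's pre-parsing cleanup,
# stripping leading/trailing C0 controls and spaces and deleting ASCII
# tab/newline characters anywhere in the URL. For example:
#
#     assert _strip("  http://exam\tple.com/\n  ") == "http://example.com/"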



def safe_url_string(  # pylint: disable=too-many-locals
    url: str | bytes,
    encoding: str = "utf8",
    path_encoding: str = "utf8",
    quote_path: bool = True,
) -> str:
    """Return a URL equivalent to *url* that a wide range of web browsers and
    web servers consider valid.

    *url* is parsed according to the rules of the `URL living standard`_,
    and during serialization additional characters are percent-encoded to make
    the URL valid by additional URL standards.

    .. _URL living standard: https://url.spec.whatwg.org/

    The returned URL should be valid by *all* of the following URL standards
    known to be enforced by modern-day web browsers and web servers:

    - `URL living standard`_

    - `RFC 3986`_

    - `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI
      class`_.

    .. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html
    .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt
    .. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt
    .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). If *quote_path* is ``True``
    (default), the URL path component is encoded with *path_encoding*
    ('utf-8' by default) and then quoted; otherwise, the path component is
    neither encoded nor quoted. The given encoding is used for the query
    string or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.
    """
    # urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    # - it is assumed that a raw bytes input comes from a document
    #   encoded with the supplied encoding (or UTF-8 by default)
    # - if the supplied (or default) encoding chokes,
    #   percent-encode offending bytes
    decoded = to_unicode(url, encoding=encoding, errors="percentencode")
    parts = urlsplit(_strip(decoded))

    username, password, hostname, port = (
        parts.username,
        parts.password,
        parts.hostname,
        parts.port,
    )
    netloc_bytes = b""
    if username is not None or password is not None:
        if username is not None:
            safe_username = quote(unquote(username), _USERINFO_SAFEST_CHARS)
            netloc_bytes += safe_username.encode(encoding)
        if password is not None:
            netloc_bytes += b":"
            safe_password = quote(unquote(password), _USERINFO_SAFEST_CHARS)
            netloc_bytes += safe_password.encode(encoding)
        netloc_bytes += b"@"
    if hostname is not None:
        try:
            netloc_bytes += hostname.encode("idna")
        except UnicodeError:
            # IDNA encoding can fail for too long labels (>63 characters)
            # or missing labels (e.g. http://.example.com)
            netloc_bytes += hostname.encode(encoding)
    if port is not None:
        netloc_bytes += b":"
        netloc_bytes += str(port).encode(encoding)

    netloc = netloc_bytes.decode()

    # default encoding for path component SHOULD be UTF-8
    if quote_path:
        path = quote(parts.path.encode(path_encoding), _PATH_SAFEST_CHARS)
    else:
        path = parts.path

    if parts.scheme in _SPECIAL_SCHEMES:
        query = quote(parts.query.encode(encoding), _SPECIAL_QUERY_SAFEST_CHARS)
    else:
        query = quote(parts.query.encode(encoding), _QUERY_SAFEST_CHARS)

    return urlunsplit(
        (
            parts.scheme,
            netloc,
            path,
            query,
            quote(parts.fragment.encode(encoding), _FRAGMENT_SAFEST_CHARS),
        )
    )
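
# Illustrative usage (an addition, not part of the original module):
# non-ASCII path characters are percent-encoded using path_encoding, while
# an already-safe URL is returned unchanged:
#
#     >>> safe_url_string("http://www.example.com/£")
#     'http://www.example.com/%C2%A3'
#     >>> safe_url_string("http://www.example.com/%C2%A3")
#     'http://www.example.com/%C2%A3'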



_parent_dirs = re.compile(r"/?(\.\./)+")


def safe_download_url(
    url: str | bytes, encoding: str = "utf8", path_encoding: str = "utf8"
) -> str:
    """Make a url for download. This will call safe_url_string
    and then strip the fragment, if one exists. The path will
    be normalised.

    If the path is outside the document root, it will be changed
    to be within the document root.
    """
    safe_url = safe_url_string(url, encoding, path_encoding)
    scheme, netloc, path, query, _ = urlsplit(safe_url)
    if path:
        path = _parent_dirs.sub("", posixpath.normpath(path))
        if safe_url.endswith("/") and not path.endswith("/"):
            path += "/"
    else:
        path = "/"
    return urlunsplit((scheme, netloc, path, query, ""))
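
# Illustrative usage (an addition, not part of the original module): parent
# directory references are collapsed so the path cannot escape the document
# root, and any fragment is dropped:
#
#     >>> safe_download_url('http://www.example.org/../eggs.html#frag')
#     'http://www.example.org/eggs.html'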



def is_url(text: str) -> bool:
    return text.partition("://")[0] in ("file", "http", "https")
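
# Illustrative usage (an addition, not part of the original module): only
# the file, http and https schemes are recognized:
#
#     >>> is_url("http://example.com/")
#     True
#     >>> is_url("ftp://example.com/")
#     False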



@overload
def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: None = None,
    keep_blank_values: bool | int = 0,
) -> str | None: ...


@overload
def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: str,
    keep_blank_values: bool | int = 0,
) -> str: ...


def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: str | None = None,
    keep_blank_values: bool | int = 0,
) -> str | None:
    """Return the value of a url parameter, given the url and parameter name.

    General case:

    >>> import w3lib.url
    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "id")
    '200'
    >>>

    Return a default value if the parameter is not found:

    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
    'mydefault'
    >>>

    Returns None if `keep_blank_values` is not set or is 0 (the default):

    >>> w3lib.url.url_query_parameter("product.html?id=", "id")
    >>>

    Returns an empty string if `keep_blank_values` is set to 1:

    >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1)
    ''
    >>>

    """

    queryparams = parse_qs(
        urlsplit(str(url))[3], keep_blank_values=bool(keep_blank_values)
    )
    if parameter in queryparams:
        return queryparams[parameter][0]
    return default


def url_query_cleaner(
    url: str | bytes,
    parameterlist: str | bytes | Sequence[str | bytes] = (),
    sep: str = "&",
    kvsep: str = "=",
    remove: bool = False,
    unique: bool = True,
    keep_fragments: bool = False,
) -> str:
    """Clean URL arguments, leaving only those passed in ``parameterlist``
    and keeping their order.

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys:

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**:

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'

    """

    if isinstance(parameterlist, (str, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    url = cast("str", url)
    fragment = cast("str", fragment)
    base, _, query = url.partition("?")
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        if not ksv:
            continue
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        if remove and k in parameterlist:
            continue
        if not remove and k not in parameterlist:
            continue
        querylist.append(ksv)
        seen.add(k)
    url = "?".join([base, sep.join(querylist)]) if querylist else base
    if keep_fragments and fragment:
        url += "#" + fragment
    return url


def _add_or_replace_parameters(url: str, params: dict[str, str]) -> str:
    parsed = urlsplit(url)
    current_args = parse_qsl(parsed.query, keep_blank_values=True)

    new_args = []
    seen_params = set()
    for name, value in current_args:
        if name not in params:
            new_args.append((name, value))
        elif name not in seen_params:
            new_args.append((name, params[name]))
            seen_params.add(name)

    not_modified_args = [
        (name, value) for name, value in params.items() if name not in seen_params
    ]
    new_args += not_modified_args

    query = urlencode(new_args)
    return urlunsplit(parsed._replace(query=query))
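
# Illustrative sketch of the helper's semantics (an addition, not part of
# the original module): existing parameters keep their position, the first
# occurrence of a replaced parameter takes the new value, later duplicates
# of it are dropped, and genuinely new parameters are appended:
#
#     >>> _add_or_replace_parameters("http://example.com/?a=1&b=2&a=3", {"a": "9", "c": "4"})
#     'http://example.com/?a=9&b=2&c=4'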



def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
    """Add or replace a parameter in the given url

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php', 'arg', 'v')
    'http://www.example.com/index.php?arg=v'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg4', 'v4')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg3', 'v3new')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new'
    >>>

    """
    return _add_or_replace_parameters(url, {name: new_value})


def add_or_replace_parameters(url: str, new_parameters: dict[str, str]) -> str:
    """Add or replace parameters in the given url

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php', {'arg': 'v'})
    'http://www.example.com/index.php?arg=v'
    >>> args = {'arg4': 'v4', 'arg3': 'v3new'}
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', args)
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new&arg4=v4'
    >>>

    """
    return _add_or_replace_parameters(url, new_parameters)


def path_to_file_uri(path: str | os.PathLike[str]) -> str:
    """Convert a local filesystem path to a legal file URI as described in:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    x = pathname2url(str(Path(path).absolute()))
    return f"file:///{x.lstrip('/')}"


def file_uri_to_path(uri: str) -> str:
    """Convert a file URI to a local filesystem path according to:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    uri_path = urlparse(uri).path
    return url2pathname(uri_path)


def any_to_uri(uri_or_path: str) -> str:
    """If given a path name, return its file URI; otherwise return it
    unmodified.
    """
    if os.path.splitdrive(uri_or_path)[0]:
        return path_to_file_uri(uri_or_path)
    u = urlparse(uri_or_path)
    return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)
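
# Illustrative usage (an addition, not part of the original module; POSIX
# paths shown, results differ on Windows): path_to_file_uri() and
# file_uri_to_path() round-trip a local path, and any_to_uri() leaves URLs
# that already have a scheme untouched:
#
#     >>> path_to_file_uri("/tmp/test.txt")
#     'file:///tmp/test.txt'
#     >>> file_uri_to_path("file:///tmp/test.txt")
#     '/tmp/test.txt'
#     >>> any_to_uri("http://example.com/")
#     'http://example.com/'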



# ASCII characters.
_char = set(map(chr, range(127)))

# RFC 2045 token.
_token = r"[{}]+".format(
    re.escape(
        "".join(
            _char
            -
            # Control characters.
            set(map(chr, range(32)))
            -
            # tspecials and space.
            set('()<>@,;:\\"/[]?= ')
        )
    )
)

# RFC 822 quoted-string, without surrounding quotation marks.
_quoted_string = r"(?:[{}]|(?:\\[{}]))*".format(
    re.escape("".join(_char - {'"', "\\", "\r"})), re.escape("".join(_char))
)

# Encode the regular expression strings to make them into bytes, as Python 3
# bytes have no format() method, but bytes must be passed to re.compile() in
# order to make a pattern object that can be used to match on bytes.

# RFC 2397 mediatype.
_mediatype_pattern = re.compile(rf"{_token}/{_token}".encode())
_mediatype_parameter_pattern = re.compile(
    rf';({_token})=(?:({_token})|"({_quoted_string})")'.encode()
)


class ParseDataURIResult(NamedTuple):
    """Named tuple returned by :func:`parse_data_uri`."""

    #: MIME type type and subtype, separated by / (e.g. ``"text/plain"``).
    media_type: str
    #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``).
    media_type_parameters: dict[str, str]
    #: Data, decoded if it was encoded in base64 format.
    data: bytes


def parse_data_uri(uri: str | bytes) -> ParseDataURIResult:
    """Parse a data: URI into :class:`ParseDataURIResult`."""
    if not isinstance(uri, bytes):
        uri = safe_url_string(uri).encode("ascii")

    try:
        scheme, uri = uri.split(b":", 1)
    except ValueError:
        raise ValueError("invalid URI")
    if scheme.lower() != b"data":
        raise ValueError("not a data URI")

    # RFC 3986 section 2.1 allows percent encoding to escape characters that
    # would be interpreted as delimiters, implying that actual delimiters
    # should not be percent-encoded.
    # Decoding before parsing will allow malformed URIs with percent-encoded
    # delimiters, but it makes parsing easier and should not affect
    # well-formed URIs, as the delimiters used in this URI scheme are not
    # allowed, percent-encoded or not, in tokens.
    uri = unquote_to_bytes(uri)

    media_type = "text/plain"
    media_type_params = {}

    m = _mediatype_pattern.match(uri)
    if m:
        media_type = m.group().decode()
        uri = uri[m.end() :]
    else:
        media_type_params["charset"] = "US-ASCII"

    while True:
        m = _mediatype_parameter_pattern.match(uri)
        if m:
            attribute, value, value_quoted = m.groups()
            if value_quoted:
                value = re.sub(rb"\\(.)", rb"\1", value_quoted)
            media_type_params[attribute.decode()] = value.decode()
            uri = uri[m.end() :]
        else:
            break

    try:
        is_base64, data = uri.split(b",", 1)
    except ValueError:
        raise ValueError("invalid data URI")
    if is_base64:
        if is_base64 != b";base64":
            raise ValueError("invalid data URI")
        data = base64.b64decode(data)

    return ParseDataURIResult(media_type, media_type_params, data)
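
# Illustrative usage (an addition, not part of the original module): a data
# URI without a media type defaults to text/plain;charset=US-ASCII, and
# ";base64" payloads are decoded:
#
#     >>> parse_data_uri("data:,A%20brief%20note")
#     ParseDataURIResult(media_type='text/plain', media_type_parameters={'charset': 'US-ASCII'}, data=b'A brief note')
#     >>> parse_data_uri("data:text/plain;base64,SGVsbG8=").data
#     b'Hello'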



__all__ = [
    "add_or_replace_parameter",
    "add_or_replace_parameters",
    "any_to_uri",
    "canonicalize_url",
    "file_uri_to_path",
    "is_url",
    "parse_data_uri",
    "path_to_file_uri",
    "safe_download_url",
    "safe_url_string",
    "url_query_cleaner",
    "url_query_parameter",
]


def _safe_ParseResult(
    parts: ParseResult, encoding: str = "utf8", path_encoding: str = "utf8"
) -> tuple[str, str, str, str, str, str]:
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode("idna").decode()
    except UnicodeError:
        netloc = parts.netloc

    return (
        parts.scheme,
        netloc,
        quote(parts.path.encode(path_encoding), _path_safe_chars),
        quote(parts.params.encode(path_encoding), _safe_chars),
        quote(parts.query.encode(encoding), _safe_chars),
        quote(parts.fragment.encode(encoding), _safe_chars),
    )


def canonicalize_url(
    url: str | bytes | ParseResult,
    keep_blank_values: bool = True,
    keep_fragments: bool = False,
    encoding: str | None = None,
) -> str:
    r"""Canonicalize the given url by applying the following procedures:

    - make the URL safe
    - sort query arguments, first by key, then by value
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless `keep_blank_values`
      is True)
    - remove fragments (unless `keep_fragments` is True)

    The url passed can be bytes or unicode, while the url returned is
    always a native str.

    >>> import w3lib.url
    >>>
    >>> # sorting query arguments
    >>> w3lib.url.canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50')
    'http://www.example.com/do?a=50&b=2&b=5&c=3'
    >>>
    >>> # UTF-8 conversion + percent-encoding of non-ASCII characters
    >>> w3lib.url.canonicalize_url('http://www.example.com/r\u00e9sum\u00e9')
    'http://www.example.com/r%C3%A9sum%C3%A9'
    >>>

    For more examples, see the tests in `tests/test_url.py`.
    """
    # If the supplied `encoding` is not compatible with all characters in
    # `url`, fall back to UTF-8 as a safety net.
    # UTF-8 can handle all Unicode characters,
    # so we should be covered regarding URL normalization,
    # if not for the exact URL expected by the remote website.
    if isinstance(url, str):
        url = _strip(url)
    try:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding=encoding or "utf8"
        )
    except UnicodeEncodeError:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding="utf8"
        )

    # 1. decode query-string as UTF-8 (or keep raw bytes),
    #    sort values,
    #    and percent-encode them back

    # Python's urllib.parse.parse_qsl does not work as wanted
    # for percent-encoded characters that do not match the passed encoding;
    # they get lost.
    #
    # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
    # (i.e. with 'REPLACEMENT CHARACTER' (U+FFFD),
    # instead of the \xa3 that you get with Python 2's parse_qsl)
    #
    # What we want here is to keep raw bytes, and percent-encode them,
    # so as to preserve whatever encoding was originally used.
    #
    # See https://tools.ietf.org/html/rfc3987#section-6.4:
    #
    #   For example, it is possible to have a URI reference of
    #   "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
    #   document name is encoded in iso-8859-1 based on server settings, but
    #   where the fragment identifier is encoded in UTF-8 according to
    #   [XPointer]. The IRI corresponding to the above URI would be (in XML
    #   notation)
    #   "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
    #   Similar considerations apply to query parts. The functionality of
    #   IRIs (namely, to be able to include non-ASCII characters) can only be
    #   used if the query part is encoded in UTF-8.
    keyvals = parse_qsl_to_bytes(query, keep_blank_values)

    keyvals.sort()
    query = urlencode(keyvals)

    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
    #    and percent-encode path again (this normalizes to upper-case %XX)
    uqp = _unquotepath(path)
    path = quote(uqp, _path_safe_chars) or "/"

    fragment = "" if not keep_fragments else fragment

    # Apply lowercase to the domain, but not to the userinfo.
    netloc_parts = netloc.split("@")
    netloc_parts[-1] = netloc_parts[-1].lower().rstrip(":")
    netloc = "@".join(netloc_parts)

    # every part should be safe already
    return urlunparse((scheme, netloc, path, params, query, fragment))
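
# Illustrative usage (an addition, not part of the original module):
# non-UTF-8 percent-encoded octets in the query are kept as raw bytes and
# only their hex case is normalized, instead of being mangled into U+FFFD:
#
#     >>> canonicalize_url('http://www.example.com/do?q=b%a3')
#     'http://www.example.com/do?q=b%A3'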



def _unquotepath(path: str) -> bytes:
    for reserved in ("2f", "2F", "3f", "3F"):
        path = path.replace("%" + reserved, "%25" + reserved.upper())

    # The standard library's unquote() does not work for non-UTF-8
    # percent-escaped characters; they get lost.
    # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
    #
    # unquote_to_bytes() returns raw bytes instead
    return unquote_to_bytes(path)
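
# Illustrative usage (an addition, not part of the original module):
# percent-encoded slashes and question marks stay encoded, so unquoting
# does not change the structure of the URL path:
#
#     >>> _unquotepath('a%2Fb%20c')
#     b'a%2Fb c'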



def parse_url(
    url: str | bytes | ParseResult, encoding: str | None = None
) -> ParseResult:
    """Return a urlparsed url from the given argument (which could be an
    already parsed url).
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))


def parse_qsl_to_bytes(
    qs: str, keep_blank_values: bool = False
) -> list[tuple[bytes, bytes]]:
    """Parse a query given as a string argument.

    Data are returned as a list of name, value pairs as bytes.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings. A
        true value indicates that blanks should be retained as blank
        strings. The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    """
    # This code is the same as Python 3's parse_qsl()
    # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
    # except that the unquote(s, encoding, errors) calls are replaced
    # with unquote_to_bytes(s)
    coerce_args = cast("Callable[..., tuple[str, Callable[..., bytes]]]", _coerce_args)
    qs, _coerce_result = coerce_args(qs)
    pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")]
    r = []
    for name_value in pairs:
        if not name_value:
            continue
        nv = name_value.split("=", 1)
        if len(nv) != 2:
            # Handle the case of a control-name with no equal sign
            if keep_blank_values:
                nv.append("")
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name: str | bytes = nv[0].replace("+", " ")
            name = unquote_to_bytes(name)
            name = _coerce_result(name)
            value: str | bytes = nv[1].replace("+", " ")
            value = unquote_to_bytes(value)
            value = _coerce_result(value)
            r.append((name, value))
    return r
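
# Illustrative usage (an addition, not part of the original module): values
# come back as raw bytes, so percent-encoded octets that are not valid
# UTF-8 survive intact, and blank values are dropped unless requested:
#
#     >>> parse_qsl_to_bytes('q=b%a3&empty=')
#     [(b'q', b'b\xa3')]
#     >>> parse_qsl_to_bytes('q=b%a3&empty=', keep_blank_values=True)
#     [(b'q', b'b\xa3'), (b'empty', b'')]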