1""" 

2This module contains general purpose URL functions not found in the standard 

3library. 

4""" 

5import base64 

6import codecs 

7import os 

8import posixpath 

9import re 

10import string 

11from typing import ( 

12 cast, 

13 Callable, 

14 Dict, 

15 List, 

16 NamedTuple, 

17 Optional, 

18 Sequence, 

19 Tuple, 

20 Union, 

21) 

22from urllib.parse import ( 

23 parse_qs, 

24 parse_qsl, 

25 ParseResult, 

26 quote, 

27 unquote_to_bytes, 

28 urldefrag, 

29 urlencode, 

30 urlparse, 

31 urlsplit, 

32 urlunparse, 

33 urlunsplit, 

34 unquote, 

35) 

36from urllib.parse import _coerce_args # type: ignore 

37from urllib.request import pathname2url, url2pathname 

38 

39from .util import to_unicode 

40from ._infra import _ASCII_TAB_OR_NEWLINE, _C0_CONTROL_OR_SPACE 

41from ._types import AnyUnicodeError, StrOrBytes 

42from ._url import _SPECIAL_SCHEMES 

43 

44 

45# error handling function for bytes-to-Unicode decoding errors with URLs 

46def _quote_byte(error: UnicodeError) -> Tuple[str, int]: 

47 error = cast(AnyUnicodeError, error) 

48 return (to_unicode(quote(error.object[error.start : error.end])), error.end) 

49 

50 

51codecs.register_error("percentencode", _quote_byte) 
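
# A minimal illustration of the "percentencode" error handler registered
# above (doctest-style; the expected output is an editor assumption, not part
# of the original module). Bytes that cannot be decoded with the requested
# codec are percent-encoded instead of raising UnicodeDecodeError:
#
#     >>> b"caf\xe9".decode("ascii", errors="percentencode")
#     'caf%E9'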

# constants from RFC 3986, Section 2.2 and 2.3
RFC3986_GEN_DELIMS = b":/?#[]@"
RFC3986_SUB_DELIMS = b"!$&'()*+,;="
RFC3986_RESERVED = RFC3986_GEN_DELIMS + RFC3986_SUB_DELIMS
RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode("ascii")
EXTRA_SAFE_CHARS = b"|"  # see https://github.com/scrapy/w3lib/pull/25

RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":"
_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b"%"
_path_safe_chars = _safe_chars.replace(b"#", b"")

# Characters that are safe in all of:
#
# - RFC 2396 + RFC 2732, as interpreted by Java 8’s java.net.URI class
# - RFC 3986
# - The URL living standard
#
# NOTE: % is currently excluded from these lists of characters, due to
# limitations of the current safe_url_string implementation, but it should
# also be escaped as %25 when it is not already being used as part of an
# escape character.
_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, delete=b":;=")
_PATH_SAFEST_CHARS = _safe_chars.translate(None, delete=b"#[]|")
_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS
_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, delete=b"'")
_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS


_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE = {
    ord(char): None for char in _ASCII_TAB_OR_NEWLINE
}


def _strip(url: str) -> str:
    return url.strip(_C0_CONTROL_OR_SPACE).translate(
        _ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE
    )
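
# Illustrative example of _strip() (assumed output, depending on the
# definition of _C0_CONTROL_OR_SPACE in ._infra): leading and trailing C0
# control characters and spaces are stripped, and ASCII tab/newline are
# deleted anywhere in the URL, as in URL living standard preprocessing:
#
#     >>> _strip("  http://example.com/a\tb\n")
#     'http://example.com/ab'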


def safe_url_string(  # pylint: disable=too-many-locals
    url: StrOrBytes,
    encoding: str = "utf8",
    path_encoding: str = "utf8",
    quote_path: bool = True,
) -> str:
    """Return a URL equivalent to *url* that a wide range of web browsers and
    web servers consider valid.

    *url* is parsed according to the rules of the `URL living standard`_,
    and during serialization additional characters are percent-encoded to make
    the URL valid by additional URL standards.

    .. _URL living standard: https://url.spec.whatwg.org/

    The returned URL should be valid by *all* of the following URL standards
    known to be enforced by modern-day web browsers and web servers:

    - `URL living standard`_

    - `RFC 3986`_

    - `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI
      class`_.

    .. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html
    .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt
    .. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt
    .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). If *quote_path* is True (default),
    *path_encoding* ('utf-8' by default) is used to encode the URL path
    component, which is then quoted. Otherwise, if *quote_path* is False, the
    path component is neither encoded nor quoted. The given encoding is used
    for the query string and form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.
    """
    # urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    # - it is assumed that a raw bytes input comes from a document
    #   encoded with the supplied encoding (or UTF-8 by default)
    # - if the supplied (or default) encoding chokes,
    #   percent-encode the offending bytes
    decoded = to_unicode(url, encoding=encoding, errors="percentencode")
    parts = urlsplit(_strip(decoded))

    username, password, hostname, port = (
        parts.username,
        parts.password,
        parts.hostname,
        parts.port,
    )
    netloc_bytes = b""
    if username is not None or password is not None:
        if username is not None:
            safe_username = quote(unquote(username), _USERINFO_SAFEST_CHARS)
            netloc_bytes += safe_username.encode(encoding)
        if password is not None:
            netloc_bytes += b":"
            safe_password = quote(unquote(password), _USERINFO_SAFEST_CHARS)
            netloc_bytes += safe_password.encode(encoding)
        netloc_bytes += b"@"
    if hostname is not None:
        try:
            netloc_bytes += hostname.encode("idna")
        except UnicodeError:
            # IDNA encoding can fail for too long labels (>63 characters)
            # or missing labels (e.g. http://.example.com)
            netloc_bytes += hostname.encode(encoding)
    if port is not None:
        netloc_bytes += b":"
        netloc_bytes += str(port).encode(encoding)

    netloc = netloc_bytes.decode()

    # default encoding for path component SHOULD be UTF-8
    if quote_path:
        path = quote(parts.path.encode(path_encoding), _PATH_SAFEST_CHARS)
    else:
        path = parts.path

    if parts.scheme in _SPECIAL_SCHEMES:
        query = quote(parts.query.encode(encoding), _SPECIAL_QUERY_SAFEST_CHARS)
    else:
        query = quote(parts.query.encode(encoding), _QUERY_SAFEST_CHARS)

    return urlunsplit(
        (
            parts.scheme,
            netloc,
            path,
            query,
            quote(parts.fragment.encode(encoding), _FRAGMENT_SAFEST_CHARS),
        )
    )
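
# Illustrative usage of safe_url_string() (assumed outputs, doctest-style;
# added by the editor). Non-ASCII characters are UTF-8 encoded and
# percent-escaped, while an already "safe" URL passes through unchanged:
#
#     >>> safe_url_string("http://www.example.com/résumé?q=résumé")
#     'http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9'
#     >>> safe_url_string("http://www.example.com/r%C3%A9sum%C3%A9")
#     'http://www.example.com/r%C3%A9sum%C3%A9'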


_parent_dirs = re.compile(r"/?(\.\./)+")


def safe_download_url(
    url: StrOrBytes, encoding: str = "utf8", path_encoding: str = "utf8"
) -> str:
    """Make a url for download. This will call safe_url_string
    and then strip the fragment, if one exists. The path will
    be normalised.

    If the path is outside the document root, it will be changed
    to be within the document root.
    """
    safe_url = safe_url_string(url, encoding, path_encoding)
    scheme, netloc, path, query, _ = urlsplit(safe_url)
    if path:
        path = _parent_dirs.sub("", posixpath.normpath(path))
        if safe_url.endswith("/") and not path.endswith("/"):
            path += "/"
    else:
        path = "/"
    return urlunsplit((scheme, netloc, path, query, ""))
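
# Illustrative example (assumed output): the fragment is dropped and the path
# is normalised so it cannot escape the document root:
#
#     >>> safe_download_url("http://www.example.org/dir/../file.txt#fragment")
#     'http://www.example.org/file.txt'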


def is_url(text: str) -> bool:
    return text.partition("://")[0] in ("file", "http", "https")


def url_query_parameter(
    url: StrOrBytes,
    parameter: str,
    default: Optional[str] = None,
    keep_blank_values: Union[bool, int] = 0,
) -> Optional[str]:
    """Return the value of a url parameter, given the url and parameter name

    General case:

    >>> import w3lib.url
    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "id")
    '200'
    >>>

    Return a default value if the parameter is not found:

    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
    'mydefault'
    >>>

    Returns None if `keep_blank_values` is not set, or is 0 (default):

    >>> w3lib.url.url_query_parameter("product.html?id=", "id")
    >>>

    Returns an empty string if `keep_blank_values` is set to 1:

    >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1)
    ''
    >>>

    """

    queryparams = parse_qs(
        urlsplit(str(url))[3], keep_blank_values=bool(keep_blank_values)
    )
    if parameter in queryparams:
        return queryparams[parameter][0]
    else:
        return default


def url_query_cleaner(
    url: StrOrBytes,
    parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (),
    sep: str = "&",
    kvsep: str = "=",
    remove: bool = False,
    unique: bool = True,
    keep_fragments: bool = False,
) -> str:
    """Clean URL arguments, leaving only those passed in the parameterlist
    and keeping their order.

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'

    """

    if isinstance(parameterlist, (str, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    url = cast(str, url)
    fragment = cast(str, fragment)
    base, _, query = url.partition("?")
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        if not ksv:
            continue
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    url = "?".join([base, sep.join(querylist)]) if querylist else base
    if keep_fragments and fragment:
        url += "#" + fragment
    return url


def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str:
    parsed = urlsplit(url)
    current_args = parse_qsl(parsed.query, keep_blank_values=True)

    new_args = []
    seen_params = set()
    for name, value in current_args:
        if name not in params:
            new_args.append((name, value))
        elif name not in seen_params:
            new_args.append((name, params[name]))
            seen_params.add(name)

    not_modified_args = [
        (name, value) for name, value in params.items() if name not in seen_params
    ]
    new_args += not_modified_args

    query = urlencode(new_args)
    return urlunsplit(parsed._replace(query=query))
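
# Illustrative example (assumed output): when a parameter occurs several
# times, _add_or_replace_parameters() replaces the first occurrence and drops
# the duplicates:
#
#     >>> _add_or_replace_parameters("http://example.com/?a=1&b=2&a=3", {"a": "new"})
#     'http://example.com/?a=new&b=2'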


def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
    """Add or replace a parameter in a given url

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php', 'arg', 'v')
    'http://www.example.com/index.php?arg=v'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg4', 'v4')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg3', 'v3new')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new'
    >>>

    """
    return _add_or_replace_parameters(url, {name: new_value})


def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str:
    """Add or replace parameters in a given url

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php', {'arg': 'v'})
    'http://www.example.com/index.php?arg=v'
    >>> args = {'arg4': 'v4', 'arg3': 'v3new'}
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', args)
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new&arg4=v4'
    >>>

    """
    return _add_or_replace_parameters(url, new_parameters)


def path_to_file_uri(path: str) -> str:
    """Convert local filesystem path to legal File URIs as described in:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    x = pathname2url(os.path.abspath(path))
    return f"file:///{x.lstrip('/')}"


def file_uri_to_path(uri: str) -> str:
    """Convert File URI to local filesystem path according to:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    uri_path = urlparse(uri).path
    return url2pathname(uri_path)


def any_to_uri(uri_or_path: str) -> str:
    """If given a path name, return its File URI, otherwise return it
    unmodified
    """
    if os.path.splitdrive(uri_or_path)[0]:
        return path_to_file_uri(uri_or_path)
    u = urlparse(uri_or_path)
    return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)
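
# Illustrative round trip on a POSIX system (assumed outputs; results are
# platform-dependent, e.g. Windows drive letters are handled differently):
#
#     >>> path_to_file_uri("/tmp/test.txt")
#     'file:///tmp/test.txt'
#     >>> file_uri_to_path("file:///tmp/test.txt")
#     '/tmp/test.txt'
#     >>> any_to_uri("http://example.com/")  # already a URI, returned as-is
#     'http://example.com/'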


# ASCII characters.
_char = set(map(chr, range(127)))

# RFC 2045 token.
# pylint: disable=consider-using-f-string
_token = r"[{}]+".format(
    re.escape(
        "".join(
            _char
            -
            # Control characters.
            set(map(chr, range(0, 32)))
            -
            # tspecials and space.
            set('()<>@,;:\\"/[]?= ')
        )
    )
)

# RFC 822 quoted-string, without surrounding quotation marks.
# pylint: disable=consider-using-f-string
_quoted_string = r"(?:[{}]|(?:\\[{}]))*".format(
    re.escape("".join(_char - {'"', "\\", "\r"})), re.escape("".join(_char))
)

# Encode the regular expression strings to make them into bytes, as Python 3
# bytes have no format() method, but bytes must be passed to re.compile() in
# order to make a pattern object that can be used to match on bytes.

# RFC 2397 mediatype.
_mediatype_pattern = re.compile(r"{token}/{token}".format(token=_token).encode())
_mediatype_parameter_pattern = re.compile(
    r';({token})=(?:({token})|"({quoted})")'.format(
        token=_token, quoted=_quoted_string
    ).encode()
)
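
# Illustrative matches (assumed outputs): the compiled byte patterns recognise
# an RFC 2397 mediatype and its parameters:
#
#     >>> _mediatype_pattern.match(b"text/plain").group()
#     b'text/plain'
#     >>> _mediatype_parameter_pattern.match(b";charset=utf-8").groups()
#     (b'charset', b'utf-8', None)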


class ParseDataURIResult(NamedTuple):
    """Named tuple returned by :func:`parse_data_uri`."""

    #: MIME type and subtype, separated by / (e.g. ``"text/plain"``).
    media_type: str
    #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``).
    media_type_parameters: Dict[str, str]
    #: Data, decoded if it was encoded in base64 format.
    data: bytes


def parse_data_uri(uri: StrOrBytes) -> ParseDataURIResult:
    """Parse a data: URI into :class:`ParseDataURIResult`."""
    if not isinstance(uri, bytes):
        uri = safe_url_string(uri).encode("ascii")

    try:
        scheme, uri = uri.split(b":", 1)
    except ValueError:
        raise ValueError("invalid URI")
    if scheme.lower() != b"data":
        raise ValueError("not a data URI")

    # RFC 3986 section 2.1 allows percent encoding to escape characters that
    # would be interpreted as delimiters, implying that actual delimiters
    # should not be percent-encoded.
    # Decoding before parsing will allow malformed URIs with percent-encoded
    # delimiters, but it makes parsing easier and should not affect
    # well-formed URIs, as the delimiters used in this URI scheme are not
    # allowed, percent-encoded or not, in tokens.
    uri = unquote_to_bytes(uri)

    media_type = "text/plain"
    media_type_params = {}

    m = _mediatype_pattern.match(uri)
    if m:
        media_type = m.group().decode()
        uri = uri[m.end() :]
    else:
        media_type_params["charset"] = "US-ASCII"

    while True:
        m = _mediatype_parameter_pattern.match(uri)
        if m:
            attribute, value, value_quoted = m.groups()
            if value_quoted:
                value = re.sub(rb"\\(.)", rb"\1", value_quoted)
            media_type_params[attribute.decode()] = value.decode()
            uri = uri[m.end() :]
        else:
            break

    try:
        is_base64, data = uri.split(b",", 1)
    except ValueError:
        raise ValueError("invalid data URI")
    if is_base64:
        if is_base64 != b";base64":
            raise ValueError("invalid data URI")
        data = base64.b64decode(data)

    return ParseDataURIResult(media_type, media_type_params, data)
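
# Illustrative usage (assumed outputs, doctest-style; added by the editor):
#
#     >>> parse_data_uri("data:,A%20brief%20note")
#     ParseDataURIResult(media_type='text/plain', media_type_parameters={'charset': 'US-ASCII'}, data=b'A brief note')
#     >>> parse_data_uri("data:text/plain;base64,SGVsbG8=").data
#     b'Hello'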


__all__ = [
    "add_or_replace_parameter",
    "add_or_replace_parameters",
    "any_to_uri",
    "canonicalize_url",
    "file_uri_to_path",
    "is_url",
    "parse_data_uri",
    "path_to_file_uri",
    "safe_download_url",
    "safe_url_string",
    "url_query_cleaner",
    "url_query_parameter",
]


def _safe_ParseResult(
    parts: ParseResult, encoding: str = "utf8", path_encoding: str = "utf8"
) -> Tuple[str, str, str, str, str, str]:
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode("idna").decode()
    except UnicodeError:
        netloc = parts.netloc

    return (
        parts.scheme,
        netloc,
        quote(parts.path.encode(path_encoding), _path_safe_chars),
        quote(parts.params.encode(path_encoding), _safe_chars),
        quote(parts.query.encode(encoding), _safe_chars),
        quote(parts.fragment.encode(encoding), _safe_chars),
    )


def canonicalize_url(
    url: Union[StrOrBytes, ParseResult],
    keep_blank_values: bool = True,
    keep_fragments: bool = False,
    encoding: Optional[str] = None,
) -> str:
    r"""Canonicalize the given url by applying the following procedures:

    - make the URL safe
    - sort query arguments, first by key, then by value
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless `keep_blank_values` is True)
    - remove fragments (unless `keep_fragments` is True)

    The url passed can be bytes or str; the url returned is always str.

    >>> import w3lib.url
    >>>
    >>> # sorting query arguments
    >>> w3lib.url.canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50')
    'http://www.example.com/do?a=50&b=2&b=5&c=3'
    >>>
    >>> # UTF-8 conversion + percent-encoding of non-ASCII characters
    >>> w3lib.url.canonicalize_url('http://www.example.com/r\u00e9sum\u00e9')
    'http://www.example.com/r%C3%A9sum%C3%A9'
    >>>

    For more examples, see the tests in `tests/test_url.py`.
    """
    # If the supplied `encoding` is not compatible with all characters in
    # `url`, fall back to UTF-8 as a safety net.
    # UTF-8 can handle all Unicode characters,
    # so we should be covered regarding URL normalization,
    # if not necessarily regarding the exact URL the remote website expects.
    if isinstance(url, str):
        url = _strip(url)
    try:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding=encoding or "utf8"
        )
    except UnicodeEncodeError:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding="utf8"
        )

    # 1. decode query-string as UTF-8 (or keep raw bytes),
    #    sort values,
    #    and percent-encode them back

    # Python's urllib.parse.parse_qsl does not work as wanted
    # for percent-encoded characters that do not match the passed encoding;
    # they get lost.
    #
    # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
    # (i.e. with 'REPLACEMENT CHARACTER' (U+FFFD),
    # instead of the \xa3 that you get with Python 2's parse_qsl)
    #
    # what we want here is to keep raw bytes, and percent-encode them,
    # so as to preserve whatever encoding was originally used.
    #
    # See https://tools.ietf.org/html/rfc3987#section-6.4:
    #
    #   For example, it is possible to have a URI reference of
    #   "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
    #   document name is encoded in iso-8859-1 based on server settings, but
    #   where the fragment identifier is encoded in UTF-8 according to
    #   [XPointer]. The IRI corresponding to the above URI would be (in XML
    #   notation)
    #   "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
    #   Similar considerations apply to query parts. The functionality of
    #   IRIs (namely, to be able to include non-ASCII characters) can only be
    #   used if the query part is encoded in UTF-8.
    keyvals = parse_qsl_to_bytes(query, keep_blank_values)

    keyvals.sort()
    query = urlencode(keyvals)

    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
    #    and percent-encode path again (this normalizes to upper-case %XX)
    uqp = _unquotepath(path)
    path = quote(uqp, _path_safe_chars) or "/"

    fragment = "" if not keep_fragments else fragment

    # every part should be safe already
    return urlunparse(
        (scheme, netloc.lower().rstrip(":"), path, params, query, fragment)
    )
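
# Illustrative example (assumed output): unlike parse_qsl-based approaches,
# canonicalize_url() preserves percent-encoded bytes that are not valid
# UTF-8, merely normalizing the escape case, as per the comment above:
#
#     >>> canonicalize_url('http://www.example.com/do?q=b%a3')
#     'http://www.example.com/do?q=b%A3'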


def _unquotepath(path: str) -> bytes:
    for reserved in ("2f", "2F", "3f", "3F"):
        path = path.replace("%" + reserved, "%25" + reserved.upper())

    # standard lib's unquote() does not work for non-UTF-8
    # percent-escaped characters, they get lost.
    # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
    #
    # unquote_to_bytes() returns raw bytes instead
    return unquote_to_bytes(path)
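
# Illustrative example (assumed output): %2F (an escaped slash) survives
# unquoting as a literal "%2F" byte sequence, so re-quoting cannot turn it
# into a real path separator:
#
#     >>> _unquotepath("/a%2Fb%20c")
#     b'/a%2Fb c'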


def parse_url(
    url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = None
) -> ParseResult:
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))


def parse_qsl_to_bytes(
    qs: str, keep_blank_values: bool = False
) -> List[Tuple[bytes, bytes]]:
    """Parse a query given as a string argument.

    Data are returned as a list of name, value pairs as bytes.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings. A
        true value indicates that blanks should be retained as blank
        strings. The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    """
    # This code is the same as Python3's parse_qsl()
    # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
    # except for the unquote(s, encoding, errors) calls replaced
    # with unquote_to_bytes(s)
    coerce_args = cast(Callable[..., Tuple[str, Callable[..., bytes]]], _coerce_args)
    qs, _coerce_result = coerce_args(qs)
    pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")]
    r = []
    for name_value in pairs:
        if not name_value:
            continue
        nv = name_value.split("=", 1)
        if len(nv) != 2:
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append("")
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name: StrOrBytes = nv[0].replace("+", " ")
            name = unquote_to_bytes(name)
            name = _coerce_result(name)
            value: StrOrBytes = nv[1].replace("+", " ")
            value = unquote_to_bytes(value)
            value = _coerce_result(value)
            r.append((name, value))
    return r
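
# Illustrative examples (assumed outputs): values are returned as raw bytes,
# so non-UTF-8 percent-escapes survive, and blank values are dropped unless
# keep_blank_values is true:
#
#     >>> parse_qsl_to_bytes("q=b%a3&empty=")
#     [(b'q', b'b\xa3')]
#     >>> parse_qsl_to_bytes("q=b%a3&empty=", keep_blank_values=True)
#     [(b'q', b'b\xa3'), (b'empty', b'')]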