1""" 

2This module contains general purpose URL functions not found in the standard 

3library. 

4""" 

5 

6import base64 

7import codecs 

8import os 

9import posixpath 

10import re 

11import string 

12from typing import ( 

13 Callable, 

14 Dict, 

15 List, 

16 NamedTuple, 

17 Optional, 

18 Sequence, 

19 Tuple, 

20 Union, 

21 cast, 

22 overload, 

23) 

24from urllib.parse import _coerce_args # type: ignore 

25from urllib.parse import ( 

26 ParseResult, 

27 parse_qs, 

28 parse_qsl, 

29 quote, 

30 unquote, 

31 unquote_to_bytes, 

32 urldefrag, 

33 urlencode, 

34 urlparse, 

35 urlsplit, 

36 urlunparse, 

37 urlunsplit, 

38) 

39from urllib.request import pathname2url, url2pathname 

40 

41from ._infra import _ASCII_TAB_OR_NEWLINE, _C0_CONTROL_OR_SPACE 

42from ._types import AnyUnicodeError, StrOrBytes 

43from ._url import _SPECIAL_SCHEMES 

44from .util import to_unicode 



# error handling function for bytes-to-Unicode decoding errors with URLs
def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
    error = cast(AnyUnicodeError, error)
    return (to_unicode(quote(error.object[error.start : error.end])), error.end)


codecs.register_error("percentencode", _quote_byte)
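
# A minimal illustration (not part of the module) of the "percentencode"
# error handler registered above: bytes that fail to decode are replaced
# with their percent-encoded form instead of raising.
#
#     >>> b"price: \xa3 100".decode("utf-8", errors="percentencode")
#     'price: %A3 100'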


# constants from RFC 3986, Sections 2.2 and 2.3
RFC3986_GEN_DELIMS = b":/?#[]@"
RFC3986_SUB_DELIMS = b"!$&'()*+,;="
RFC3986_RESERVED = RFC3986_GEN_DELIMS + RFC3986_SUB_DELIMS
RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode("ascii")
EXTRA_SAFE_CHARS = b"|"  # see https://github.com/scrapy/w3lib/pull/25

RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":"
_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b"%"
_path_safe_chars = _safe_chars.replace(b"#", b"")

# Characters that are safe in all of:
#
# - RFC 2396 + RFC 2732, as interpreted by Java 8’s java.net.URI class
# - RFC 3986
# - The URL living standard
#
# NOTE: % is currently excluded from these lists of characters, due to
# limitations of the current safe_url_string implementation, but it should
# also be escaped as %25 when it is not already being used as part of an
# escape character.
_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, delete=b":;=")
_PATH_SAFEST_CHARS = _safe_chars.translate(None, delete=b"#[]|")
_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS
_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, delete=b"'")
_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS
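
# A minimal illustration (not part of the module): bytes.translate() with
# the delete argument removes the listed bytes from the safe set, so quote()
# will percent-encode them.
#
#     >>> quote("a|b#c", _PATH_SAFEST_CHARS)
#     'a%7Cb%23c'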


_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE = {
    ord(char): None for char in _ASCII_TAB_OR_NEWLINE
}


def _strip(url: str) -> str:
    return url.strip(_C0_CONTROL_OR_SPACE).translate(
        _ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE
    )
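
# A minimal illustration (not part of the module): _strip() removes leading
# and trailing C0 controls and spaces, and deletes ASCII tabs and newlines
# anywhere in the URL, as the URL living standard requires.
#
#     >>> _strip("  http://exam\tple.com/a\nb ")
#     'http://example.com/ab'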


def safe_url_string(  # pylint: disable=too-many-locals
    url: StrOrBytes,
    encoding: str = "utf8",
    path_encoding: str = "utf8",
    quote_path: bool = True,
) -> str:
    """Return a URL equivalent to *url* that a wide range of web browsers and
    web servers consider valid.

    *url* is parsed according to the rules of the `URL living standard`_,
    and during serialization additional characters are percent-encoded to
    make the URL valid under additional URL standards.

    .. _URL living standard: https://url.spec.whatwg.org/

    The returned URL should be valid by *all* of the following URL standards
    known to be enforced by modern-day web browsers and web servers:

    - `URL living standard`_

    - `RFC 3986`_

    - `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI
      class`_.

    .. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html
    .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt
    .. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt
    .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). If quote_path is True (the
    default), path_encoding ('utf-8' by default) is used to encode the URL
    path component, which is then quoted. Otherwise, if quote_path is False,
    the path component is neither encoded nor quoted. The given encoding is
    used for the query string or form data.

    When passing an encoding, you should use the encoding of the original
    page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.
    """
    # urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    # - it is assumed that a raw bytes input comes from a document
    #   encoded with the supplied encoding (or UTF8 by default)
    # - if the supplied (or default) encoding chokes,
    #   percent-encode offending bytes
    decoded = to_unicode(url, encoding=encoding, errors="percentencode")
    parts = urlsplit(_strip(decoded))

    username, password, hostname, port = (
        parts.username,
        parts.password,
        parts.hostname,
        parts.port,
    )
    netloc_bytes = b""
    if username is not None or password is not None:
        if username is not None:
            safe_username = quote(unquote(username), _USERINFO_SAFEST_CHARS)
            netloc_bytes += safe_username.encode(encoding)
        if password is not None:
            netloc_bytes += b":"
            safe_password = quote(unquote(password), _USERINFO_SAFEST_CHARS)
            netloc_bytes += safe_password.encode(encoding)
        netloc_bytes += b"@"
    if hostname is not None:
        try:
            netloc_bytes += hostname.encode("idna")
        except UnicodeError:
            # IDNA encoding can fail for too long labels (>63 characters)
            # or missing labels (e.g. http://.example.com)
            netloc_bytes += hostname.encode(encoding)
    if port is not None:
        netloc_bytes += b":"
        netloc_bytes += str(port).encode(encoding)

    netloc = netloc_bytes.decode()

    # default encoding for path component SHOULD be UTF-8
    if quote_path:
        path = quote(parts.path.encode(path_encoding), _PATH_SAFEST_CHARS)
    else:
        path = parts.path

    if parts.scheme in _SPECIAL_SCHEMES:
        query = quote(parts.query.encode(encoding), _SPECIAL_QUERY_SAFEST_CHARS)
    else:
        query = quote(parts.query.encode(encoding), _QUERY_SAFEST_CHARS)

    return urlunsplit(
        (
            parts.scheme,
            netloc,
            path,
            query,
            quote(parts.fragment.encode(encoding), _FRAGMENT_SAFEST_CHARS),
        )
    )
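
# A minimal illustration (not part of the module): non-ASCII characters in
# the path are UTF-8 percent-encoded and the hostname is IDNA-encoded.
#
#     >>> safe_url_string("http://www.example.com/£")
#     'http://www.example.com/%C2%A3'
#     >>> safe_url_string("https://münchen.example/")
#     'https://xn--mnchen-3ya.example/'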


_parent_dirs = re.compile(r"/?(\.\./)+")


def safe_download_url(
    url: StrOrBytes, encoding: str = "utf8", path_encoding: str = "utf8"
) -> str:
    """Make a URL for download. This will call safe_url_string
    and then strip the fragment, if one exists. The path will
    be normalised.

    If the path is outside the document root, it will be changed
    to be within the document root.
    """
    safe_url = safe_url_string(url, encoding, path_encoding)
    scheme, netloc, path, query, _ = urlsplit(safe_url)
    if path:
        path = _parent_dirs.sub("", posixpath.normpath(path))
        if safe_url.endswith("/") and not path.endswith("/"):
            path += "/"
    else:
        path = "/"
    return urlunsplit((scheme, netloc, path, query, ""))
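
# A minimal illustration (not part of the module): the fragment is stripped
# and parent-directory segments cannot escape the document root.
#
#     >>> safe_download_url("http://example.org/../etc/passwd#frag")
#     'http://example.org/etc/passwd'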


def is_url(text: str) -> bool:
    return text.partition("://")[0] in ("file", "http", "https")
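
# A minimal illustration (not part of the module):
#
#     >>> is_url("http://example.com"), is_url("example.com")
#     (True, False)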


@overload
def url_query_parameter(
    url: StrOrBytes,
    parameter: str,
    default: None = None,
    keep_blank_values: Union[bool, int] = 0,
) -> Optional[str]: ...


@overload
def url_query_parameter(
    url: StrOrBytes,
    parameter: str,
    default: str,
    keep_blank_values: Union[bool, int] = 0,
) -> str: ...


def url_query_parameter(
    url: StrOrBytes,
    parameter: str,
    default: Optional[str] = None,
    keep_blank_values: Union[bool, int] = 0,
) -> Optional[str]:
    """Return the value of a URL parameter, given the URL and the parameter
    name.

    General case:

    >>> import w3lib.url
    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "id")
    '200'
    >>>

    Return a default value if the parameter is not found:

    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
    'mydefault'
    >>>

    Returns None if `keep_blank_values` is not set, or is 0 (the default):

    >>> w3lib.url.url_query_parameter("product.html?id=", "id")
    >>>

    Returns an empty string if `keep_blank_values` is set to 1:

    >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1)
    ''
    >>>

    """

    queryparams = parse_qs(
        urlsplit(str(url))[3], keep_blank_values=bool(keep_blank_values)
    )
    if parameter in queryparams:
        return queryparams[parameter][0]
    else:
        return default


def url_query_cleaner(
    url: StrOrBytes,
    parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (),
    sep: str = "&",
    kvsep: str = "=",
    remove: bool = False,
    unique: bool = True,
    keep_fragments: bool = False,
) -> str:
    """Clean URL arguments, keeping only those passed in `parameterlist` and
    preserving their order.

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys:

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**:

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'

    """

    if isinstance(parameterlist, (str, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    url = cast(str, url)
    fragment = cast(str, fragment)
    base, _, query = url.partition("?")
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        if not ksv:
            continue
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    url = "?".join([base, sep.join(querylist)]) if querylist else base
    if keep_fragments and fragment:
        url += "#" + fragment
    return url


def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str:
    parsed = urlsplit(url)
    current_args = parse_qsl(parsed.query, keep_blank_values=True)

    new_args = []
    seen_params = set()
    for name, value in current_args:
        if name not in params:
            new_args.append((name, value))
        elif name not in seen_params:
            new_args.append((name, params[name]))
            seen_params.add(name)

    not_modified_args = [
        (name, value) for name, value in params.items() if name not in seen_params
    ]
    new_args += not_modified_args

    query = urlencode(new_args)
    return urlunsplit(parsed._replace(query=query))
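
# A minimal illustration (not part of the module): when a parameter occurs
# multiple times, the first occurrence is replaced and the duplicates are
# dropped.
#
#     >>> _add_or_replace_parameters("http://example.com/?id=1&id=2&x=3", {"id": "9"})
#     'http://example.com/?id=9&x=3'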


def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
    """Add or replace a parameter in a given URL.

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php', 'arg', 'v')
    'http://www.example.com/index.php?arg=v'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg4', 'v4')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg3', 'v3new')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new'
    >>>

    """
    return _add_or_replace_parameters(url, {name: new_value})


def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str:
    """Add or replace parameters in a given URL.

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php', {'arg': 'v'})
    'http://www.example.com/index.php?arg=v'
    >>> args = {'arg4': 'v4', 'arg3': 'v3new'}
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', args)
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new&arg4=v4'
    >>>

    """
    return _add_or_replace_parameters(url, new_parameters)


def path_to_file_uri(path: str) -> str:
    """Convert a local filesystem path to a legal file URI as described in:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    x = pathname2url(os.path.abspath(path))
    return f"file:///{x.lstrip('/')}"


def file_uri_to_path(uri: str) -> str:
    """Convert a file URI to a local filesystem path according to:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    uri_path = urlparse(uri).path
    return url2pathname(uri_path)


def any_to_uri(uri_or_path: str) -> str:
    """If given a path name, return its file URI, otherwise return it
    unmodified.
    """
    if os.path.splitdrive(uri_or_path)[0]:
        return path_to_file_uri(uri_or_path)
    u = urlparse(uri_or_path)
    return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)
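
# A minimal illustration (not part of the module); output shown for a POSIX
# filesystem, where pathname2url() percent-encodes spaces:
#
#     >>> path_to_file_uri("/tmp/some file.txt")
#     'file:///tmp/some%20file.txt'
#     >>> file_uri_to_path("file:///tmp/some%20file.txt")
#     '/tmp/some file.txt'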


# ASCII characters.
_char = set(map(chr, range(127)))

# RFC 2045 token.
# pylint: disable=consider-using-f-string
_token = r"[{}]+".format(
    re.escape(
        "".join(
            _char
            # Control characters.
            - set(map(chr, range(0, 32)))
            # tspecials and space.
            - set('()<>@,;:\\"/[]?= ')
        )
    )
)

# RFC 822 quoted-string, without surrounding quotation marks.
# pylint: disable=consider-using-f-string
_quoted_string = r"(?:[{}]|(?:\\[{}]))*".format(
    re.escape("".join(_char - {'"', "\\", "\r"})), re.escape("".join(_char))
)

# Encode the regular expression strings to make them into bytes, as Python 3
# bytes have no format() method, but bytes must be passed to re.compile() in
# order to make a pattern object that can be used to match on bytes.

# RFC 2397 mediatype.
_mediatype_pattern = re.compile(r"{token}/{token}".format(token=_token).encode())
_mediatype_parameter_pattern = re.compile(
    r';({token})=(?:({token})|"({quoted})")'.format(
        token=_token, quoted=_quoted_string
    ).encode()
)


class ParseDataURIResult(NamedTuple):
    """Named tuple returned by :func:`parse_data_uri`."""

    #: MIME type and subtype, separated by / (e.g. ``"text/plain"``).
    media_type: str
    #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``).
    media_type_parameters: Dict[str, str]
    #: Data, decoded if it was encoded in base64 format.
    data: bytes


def parse_data_uri(uri: StrOrBytes) -> ParseDataURIResult:
    """Parse a data: URI into :class:`ParseDataURIResult`."""
    if not isinstance(uri, bytes):
        uri = safe_url_string(uri).encode("ascii")

    try:
        scheme, uri = uri.split(b":", 1)
    except ValueError:
        raise ValueError("invalid URI")
    if scheme.lower() != b"data":
        raise ValueError("not a data URI")

    # RFC 3986 section 2.1 allows percent encoding to escape characters that
    # would be interpreted as delimiters, implying that actual delimiters
    # should not be percent-encoded.
    # Decoding before parsing will allow malformed URIs with percent-encoded
    # delimiters, but it makes parsing easier and should not affect
    # well-formed URIs, as the delimiters used in this URI scheme are not
    # allowed, percent-encoded or not, in tokens.
    uri = unquote_to_bytes(uri)

    media_type = "text/plain"
    media_type_params = {}

    m = _mediatype_pattern.match(uri)
    if m:
        media_type = m.group().decode()
        uri = uri[m.end() :]
    else:
        media_type_params["charset"] = "US-ASCII"

    while True:
        m = _mediatype_parameter_pattern.match(uri)
        if m:
            attribute, value, value_quoted = m.groups()
            if value_quoted:
                value = re.sub(rb"\\(.)", rb"\1", value_quoted)
            media_type_params[attribute.decode()] = value.decode()
            uri = uri[m.end() :]
        else:
            break

    try:
        is_base64, data = uri.split(b",", 1)
    except ValueError:
        raise ValueError("invalid data URI")
    if is_base64:
        if is_base64 != b";base64":
            raise ValueError("invalid data URI")
        data = base64.b64decode(data)

    return ParseDataURIResult(media_type, media_type_params, data)
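
# A minimal illustration (not part of the module):
#
#     >>> parse_data_uri("data:text/plain;base64,SGVsbG8=")
#     ParseDataURIResult(media_type='text/plain', media_type_parameters={}, data=b'Hello')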


__all__ = [
    "add_or_replace_parameter",
    "add_or_replace_parameters",
    "any_to_uri",
    "canonicalize_url",
    "file_uri_to_path",
    "is_url",
    "parse_data_uri",
    "path_to_file_uri",
    "safe_download_url",
    "safe_url_string",
    "url_query_cleaner",
    "url_query_parameter",
]


def _safe_ParseResult(
    parts: ParseResult, encoding: str = "utf8", path_encoding: str = "utf8"
) -> Tuple[str, str, str, str, str, str]:
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode("idna").decode()
    except UnicodeError:
        netloc = parts.netloc

    return (
        parts.scheme,
        netloc,
        quote(parts.path.encode(path_encoding), _path_safe_chars),
        quote(parts.params.encode(path_encoding), _safe_chars),
        quote(parts.query.encode(encoding), _safe_chars),
        quote(parts.fragment.encode(encoding), _safe_chars),
    )


def canonicalize_url(
    url: Union[StrOrBytes, ParseResult],
    keep_blank_values: bool = True,
    keep_fragments: bool = False,
    encoding: Optional[str] = None,
) -> str:
    r"""Canonicalize the given url by applying the following procedures:

    - make the URL safe
    - sort query arguments, first by key, then by value
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless `keep_blank_values` is True)
    - remove fragments (unless `keep_fragments` is True)

    The url passed can be bytes or str, while the url returned is
    always a native str.

    >>> import w3lib.url
    >>>
    >>> # sorting query arguments
    >>> w3lib.url.canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50')
    'http://www.example.com/do?a=50&b=2&b=5&c=3'
    >>>
    >>> # UTF-8 conversion + percent-encoding of non-ASCII characters
    >>> w3lib.url.canonicalize_url('http://www.example.com/r\u00e9sum\u00e9')
    'http://www.example.com/r%C3%A9sum%C3%A9'
    >>>

    For more examples, see the tests in `tests/test_url.py`.
    """
    # If the supplied `encoding` is not compatible with all characters in
    # `url`, fall back to UTF-8 as a safety net.
    # UTF-8 can handle all Unicode characters,
    # so we should be covered regarding URL normalization,
    # if not for the proper URL expected by the remote website.
    if isinstance(url, str):
        url = _strip(url)
    try:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding=encoding or "utf8"
        )
    except UnicodeEncodeError:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding="utf8"
        )

    # 1. decode query-string as UTF-8 (or keep raw bytes),
    #    sort values,
    #    and percent-encode them back

    # Python's urllib.parse.parse_qsl does not work as wanted
    # for percent-encoded characters that do not match the passed encoding:
    # they get lost.
    #
    # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
    # (i.e. with 'REPLACEMENT CHARACTER' (U+FFFD),
    # instead of the \xa3 that you get with Python 2's parse_qsl)
    #
    # What we want here is to keep raw bytes, and percent-encode them,
    # so as to preserve whatever encoding was originally used.
    #
    # See https://tools.ietf.org/html/rfc3987#section-6.4:
    #
    # For example, it is possible to have a URI reference of
    # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
    # document name is encoded in iso-8859-1 based on server settings, but
    # where the fragment identifier is encoded in UTF-8 according to
    # [XPointer]. The IRI corresponding to the above URI would be (in XML
    # notation)
    # "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
    # Similar considerations apply to query parts. The functionality of
    # IRIs (namely, to be able to include non-ASCII characters) can only be
    # used if the query part is encoded in UTF-8.
    keyvals = parse_qsl_to_bytes(query, keep_blank_values)

    keyvals.sort()
    query = urlencode(keyvals)

    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
    #    and percent-encode path again (this normalizes to upper-case %XX)
    uqp = _unquotepath(path)
    path = quote(uqp, _path_safe_chars) or "/"

    fragment = "" if not keep_fragments else fragment

    # Apply lowercase to the domain, but not to the userinfo.
    netloc_parts = netloc.split("@")
    netloc_parts[-1] = netloc_parts[-1].lower().rstrip(":")
    netloc = "@".join(netloc_parts)

    # every part should be safe already
    return urlunparse((scheme, netloc, path, params, query, fragment))


def _unquotepath(path: str) -> bytes:
    for reserved in ("2f", "2F", "3f", "3F"):
        path = path.replace("%" + reserved, "%25" + reserved.upper())

    # standard lib's unquote() does not work for non-UTF-8
    # percent-escaped characters: they get lost.
    # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
    #
    # unquote_to_bytes() returns raw bytes instead
    return unquote_to_bytes(path)
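
# A minimal illustration (not part of the module): percent-encoded slashes
# and question marks are kept encoded (re-escaped with %25), while other
# percent-escapes are decoded to raw bytes.
#
#     >>> _unquotepath("/a%2Fb%a3")
#     b'/a%2Fb\xa3'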


def parse_url(
    url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = None
) -> ParseResult:
    """Return a parsed URL from the given argument (which may be an already
    parsed URL).
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))


def parse_qsl_to_bytes(
    qs: str, keep_blank_values: bool = False
) -> List[Tuple[bytes, bytes]]:
    """Parse a query given as a string argument.

    Data are returned as a list of name, value pairs as bytes.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings. A
        true value indicates that blanks should be retained as blank
        strings. The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    """
    # This code is the same as Python 3's parse_qsl()
    # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
    # except for the unquote(s, encoding, errors) calls replaced
    # with unquote_to_bytes(s)
    coerce_args = cast(Callable[..., Tuple[str, Callable[..., bytes]]], _coerce_args)
    qs, _coerce_result = coerce_args(qs)
    pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")]
    r = []
    for name_value in pairs:
        if not name_value:
            continue
        nv = name_value.split("=", 1)
        if len(nv) != 2:
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append("")
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name: StrOrBytes = nv[0].replace("+", " ")
            name = unquote_to_bytes(name)
            name = _coerce_result(name)
            value: StrOrBytes = nv[1].replace("+", " ")
            value = unquote_to_bytes(value)
            value = _coerce_result(value)
            r.append((name, value))
    return r
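
# A minimal illustration (not part of the module): values are returned as
# raw bytes, so percent-escapes that are not valid UTF-8 survive intact.
#
#     >>> parse_qsl_to_bytes("q=b%a3&empty=")
#     [(b'q', b'b\xa3')]
#     >>> parse_qsl_to_bytes("q=b%a3&empty=", keep_blank_values=True)
#     [(b'q', b'b\xa3'), (b'empty', b'')]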