Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/w3lib/url.py: 50%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

235 statements  

1""" 

2This module contains general purpose URL functions not found in the standard 

3library. 

4""" 

5 

6from __future__ import annotations 

7 

8import base64 

9import codecs 

10import os 

11import posixpath 

12import re 

13from pathlib import Path 

14from typing import TYPE_CHECKING, NamedTuple, cast, overload 

15from urllib.parse import ParseResult 

16from urllib.request import pathname2url 

17 

18from ._url import ( 

19 _PATH_SAFE_CHARS, 

20 _SAFE_CHARS, 

21 _SPECIAL_SCHEMES, 

22 # reexports 

23 RFC3986_GEN_DELIMS as RFC3986_GEN_DELIMS, 

24 RFC3986_RESERVED as RFC3986_RESERVED, 

25 RFC3986_SUB_DELIMS as RFC3986_SUB_DELIMS, 

26 RFC3986_UNRESERVED as RFC3986_UNRESERVED, 

27 RFC3986_USERINFO_SAFE_CHARS as RFC3986_USERINFO_SAFE_CHARS, 

28 _idna_bytes, 

29 _idna_str, 

30 _parse_qs, 

31 _parse_qsl, 

32 _quote, 

33 _quote_into, 

34 _strip, 

35 _unquote, 

36 _url2pathname, 

37 _urlencode, 

38 _urlparse, 

39 _urlsplit, 

40 _urlunparse, 

41 _urlunsplit, 

42) 

43from .util import to_unicode 

44 

45if TYPE_CHECKING: 

46 from collections.abc import Sequence 

47 

48 from ._types import AnyUnicodeError 

49 

50 

def _quote_byte(error: UnicodeError) -> tuple[str, int]:
    """Codec error handler for bytes-to-Unicode decoding errors in URLs.

    The slice of the input that could not be decoded is passed through
    ``_quote`` and the result is used as replacement text.

    Returns:
        A ``(replacement, resume_position)`` pair, the shape expected from a
        handler registered with :func:`codecs.register_error`.
    """
    err = cast("AnyUnicodeError", error)
    offending = err.object[err.start : err.end]
    if isinstance(offending, str):  # pragma: no cover
        offending = offending.encode()
    replacement = to_unicode(_quote(offending))
    return (replacement, err.end)


# Expose the handler under the name used as ``errors="percentencode"``.
codecs.register_error("percentencode", _quote_byte)

61 

# Per-component sets of characters that are safe under every one of:
#
# - RFC 2396 + RFC 2732, as interpreted by Java 8’s java.net.URI class
# - RFC 3986
# - The URL living standard
#
# NOTE: "%" is currently left out of these sets because of limitations of the
# current safe_url_string implementation. It should nevertheless be escaped
# as %25 whenever it is not already part of an escape sequence.
_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, b":;=")
_PATH_SAFEST_CHARS = _SAFE_CHARS.translate(None, b"#[]|")
# Query and fragment share the path set; queries of special schemes
# additionally lose the apostrophe.
_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS
_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, b"'")
_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS

77 

78 

def safe_url_string(
    url: str | bytes,
    encoding: str = "utf8",
    path_encoding: str = "utf8",
    quote_path: bool = True,
) -> str:
    """Return a URL equivalent to *url* that a wide range of web browsers and
    web servers accept as valid.

    *url* is parsed following the `URL living standard`_; when it is
    serialized again, additional characters are percent-encoded so that the
    result is also valid under other URL standards.

    .. _URL living standard: https://url.spec.whatwg.org/

    The returned URL should be valid under *all* of the following standards,
    which modern web browsers and web servers are known to enforce:

    -   `URL living standard`_

    -   `RFC 3986`_

    -   `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI
        class`_.

    .. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html
    .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt
    .. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt
    .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt

    A bytes *url* is first converted to `str` using *encoding* (``'utf8'`` by
    default). With *quote_path* set to True (the default), the path component
    is encoded with *path_encoding* (``'utf8'`` by default) and then quoted;
    with *quote_path* set to False the path is neither encoded nor quoted.
    *encoding* is also the encoding applied to the query string or form data.

    When passing an encoding, use the encoding of the page the URL was
    extracted from.

    Calling this function on an already "safe" URL returns it unmodified.
    """
    # Splitting chokes on bytes input with non-ASCII characters, so decode to
    # Unicode first, using the page encoding:
    # - raw bytes input is assumed to come from a document encoded with the
    #   supplied encoding (UTF-8 by default)
    # - bytes that encoding cannot decode are percent-encoded instead
    decoded = to_unicode(url, encoding=encoding, errors="percentencode")
    parts = _urlsplit(_strip(decoded))
    buf = bytearray()

    # --- netloc: [username[:password]@]host[:port] ---
    username = parts.username
    password = parts.password
    if username is not None:
        _quote_into(_unquote(username), buf, _USERINFO_SAFEST_CHARS)
    if password is not None:
        buf.append(ord(":"))
        _quote_into(_unquote(password), buf, _USERINFO_SAFEST_CHARS)
    if username is not None or password is not None:
        buf.append(ord("@"))

    hostname = parts.hostname
    if hostname is not None:
        if ":" not in hostname:
            try:
                buf += _idna_bytes(hostname)
            except UnicodeError:
                # IDNA encoding can fail for too long labels (>63 characters)
                # or missing labels (e.g. http://.example.com)
                buf += hostname.encode(encoding)
        else:
            # IPv6 address: splitting strips the brackets from the hostname,
            # but the netloc needs them back when the URL is rebuilt.
            buf += b"[" + hostname.encode("ascii") + b"]"

    port = parts.port
    if port is not None:
        buf.append(ord(":"))
        buf += str(port).encode(encoding)

    netloc = buf.decode()
    buf.clear()

    # --- path ---
    if not quote_path:
        path = parts.path
    else:
        _quote_into(parts.path.encode(path_encoding), buf, _PATH_SAFEST_CHARS)
        path = buf.decode()
        buf.clear()

    # --- query: special schemes use a stricter safe set ---
    if parts.scheme in _SPECIAL_SCHEMES:
        query_safe = _SPECIAL_QUERY_SAFEST_CHARS
    else:
        query_safe = _QUERY_SAFEST_CHARS
    _quote_into(parts.query.encode(encoding), buf, query_safe)
    query = buf.decode()
    buf.clear()

    # --- fragment: only quoted when non-empty ---
    fragment = parts.fragment
    if fragment:
        _quote_into(fragment.encode(encoding), buf, _FRAGMENT_SAFEST_CHARS)
        fragment = buf.decode()
        buf.clear()

    return _urlunsplit(parts.scheme, netloc, path, query, fragment)

204 

205 

# One or more "../" segments, optionally preceded by a slash.
_parent_dirs = re.compile(r"/?(\.\./)+")


def safe_download_url(
    url: str | bytes, encoding: str = "utf8", path_encoding: str = "utf8"
) -> str:
    """Build a URL suitable for downloading.

    The URL is first passed through :func:`safe_url_string`; its fragment is
    then dropped and its path is normalised.

    A path that points outside the document root is rewritten so that it
    stays within the document root. An empty path becomes ``/``.
    """
    safe_url = safe_url_string(url, encoding, path_encoding)
    scheme, netloc, path, query, _ = _urlsplit(safe_url)
    if not path:
        return _urlunsplit(scheme, netloc, "/", query, "")

    normalized = _parent_dirs.sub("", posixpath.normpath(path))
    # normpath() drops a trailing slash; put it back when the URL had one.
    if safe_url[-1] == "/" and normalized[-1] != "/":
        normalized = f"{normalized}/"
    return _urlunsplit(scheme, netloc, normalized, query, "")

228 

229 

def is_url(text: str) -> bool:
    """Return whether *text* starts with a ``file``, ``http`` or ``https``
    scheme followed by ``://``.

    A bare scheme name without the ``://`` separator (for example ``"http"``)
    is not considered a URL.
    """
    scheme, separator, _ = text.partition("://")
    # partition() returns the whole text as the first item when the separator
    # is missing, so the separator itself must be checked as well.
    return bool(separator) and scheme in {"file", "http", "https"}

232 

233 

@overload
def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: None = None,
    keep_blank_values: bool | int = 0,
) -> str | None: ...


@overload
def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: str,
    keep_blank_values: bool | int = 0,
) -> str: ...


def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: str | None = None,
    keep_blank_values: bool | int = 0,
) -> str | None:
    """Return the value of a url parameter, given the url and parameter name

    NOTE: If the parameter appears multiple times in the url, the first
    (leftmost) value is returned.

    General case:

    >>> import w3lib.url
    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "id")
    '200'
    >>>

    Return a default value if the parameter is not found:

    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
    'mydefault'
    >>>

    Returns None if `keep_blank_values` not set or 0 (default):

    >>> w3lib.url.url_query_parameter("product.html?id=", "id")
    >>>

    Returns an empty string if `keep_blank_values` set to 1:

    >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1)
    ''
    >>>

    """
    # str() on bytes would produce the "b'...'" repr and corrupt the query,
    # so bytes input is decoded instead.
    if isinstance(url, bytes):
        url = url.decode()

    queryparams = _parse_qs(
        _urlsplit(url).query, keep_blank_values=bool(keep_blank_values)
    )
    # The parsed query is keyed by bytes, so look the name up as bytes.
    parameter_bytes = parameter.encode()
    if parameter_bytes in queryparams:
        return queryparams[parameter_bytes][0].decode()
    return default

294 

295 

def url_query_cleaner(
    url: str | bytes,
    parameterlist: str | bytes | Sequence[str | bytes] = (),
    sep: str = "&",
    kvsep: str = "=",
    remove: bool = False,
    unique: bool = True,
    keep_fragments: bool = False,
) -> str:
    """Clean URL arguments leaving only those passed in the parameterlist keeping order

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``. An empty fragment is
    never re-attached, so no dangling ``#`` is produced.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'

    """
    # A single parameter name is accepted as shorthand for a one-item list.
    if parameterlist and isinstance(parameterlist, (str, bytes)):
        parameterlist = (parameterlist,)

    if isinstance(url, bytes):
        url = url.decode()

    url, _, fragment = url.partition("#")
    base, _, query = url.partition("?")

    # Computed once so that every return path treats the fragment the same
    # way: it is re-attached only when requested AND non-empty.
    suffix = f"#{fragment}" if keep_fragments and fragment else ""

    # Nothing to filter: either there is no query, or no parameter is
    # whitelisted (and we are not in "remove" mode), so the query is dropped.
    if not query or (not parameterlist and not remove):
        return base + suffix

    param_lookup = frozenset(parameterlist)
    seen: set[str] | None = set() if unique else None
    kept: list[str] = []

    for ksv in query.split(sep):
        if not ksv:
            continue

        k, _, _ = ksv.partition(kvsep)

        # With unique=True only the first occurrence of a key is considered,
        # whether or not that occurrence ends up being kept.
        if seen is not None:
            if k in seen:
                continue
            seen.add(k)

        # remove=True drops listed keys; remove=False drops unlisted keys.
        if (k in param_lookup) == remove:
            continue

        kept.append(ksv)

    cleaned = f"{base}?{sep.join(kept)}" if kept else base
    return cleaned + suffix

380 

381 

def _add_or_replace_parameters(url: str, params: dict[bytes, bytes]) -> str:
    """Return *url* with its query updated from *params*.

    Existing query arguments keep their position; when a name is present in
    *params* its value is replaced. Only the first occurrence of a repeated
    name in the existing query is kept. Names from *params* that were not in
    the query are appended in the iteration order of *params*.
    """
    parsed = _urlsplit(url)

    merged: list[tuple[bytes, bytes]] = []
    handled: set[bytes] = set()

    for key, old_value in _parse_qsl(parsed.query, keep_blank_values=True):
        if key in handled:
            continue
        handled.add(key)
        new_value = params.get(key)
        merged.append((key, old_value if new_value is None else new_value))

    merged.extend(
        (key, value) for key, value in params.items() if key not in handled
    )

    return _urlunsplit(
        parsed.scheme,
        parsed.netloc,
        parsed.path,
        _urlencode(merged).decode(),
        parsed.fragment,
    )

412 

413 

def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
    """Add a parameter to a given url, or replace its value if already present

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php', 'arg', 'v')
    'http://www.example.com/index.php?arg=v'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg4', 'v4')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg3', 'v3new')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new'
    >>>

    """
    # The shared helper works on bytes names and values.
    encoded = {name.encode(): new_value.encode()}
    return _add_or_replace_parameters(url, encoded)

428 

429 

def add_or_replace_parameters(url: str, new_parameters: dict[str, str]) -> str:
    """Add parameters to a given url, replacing the values of those already present

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php', {'arg': 'v'})
    'http://www.example.com/index.php?arg=v'
    >>> args = {'arg4': 'v4', 'arg3': 'v3new'}
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', args)
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new&arg4=v4'
    >>>

    """
    # The shared helper works on bytes names and values.
    encoded: dict[bytes, bytes] = {}
    for key, value in new_parameters.items():
        encoded[key.encode()] = value.encode()
    return _add_or_replace_parameters(url, encoded)

445 

446 

def path_to_file_uri(path: str | os.PathLike[str]) -> str:
    """Turn a local filesystem path into a legal File URI, as described in:
    http://en.wikipedia.org/wiki/File_URI_scheme

    The path is made absolute before it is converted.
    """
    absolute = Path(path).absolute()
    url_path = pathname2url(str(absolute))
    # Leading slashes are stripped so the result always has exactly the
    # three slashes of the "file:///" prefix.
    return "file:///" + url_path.lstrip("/")

452 

453 

def file_uri_to_path(uri: str) -> str:
    """Turn a File URI into a local filesystem path, according to:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    # Index 2 of the parsed URI is its path component.
    uri_path = _urlparse(uri)[2]
    return _url2pathname(uri_path)

459 

460 

def any_to_uri(uri_or_path: str) -> str:
    """Return the File URI for a path name; anything that already is a URI
    is returned unmodified.
    """
    # A drive prefix is checked first so that a drive letter is not mistaken
    # for a URI scheme; otherwise the input counts as a path when parsing it
    # yields no scheme (index 0).
    has_drive = bool(os.path.splitdrive(uri_or_path)[0])
    if has_drive or not _urlparse(uri_or_path)[0]:
        return path_to_file_uri(uri_or_path)
    return uri_or_path

468 

469 

# Characters with code points 0 through 126.
_char = {chr(code) for code in range(127)}

# RFC 2045 token: any of the characters above except control characters,
# tspecials and space.
_token = (
    "["
    + re.escape(
        "".join(
            _char
            - {chr(code) for code in range(32)}
            - set('()<>@,;:\\"/[]?= ')
        )
    )
    + "]+"
)

# RFC 822 quoted-string, without surrounding quotation marks: either a plain
# character (anything but a double quote, a backslash or a carriage return)
# or a backslash followed by any character.
_quoted_string = (
    r"(?:["
    + re.escape("".join(_char - {'"', "\\", "\r"}))
    + r"]|(?:\\["
    + re.escape("".join(_char))
    + r"]))*"
)

# The expressions are assembled as str and encoded afterwards, because
# Python 3 bytes have no format() method while re.compile() needs a bytes
# pattern to be able to match on bytes.

# RFC 2397 mediatype.
_mediatype_pattern = re.compile((_token + "/" + _token).encode())
_mediatype_parameter_pattern = re.compile(
    (";(" + _token + ")=(?:(" + _token + ')|"(' + _quoted_string + ')")').encode()
)
# The building blocks are only needed to assemble the two patterns above.
del _char, _token, _quoted_string

503 

504 

class ParseDataURIResult(NamedTuple):
    """Named tuple that :func:`parse_data_uri` returns."""

    #: MIME type as ``type/subtype`` (e.g. ``"text/plain"``).
    media_type: str
    #: Parameters attached to the MIME type (e.g. ``{"charset": "US-ASCII"}``).
    media_type_parameters: dict[str, str]
    #: Payload of the URI; already decoded when it was base64-encoded.
    data: bytes

514 

515 

def parse_data_uri(uri: str | bytes) -> ParseDataURIResult:
    """Parse a data: URI into :class:`ParseDataURIResult`.

    A ``str`` input is first passed through :func:`safe_url_string`; a
    ``bytes`` input is used as is.

    Raises:
        ValueError: if *uri* has no scheme or nothing after the scheme, if
            the scheme is not exactly ``data`` (case-insensitive), or if what
            precedes the comma, once the media type and its parameters have
            been consumed, is something other than ``;base64``.
    """
    if not isinstance(uri, bytes):
        uri = safe_url_string(uri).encode("ascii")

    scheme, _, uri = uri.partition(b":")
    if not scheme or not uri:
        raise ValueError("invalid URI")
    # Compare the whole scheme: checking only a 4-byte prefix would accept
    # schemes such as "database:". The length test keeps the rejection of
    # long non-data schemes cheap.
    if len(scheme) != 4 or scheme.lower() != b"data":
        raise ValueError("not a data URI")

    # RFC 3986 section 2.1 allows percent encoding to escape characters that
    # would be interpreted as delimiters, implying that actual delimiters
    # should not be percent-encoded.
    # Decoding before parsing will allow malformed URIs with percent-encoded
    # delimiters, but it makes parsing easier and should not affect
    # well-formed URIs, as the delimiters used in this URI scheme are not
    # allowed, percent-encoded or not, in tokens.
    uri = _unquote(uri)

    media_type = "text/plain"
    media_type_params: dict[str, str] = {}

    m = _mediatype_pattern.match(uri)
    if m:
        media_type = m.group().decode()
        uri = uri[m.end() :]
    else:
        # No explicit media type: the charset default applies as well.
        media_type_params["charset"] = "US-ASCII"

    # Consume every ";attribute=value" parameter that follows.
    while m := _mediatype_parameter_pattern.match(uri):
        attribute, value, value_quoted = m.groups()
        if value_quoted:
            # Unescape backslash-escaped characters of a quoted value.
            value = re.sub(rb"\\(.)", rb"\1", value_quoted)
        media_type_params[attribute.decode()] = value.decode()
        uri = uri[m.end() :]

    # Whatever is left before the comma must be empty or the base64 marker.
    is_base64, _, data = uri.partition(b",")
    if is_base64:
        if is_base64 != b";base64":
            raise ValueError("invalid data URI")
        data = base64.b64decode(data)

    return ParseDataURIResult(media_type, media_type_params, data)

560 

561 

# Names exported by ``from w3lib.url import *``.
__all__ = [
    "add_or_replace_parameter",
    "add_or_replace_parameters",
    "any_to_uri",
    "canonicalize_url",
    "file_uri_to_path",
    "is_url",
    "parse_data_uri",
    "path_to_file_uri",
    "safe_download_url",
    "safe_url_string",
    "url_query_cleaner",
    "url_query_parameter",
]

576 

577 

def _safe_ParseResult(
    parts: ParseResult, encoding: str = "utf8", path_encoding: str = "utf8"
) -> tuple[str, str, str, str, str, str]:
    """Return the six components of *parts* in quoted form.

    The netloc goes through ``_idna_str`` and is kept unchanged if that
    fails. The path is encoded with *path_encoding*; params, query and
    fragment are encoded with *encoding*. Each is then quoted.

    Returns:
        ``(scheme, netloc, path, params, query, fragment)``.
    """
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = _idna_str(parts.netloc)
    except UnicodeError:
        netloc = parts.netloc

    scratch = bytearray()

    def quoted(component: str, codec: str, safe_chars: bytes) -> str:
        # Quote into the shared buffer, read the text back, reset the buffer.
        _quote_into(component.encode(codec), scratch, safe_chars)
        text = scratch.decode()
        scratch.clear()
        return text

    path = quoted(parts.path, path_encoding, _PATH_SAFE_CHARS)
    params = quoted(parts.params, encoding, _SAFE_CHARS)
    query = quoted(parts.query, encoding, _SAFE_CHARS)
    fragment = quoted(parts.fragment, encoding, _SAFE_CHARS)

    return (parts.scheme, netloc, path, params, query, fragment)

614 

615 

def canonicalize_url(
    url: str | bytes | ParseResult,
    keep_blank_values: bool = True,
    keep_fragments: bool = False,
    encoding: str | None = None,
) -> str:
    r"""Canonicalize the given url by applying the following procedures:

    - make the URL safe
    - sort query arguments, first by key, then by value
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless `keep_blank_values` is True)
    - remove fragments (unless `keep_fragments` is True)

    The url passed can be bytes or str (or an already parsed url), while the
    url returned is always a str.

    >>> import w3lib.url
    >>>
    >>> # sorting query arguments
    >>> w3lib.url.canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50')
    'http://www.example.com/do?a=50&b=2&b=5&c=3'
    >>>
    >>> # UTF-8 conversion + percent-encoding of non-ASCII characters
    >>> w3lib.url.canonicalize_url('http://www.example.com/r\u00e9sum\u00e9')
    'http://www.example.com/r%C3%A9sum%C3%A9'
    >>>

    For more examples, see the tests in `tests/test_url.py`.
    """
    if isinstance(url, str):
        url = _strip(url)

    # If the supplied `encoding` cannot represent every character of `url`,
    # fall back to UTF-8 as a safety net. UTF-8 handles all Unicode
    # characters, so URL normalization is covered, even if the result is not
    # the exact URL the remote website expects.
    try:
        safe_parts = _safe_ParseResult(parse_url(url), encoding=encoding or "utf8")
    except UnicodeEncodeError:
        safe_parts = _safe_ParseResult(parse_url(url), encoding="utf8")
    scheme, netloc, path, params, query, fragment = safe_parts

    # 1. decode query-string as UTF-8 (or keep raw bytes),
    #    sort values,
    #    and percent-encode them back

    # Python's urllib.parse.parse_qsl does not work as wanted
    # for percent-encoded characters that do not match passed encoding,
    # they get lost.
    #
    # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
    # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
    #      instead of \xa3 that you get with Python2's parse_qsl)
    #
    # what we want here is to keep raw bytes, and percent encode them
    # so as to preserve whatever encoding was originally used.
    #
    # See https://tools.ietf.org/html/rfc3987#section-6.4:
    #
    # For example, it is possible to have a URI reference of
    # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
    # document name is encoded in iso-8859-1 based on server settings, but
    # where the fragment identifier is encoded in UTF-8 according to
    # [XPointer]. The IRI corresponding to the above URI would be (in XML
    # notation)
    # "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
    # Similar considerations apply to query parts.  The functionality of
    # IRIs (namely, to be able to include non-ASCII characters) can only be
    # used if the query part is encoded in UTF-8.
    if query:
        pairs = _parse_qsl(query, keep_blank_values)
        if len(pairs) > 1:
            pairs.sort()
        query = _urlencode(pairs).decode()

    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
    #    and percent-encode path again (this normalizes to upper-case %XX)
    if path:
        path = _quote(_unquotepath(path), _PATH_SAFE_CHARS).decode()
    else:
        path = "/"

    if not keep_fragments:
        fragment = ""

    # Apply lowercase to the domain, but not to the userinfo. A trailing ":"
    # (empty port) is dropped from the host part.
    userinfo, at_sign, hostport = netloc.rpartition("@")
    netloc = userinfo + at_sign + hostport.lower().removesuffix(":")

    # every part should be safe already
    return _urlunparse(scheme, netloc, path, params, query, fragment)

716 

717 

def _unquotepath(path: str) -> bytes:
    """Return *path* as bytes with its percent-escapes unquoted.

    Escaped slashes (``%2F``) and question marks (``%3F``) are escaped a
    second time before unquoting, presumably so that they come out of
    ``_unquote`` still percent-encoded.
    """
    if "%" not in path:
        return path.encode()

    # standard lib's unquote() does not work for non-UTF-8
    # percent-escaped characters, they get lost.
    # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
    protected = path
    for escape, double_escape in (
        ("%2f", "%252F"),
        ("%2F", "%252F"),
        ("%3f", "%253F"),
        ("%3F", "%253F"),
    ):
        protected = protected.replace(escape, double_escape)
    return _unquote(protected)

730 

731 

def parse_url(
    url: str | bytes | ParseResult, encoding: str | None = None
) -> ParseResult:
    """Return the parsed form of *url*.

    An argument that already is a parsed url is returned as is; anything else
    is converted with ``to_unicode`` (using *encoding*) and then parsed.
    """
    already_parsed = isinstance(url, ParseResult)
    return url if already_parsed else _urlparse(to_unicode(url, encoding))

741 

742 

def parse_qsl_to_bytes(
    qs: str, keep_blank_values: bool = False
) -> list[tuple[bytes, bytes]]:
    """Parse a query string into a list of ``(name, value)`` pairs of bytes.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: whether blank values in percent-encoded queries
        are kept. When true, blank values are retained as blank strings.
        When false (the default), blank values are ignored and treated as
        if they were not included.

    """
    pairs = _parse_qsl(qs, keep_blank_values)
    return pairs