1""" 

2This module contains general purpose URL functions not found in the standard 

3library. 

4""" 

5 

6from __future__ import annotations 

7 

8import base64 

9import codecs 

10import os 

11import posixpath 

12import re 

13import string 

14from pathlib import Path 

15from typing import TYPE_CHECKING, Callable, NamedTuple, cast, overload 

16from urllib.parse import ( # type: ignore[attr-defined] 

17 ParseResult, 

18 _coerce_args, 

19 parse_qs, 

20 parse_qsl, 

21 quote, 

22 unquote, 

23 unquote_to_bytes, 

24 urldefrag, 

25 urlencode, 

26 urlparse, 

27 urlsplit, 

28 urlunparse, 

29 urlunsplit, 

30) 

31from urllib.request import pathname2url, url2pathname 

32 

33from ._infra import _ASCII_TAB_OR_NEWLINE, _C0_CONTROL_OR_SPACE 

34from ._url import _SPECIAL_SCHEMES 

35from .util import to_unicode 

36 

37if TYPE_CHECKING: 

38 from collections.abc import Sequence 

39 

40 from ._types import AnyUnicodeError 

41 

42 

43# error handling function for bytes-to-Unicode decoding errors with URLs 

44def _quote_byte(error: UnicodeError) -> tuple[str, int]: 

45 error = cast("AnyUnicodeError", error) 

46 return (to_unicode(quote(error.object[error.start : error.end])), error.end) 

47 

48 

49codecs.register_error("percentencode", _quote_byte) 

50 

51# constants from RFC 3986, Section 2.2 and 2.3 

52RFC3986_GEN_DELIMS = b":/?#[]@" 

53RFC3986_SUB_DELIMS = b"!$&'()*+,;=" 

54RFC3986_RESERVED = RFC3986_GEN_DELIMS + RFC3986_SUB_DELIMS 

55RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode("ascii") 

56EXTRA_SAFE_CHARS = b"|" # see https://github.com/scrapy/w3lib/pull/25 

57 

58RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":" 

59_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b"%" 

60_path_safe_chars = _safe_chars.replace(b"#", b"") 

61 

62# Characters that are safe in all of: 

63# 

64# - RFC 2396 + RFC 2732, as interpreted by Java 8’s java.net.URI class 

65# - RFC 3986 

66# - The URL living standard 

67# 

68# NOTE: % is currently excluded from these lists of characters, due to 

69# limitations of the current safe_url_string implementation, but it should also 

70# be escaped as %25 when it is not already being used as part of an escape 

71# character. 

72_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, delete=b":;=") 

73_PATH_SAFEST_CHARS = _safe_chars.translate(None, delete=b"#[]|") 

74_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS 

75_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, delete=b"'") 

76_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS 

77 

78 

79_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE = { 

80 ord(char): None for char in _ASCII_TAB_OR_NEWLINE 

81} 

82 

83 

84def _strip(url: str) -> str: 

85 return url.strip(_C0_CONTROL_OR_SPACE).translate( 

86 _ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE 

87 ) 

88 
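
# Illustrative sketch (an addition, not part of the original module):
# _strip() implements the URL living standard's pre-parsing cleanup,
# stripping leading/trailing C0 controls and spaces and deleting ASCII
# tab/newline characters anywhere in the URL. For example:
#
#     assert _strip("  http://exam\tple.com/\n  ") == "http://example.com/"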



def safe_url_string(  # pylint: disable=too-many-locals
    url: str | bytes,
    encoding: str = "utf8",
    path_encoding: str = "utf8",
    quote_path: bool = True,
) -> str:
    """Return a URL equivalent to *url* that a wide range of web browsers and
    web servers consider valid.

    *url* is parsed according to the rules of the `URL living standard`_,
    and during serialization additional characters are percent-encoded to make
    the URL valid by additional URL standards.

    .. _URL living standard: https://url.spec.whatwg.org/

    The returned URL should be valid by *all* of the following URL standards
    known to be enforced by modern-day web browsers and web servers:

    - `URL living standard`_

    - `RFC 3986`_

    - `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI
      class`_.

    .. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html
    .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt
    .. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt
    .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). If *quote_path* is ``True``
    (default), the URL path component is encoded with *path_encoding*
    ('utf-8' by default) and then quoted; otherwise, the path component is
    neither encoded nor quoted. The given encoding is used for the query
    string or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.
    """
    # urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    # - it is assumed that a raw bytes input comes from a document
    #   encoded with the supplied encoding (or UTF-8 by default)
    # - if the supplied (or default) encoding chokes,
    #   percent-encode offending bytes
    decoded = to_unicode(url, encoding=encoding, errors="percentencode")
    parts = urlsplit(_strip(decoded))

    username, password, hostname, port = (
        parts.username,
        parts.password,
        parts.hostname,
        parts.port,
    )
    netloc_bytes = b""
    if username is not None or password is not None:
        if username is not None:
            safe_username = quote(unquote(username), _USERINFO_SAFEST_CHARS)
            netloc_bytes += safe_username.encode(encoding)
        if password is not None:
            netloc_bytes += b":"
            safe_password = quote(unquote(password), _USERINFO_SAFEST_CHARS)
            netloc_bytes += safe_password.encode(encoding)
        netloc_bytes += b"@"
    if hostname is not None:
        try:
            netloc_bytes += hostname.encode("idna")
        except UnicodeError:
            # IDNA encoding can fail for too long labels (>63 characters)
            # or missing labels (e.g. http://.example.com)
            netloc_bytes += hostname.encode(encoding)
    if port is not None:
        netloc_bytes += b":"
        netloc_bytes += str(port).encode(encoding)

    netloc = netloc_bytes.decode()

    # default encoding for path component SHOULD be UTF-8
    if quote_path:
        path = quote(parts.path.encode(path_encoding), _PATH_SAFEST_CHARS)
    else:
        path = parts.path

    if parts.scheme in _SPECIAL_SCHEMES:
        query = quote(parts.query.encode(encoding), _SPECIAL_QUERY_SAFEST_CHARS)
    else:
        query = quote(parts.query.encode(encoding), _QUERY_SAFEST_CHARS)

    return urlunsplit(
        (
            parts.scheme,
            netloc,
            path,
            query,
            quote(parts.fragment.encode(encoding), _FRAGMENT_SAFEST_CHARS),
        )
    )
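
# Illustrative usage (an addition, not part of the original module):
# non-ASCII path characters are percent-encoded using path_encoding, while
# an already-safe URL is returned unchanged:
#
#     >>> safe_url_string("http://www.example.com/£")
#     'http://www.example.com/%C2%A3'
#     >>> safe_url_string("http://www.example.com/%C2%A3")
#     'http://www.example.com/%C2%A3'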



_parent_dirs = re.compile(r"/?(\.\./)+")


def safe_download_url(
    url: str | bytes, encoding: str = "utf8", path_encoding: str = "utf8"
) -> str:
    """Make a url for download. This will call safe_url_string
    and then strip the fragment, if one exists. The path will
    be normalised.

    If the path is outside the document root, it will be changed
    to be within the document root.
    """
    safe_url = safe_url_string(url, encoding, path_encoding)
    scheme, netloc, path, query, _ = urlsplit(safe_url)
    if path:
        path = _parent_dirs.sub("", posixpath.normpath(path))
        if safe_url.endswith("/") and not path.endswith("/"):
            path += "/"
    else:
        path = "/"
    return urlunsplit((scheme, netloc, path, query, ""))
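
# Illustrative usage (an addition, not part of the original module): parent
# directory references are collapsed so the path cannot escape the document
# root, and any fragment is dropped:
#
#     >>> safe_download_url('http://www.example.org/../eggs.html#frag')
#     'http://www.example.org/eggs.html'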



def is_url(text: str) -> bool:
    return text.partition("://")[0] in ("file", "http", "https")
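
# Illustrative usage (an addition, not part of the original module): only
# the file, http and https schemes are recognized:
#
#     >>> is_url("http://example.com/")
#     True
#     >>> is_url("ftp://example.com/")
#     False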



@overload
def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: None = None,
    keep_blank_values: bool | int = 0,
) -> str | None: ...


@overload
def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: str,
    keep_blank_values: bool | int = 0,
) -> str: ...


def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: str | None = None,
    keep_blank_values: bool | int = 0,
) -> str | None:
    """Return the value of a url parameter, given the url and parameter name.

    General case:

    >>> import w3lib.url
    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "id")
    '200'
    >>>

    Return a default value if the parameter is not found:

    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
    'mydefault'
    >>>

    Returns None if `keep_blank_values` is not set or is 0 (the default):

    >>> w3lib.url.url_query_parameter("product.html?id=", "id")
    >>>

    Returns an empty string if `keep_blank_values` is set to 1:

    >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1)
    ''
    >>>

    """

    queryparams = parse_qs(
        urlsplit(str(url))[3], keep_blank_values=bool(keep_blank_values)
    )
    if parameter in queryparams:
        return queryparams[parameter][0]
    return default


def url_query_cleaner(
    url: str | bytes,
    parameterlist: str | bytes | Sequence[str | bytes] = (),
    sep: str = "&",
    kvsep: str = "=",
    remove: bool = False,
    unique: bool = True,
    keep_fragments: bool = False,
) -> str:
    """Clean URL arguments, leaving only those passed in ``parameterlist``
    and keeping their order.

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys:

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**:

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'

    """

    if isinstance(parameterlist, (str, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    url = cast("str", url)
    fragment = cast("str", fragment)
    base, _, query = url.partition("?")
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        if not ksv:
            continue
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        if remove and k in parameterlist:
            continue
        if not remove and k not in parameterlist:
            continue
        querylist.append(ksv)
        seen.add(k)
    url = "?".join([base, sep.join(querylist)]) if querylist else base
    if keep_fragments and fragment:
        url += "#" + fragment
    return url


def _add_or_replace_parameters(url: str, params: dict[str, str]) -> str:
    parsed = urlsplit(url)
    current_args = parse_qsl(parsed.query, keep_blank_values=True)

    new_args = []
    seen_params = set()
    for name, value in current_args:
        if name not in params:
            new_args.append((name, value))
        elif name not in seen_params:
            new_args.append((name, params[name]))
            seen_params.add(name)

    not_modified_args = [
        (name, value) for name, value in params.items() if name not in seen_params
    ]
    new_args += not_modified_args

    query = urlencode(new_args)
    return urlunsplit(parsed._replace(query=query))
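
# Illustrative sketch of the helper's semantics (an addition, not part of
# the original module): existing parameters keep their position, the first
# occurrence of a replaced parameter takes the new value, later duplicates
# of it are dropped, and genuinely new parameters are appended:
#
#     >>> _add_or_replace_parameters("http://example.com/?a=1&b=2&a=3", {"a": "9", "c": "4"})
#     'http://example.com/?a=9&b=2&c=4'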



def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
    """Add or replace a parameter in the given url

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php', 'arg', 'v')
    'http://www.example.com/index.php?arg=v'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg4', 'v4')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg3', 'v3new')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new'
    >>>

    """
    return _add_or_replace_parameters(url, {name: new_value})


def add_or_replace_parameters(url: str, new_parameters: dict[str, str]) -> str:
    """Add or replace parameters in the given url

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php', {'arg': 'v'})
    'http://www.example.com/index.php?arg=v'
    >>> args = {'arg4': 'v4', 'arg3': 'v3new'}
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', args)
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new&arg4=v4'
    >>>

    """
    return _add_or_replace_parameters(url, new_parameters)


def path_to_file_uri(path: str | os.PathLike[str]) -> str:
    """Convert a local filesystem path to a legal file URI as described in:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    x = pathname2url(str(Path(path).absolute()))
    return f"file:///{x.lstrip('/')}"


def file_uri_to_path(uri: str) -> str:
    """Convert a file URI to a local filesystem path according to:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    uri_path = urlparse(uri).path
    return url2pathname(uri_path)


def any_to_uri(uri_or_path: str) -> str:
    """If given a path name, return its file URI; otherwise return it
    unmodified.
    """
    if os.path.splitdrive(uri_or_path)[0]:
        return path_to_file_uri(uri_or_path)
    u = urlparse(uri_or_path)
    return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)
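
# Illustrative usage (an addition, not part of the original module; POSIX
# paths shown, results differ on Windows): path_to_file_uri() and
# file_uri_to_path() round-trip a local path, and any_to_uri() leaves URLs
# that already have a scheme untouched:
#
#     >>> path_to_file_uri("/tmp/test.txt")
#     'file:///tmp/test.txt'
#     >>> file_uri_to_path("file:///tmp/test.txt")
#     '/tmp/test.txt'
#     >>> any_to_uri("http://example.com/")
#     'http://example.com/'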



# ASCII characters.
_char = set(map(chr, range(127)))

# RFC 2045 token.
_token = r"[{}]+".format(
    re.escape(
        "".join(
            _char
            -
            # Control characters.
            set(map(chr, range(32)))
            -
            # tspecials and space.
            set('()<>@,;:\\"/[]?= ')
        )
    )
)

# RFC 822 quoted-string, without surrounding quotation marks.
_quoted_string = r"(?:[{}]|(?:\\[{}]))*".format(
    re.escape("".join(_char - {'"', "\\", "\r"})), re.escape("".join(_char))
)

# Encode the regular expression strings to make them into bytes, as Python 3
# bytes have no format() method, but bytes must be passed to re.compile() in
# order to make a pattern object that can be used to match on bytes.

# RFC 2397 mediatype.
_mediatype_pattern = re.compile(rf"{_token}/{_token}".encode())
_mediatype_parameter_pattern = re.compile(
    rf';({_token})=(?:({_token})|"({_quoted_string})")'.encode()
)


class ParseDataURIResult(NamedTuple):
    """Named tuple returned by :func:`parse_data_uri`."""

    #: MIME type type and subtype, separated by / (e.g. ``"text/plain"``).
    media_type: str
    #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``).
    media_type_parameters: dict[str, str]
    #: Data, decoded if it was encoded in base64 format.
    data: bytes


def parse_data_uri(uri: str | bytes) -> ParseDataURIResult:
    """Parse a data: URI into :class:`ParseDataURIResult`."""
    if not isinstance(uri, bytes):
        uri = safe_url_string(uri).encode("ascii")

    try:
        scheme, uri = uri.split(b":", 1)
    except ValueError:
        raise ValueError("invalid URI")
    if scheme.lower() != b"data":
        raise ValueError("not a data URI")

    # RFC 3986 section 2.1 allows percent encoding to escape characters that
    # would be interpreted as delimiters, implying that actual delimiters
    # should not be percent-encoded.
    # Decoding before parsing will allow malformed URIs with percent-encoded
    # delimiters, but it makes parsing easier and should not affect
    # well-formed URIs, as the delimiters used in this URI scheme are not
    # allowed, percent-encoded or not, in tokens.
    uri = unquote_to_bytes(uri)

    media_type = "text/plain"
    media_type_params = {}

    m = _mediatype_pattern.match(uri)
    if m:
        media_type = m.group().decode()
        uri = uri[m.end() :]
    else:
        media_type_params["charset"] = "US-ASCII"

    while True:
        m = _mediatype_parameter_pattern.match(uri)
        if m:
            attribute, value, value_quoted = m.groups()
            if value_quoted:
                value = re.sub(rb"\\(.)", rb"\1", value_quoted)
            media_type_params[attribute.decode()] = value.decode()
            uri = uri[m.end() :]
        else:
            break

    try:
        is_base64, data = uri.split(b",", 1)
    except ValueError:
        raise ValueError("invalid data URI")
    if is_base64:
        if is_base64 != b";base64":
            raise ValueError("invalid data URI")
        data = base64.b64decode(data)

    return ParseDataURIResult(media_type, media_type_params, data)
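
# Illustrative usage (an addition, not part of the original module): a data
# URI without a media type defaults to text/plain;charset=US-ASCII, and
# ";base64" payloads are decoded:
#
#     >>> parse_data_uri("data:,A%20brief%20note")
#     ParseDataURIResult(media_type='text/plain', media_type_parameters={'charset': 'US-ASCII'}, data=b'A brief note')
#     >>> parse_data_uri("data:text/plain;base64,SGVsbG8=").data
#     b'Hello'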



__all__ = [
    "add_or_replace_parameter",
    "add_or_replace_parameters",
    "any_to_uri",
    "canonicalize_url",
    "file_uri_to_path",
    "is_url",
    "parse_data_uri",
    "path_to_file_uri",
    "safe_download_url",
    "safe_url_string",
    "url_query_cleaner",
    "url_query_parameter",
]


def _safe_ParseResult(
    parts: ParseResult, encoding: str = "utf8", path_encoding: str = "utf8"
) -> tuple[str, str, str, str, str, str]:
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode("idna").decode()
    except UnicodeError:
        netloc = parts.netloc

    return (
        parts.scheme,
        netloc,
        quote(parts.path.encode(path_encoding), _path_safe_chars),
        quote(parts.params.encode(path_encoding), _safe_chars),
        quote(parts.query.encode(encoding), _safe_chars),
        quote(parts.fragment.encode(encoding), _safe_chars),
    )


def canonicalize_url(
    url: str | bytes | ParseResult,
    keep_blank_values: bool = True,
    keep_fragments: bool = False,
    encoding: str | None = None,
) -> str:
    r"""Canonicalize the given url by applying the following procedures:

    - make the URL safe
    - sort query arguments, first by key, then by value
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless `keep_blank_values`
      is True)
    - remove fragments (unless `keep_fragments` is True)

    The url passed can be bytes or unicode, while the url returned is
    always a native str.

    >>> import w3lib.url
    >>>
    >>> # sorting query arguments
    >>> w3lib.url.canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50')
    'http://www.example.com/do?a=50&b=2&b=5&c=3'
    >>>
    >>> # UTF-8 conversion + percent-encoding of non-ASCII characters
    >>> w3lib.url.canonicalize_url('http://www.example.com/r\u00e9sum\u00e9')
    'http://www.example.com/r%C3%A9sum%C3%A9'
    >>>

    For more examples, see the tests in `tests/test_url.py`.
    """
    # If the supplied `encoding` is not compatible with all characters in
    # `url`, fall back to UTF-8 as a safety net.
    # UTF-8 can handle all Unicode characters,
    # so we should be covered regarding URL normalization,
    # if not for the exact URL expected by the remote website.
    if isinstance(url, str):
        url = _strip(url)
    try:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding=encoding or "utf8"
        )
    except UnicodeEncodeError:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding="utf8"
        )

    # 1. decode query-string as UTF-8 (or keep raw bytes),
    #    sort values,
    #    and percent-encode them back

    # Python's urllib.parse.parse_qsl does not work as wanted
    # for percent-encoded characters that do not match the passed encoding;
    # they get lost.
    #
    # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
    # (i.e. with 'REPLACEMENT CHARACTER' (U+FFFD),
    # instead of the \xa3 that you get with Python 2's parse_qsl)
    #
    # What we want here is to keep raw bytes, and percent-encode them,
    # so as to preserve whatever encoding was originally used.
    #
    # See https://tools.ietf.org/html/rfc3987#section-6.4:
    #
    #   For example, it is possible to have a URI reference of
    #   "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
    #   document name is encoded in iso-8859-1 based on server settings, but
    #   where the fragment identifier is encoded in UTF-8 according to
    #   [XPointer]. The IRI corresponding to the above URI would be (in XML
    #   notation)
    #   "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
    #   Similar considerations apply to query parts. The functionality of
    #   IRIs (namely, to be able to include non-ASCII characters) can only be
    #   used if the query part is encoded in UTF-8.
    keyvals = parse_qsl_to_bytes(query, keep_blank_values)

    keyvals.sort()
    query = urlencode(keyvals)

    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
    #    and percent-encode path again (this normalizes to upper-case %XX)
    uqp = _unquotepath(path)
    path = quote(uqp, _path_safe_chars) or "/"

    fragment = "" if not keep_fragments else fragment

    # Apply lowercase to the domain, but not to the userinfo.
    netloc_parts = netloc.split("@")
    netloc_parts[-1] = netloc_parts[-1].lower().rstrip(":")
    netloc = "@".join(netloc_parts)

    # every part should be safe already
    return urlunparse((scheme, netloc, path, params, query, fragment))
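
# Illustrative usage (an addition, not part of the original module):
# non-UTF-8 percent-encoded octets in the query are kept as raw bytes and
# only their hex case is normalized, instead of being mangled into U+FFFD:
#
#     >>> canonicalize_url('http://www.example.com/do?q=b%a3')
#     'http://www.example.com/do?q=b%A3'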



def _unquotepath(path: str) -> bytes:
    for reserved in ("2f", "2F", "3f", "3F"):
        path = path.replace("%" + reserved, "%25" + reserved.upper())

    # The standard library's unquote() does not work for non-UTF-8
    # percent-escaped characters; they get lost.
    # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
    #
    # unquote_to_bytes() returns raw bytes instead
    return unquote_to_bytes(path)
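
# Illustrative usage (an addition, not part of the original module):
# percent-encoded slashes and question marks stay encoded, so unquoting
# does not change the structure of the URL path:
#
#     >>> _unquotepath('a%2Fb%20c')
#     b'a%2Fb c'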



def parse_url(
    url: str | bytes | ParseResult, encoding: str | None = None
) -> ParseResult:
    """Return a urlparsed url from the given argument (which could be an
    already parsed url).
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))


def parse_qsl_to_bytes(
    qs: str, keep_blank_values: bool = False
) -> list[tuple[bytes, bytes]]:
    """Parse a query given as a string argument.

    Data are returned as a list of name, value pairs as bytes.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings. A
        true value indicates that blanks should be retained as blank
        strings. The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    """
    # This code is the same as Python 3's parse_qsl()
    # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
    # except that the unquote(s, encoding, errors) calls are replaced
    # with unquote_to_bytes(s)
    coerce_args = cast("Callable[..., tuple[str, Callable[..., bytes]]]", _coerce_args)
    qs, _coerce_result = coerce_args(qs)
    pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")]
    r = []
    for name_value in pairs:
        if not name_value:
            continue
        nv = name_value.split("=", 1)
        if len(nv) != 2:
            # Handle the case of a control-name with no equal sign
            if keep_blank_values:
                nv.append("")
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name: str | bytes = nv[0].replace("+", " ")
            name = unquote_to_bytes(name)
            name = _coerce_result(name)
            value: str | bytes = nv[1].replace("+", " ")
            value = unquote_to_bytes(value)
            value = _coerce_result(value)
            r.append((name, value))
    return r
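
# Illustrative usage (an addition, not part of the original module): values
# come back as raw bytes, so percent-encoded octets that are not valid
# UTF-8 survive intact, and blank values are dropped unless requested:
#
#     >>> parse_qsl_to_bytes('q=b%a3&empty=')
#     [(b'q', b'b\xa3')]
#     >>> parse_qsl_to_bytes('q=b%a3&empty=', keep_blank_values=True)
#     [(b'q', b'b\xa3'), (b'empty', b'')]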