1"""
2This module contains general purpose URL functions not found in the standard
3library.
4"""
6from __future__ import annotations
8import base64
9import codecs
10import os
11import posixpath
12import re
13import string
14from pathlib import Path
15from typing import TYPE_CHECKING, Callable, NamedTuple, cast, overload
16from urllib.parse import ( # type: ignore[attr-defined]
17 ParseResult,
18 _coerce_args,
19 parse_qs,
20 parse_qsl,
21 quote,
22 unquote,
23 unquote_to_bytes,
24 urldefrag,
25 urlencode,
26 urlparse,
27 urlsplit,
28 urlunparse,
29 urlunsplit,
30)
31from urllib.request import pathname2url, url2pathname
33from ._infra import _ASCII_TAB_OR_NEWLINE, _C0_CONTROL_OR_SPACE
34from ._url import _SPECIAL_SCHEMES
35from .util import to_unicode
37if TYPE_CHECKING:
38 from collections.abc import Sequence
40 from ._types import AnyUnicodeError
43# error handling function for bytes-to-Unicode decoding errors with URLs
44def _quote_byte(error: UnicodeError) -> tuple[str, int]:
45 error = cast("AnyUnicodeError", error)
46 return (to_unicode(quote(error.object[error.start : error.end])), error.end)
49codecs.register_error("percentencode", _quote_byte)
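
# Illustrative behaviour of the handler registered above (a sketch; the exact
# escapes depend on the offending bytes):
#
#   >>> b"a\xa3b".decode("utf8", errors="percentencode")
#   'a%A3b'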

# constants from RFC 3986, Section 2.2 and 2.3
RFC3986_GEN_DELIMS = b":/?#[]@"
RFC3986_SUB_DELIMS = b"!$&'()*+,;="
RFC3986_RESERVED = RFC3986_GEN_DELIMS + RFC3986_SUB_DELIMS
RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode("ascii")
EXTRA_SAFE_CHARS = b"|"  # see https://github.com/scrapy/w3lib/pull/25

RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":"
_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b"%"
_path_safe_chars = _safe_chars.replace(b"#", b"")

# Characters that are safe in all of:
#
# - RFC 2396 + RFC 2732, as interpreted by Java 8’s java.net.URI class
# - RFC 3986
# - The URL living standard
#
# NOTE: % is currently excluded from these lists of characters, due to
# limitations of the current safe_url_string implementation, but it should also
# be escaped as %25 when it is not already being used as part of an escape
# character.
_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, delete=b":;=")
_PATH_SAFEST_CHARS = _safe_chars.translate(None, delete=b"#[]|")
_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS
_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, delete=b"'")
_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS


_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE = {
    ord(char): None for char in _ASCII_TAB_OR_NEWLINE
}


def _strip(url: str) -> str:
    return url.strip(_C0_CONTROL_OR_SPACE).translate(
        _ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE
    )
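
# A sketch of what _strip() does, following the URL living standard: strip
# leading/trailing C0 controls and spaces, and remove ASCII tab/newline
# characters anywhere in the URL:
#
#   >>> _strip(" http://exam\nple.com ")
#   'http://example.com'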


def safe_url_string(  # pylint: disable=too-many-locals
    url: str | bytes,
    encoding: str = "utf8",
    path_encoding: str = "utf8",
    quote_path: bool = True,
) -> str:
    """Return a URL equivalent to *url* that a wide range of web browsers and
    web servers consider valid.

    *url* is parsed according to the rules of the `URL living standard`_,
    and during serialization additional characters are percent-encoded to make
    the URL valid by additional URL standards.

    .. _URL living standard: https://url.spec.whatwg.org/

    The returned URL should be valid by *all* of the following URL standards
    known to be enforced by modern-day web browsers and web servers:

    - `URL living standard`_

    - `RFC 3986`_

    - `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI
      class`_.

    .. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html
    .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt
    .. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt
    .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt

    If a bytes URL is given, it is first converted to str using the given
    encoding (which defaults to 'utf-8'). If *quote_path* is True (the
    default), the URL path component is encoded with *path_encoding*
    ('utf-8' by default) and then quoted; if *quote_path* is False, the
    path component is neither encoded nor quoted. The given *encoding* is
    used for the query string and form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.
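
    For example (non-ASCII characters are percent-encoded, using
    *path_encoding* for the path):

    >>> safe_url_string('http://www.example.com/£')
    'http://www.example.com/%C2%A3'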
132 """
133 # urlsplit() chokes on bytes input with non-ASCII chars,
134 # so let's decode (to Unicode) using page encoding:
135 # - it is assumed that a raw bytes input comes from a document
136 # encoded with the supplied encoding (or UTF8 by default)
137 # - if the supplied (or default) encoding chokes,
138 # percent-encode offending bytes
139 decoded = to_unicode(url, encoding=encoding, errors="percentencode")
140 parts = urlsplit(_strip(decoded))
142 username, password, hostname, port = (
143 parts.username,
144 parts.password,
145 parts.hostname,
146 parts.port,
147 )
148 netloc_bytes = b""
149 if username is not None or password is not None:
150 if username is not None:
151 safe_username = quote(unquote(username), _USERINFO_SAFEST_CHARS)
152 netloc_bytes += safe_username.encode(encoding)
153 if password is not None:
154 netloc_bytes += b":"
155 safe_password = quote(unquote(password), _USERINFO_SAFEST_CHARS)
156 netloc_bytes += safe_password.encode(encoding)
157 netloc_bytes += b"@"
158 if hostname is not None:
159 try:
160 netloc_bytes += hostname.encode("idna")
161 except UnicodeError:
162 # IDNA encoding can fail for too long labels (>63 characters) or
163 # missing labels (e.g. http://.example.com)
164 netloc_bytes += hostname.encode(encoding)
165 if port is not None:
166 netloc_bytes += b":"
167 netloc_bytes += str(port).encode(encoding)
169 netloc = netloc_bytes.decode()
171 # default encoding for path component SHOULD be UTF-8
172 if quote_path:
173 path = quote(parts.path.encode(path_encoding), _PATH_SAFEST_CHARS)
174 else:
175 path = parts.path
177 if parts.scheme in _SPECIAL_SCHEMES:
178 query = quote(parts.query.encode(encoding), _SPECIAL_QUERY_SAFEST_CHARS)
179 else:
180 query = quote(parts.query.encode(encoding), _QUERY_SAFEST_CHARS)
182 return urlunsplit(
183 (
184 parts.scheme,
185 netloc,
186 path,
187 query,
188 quote(parts.fragment.encode(encoding), _FRAGMENT_SAFEST_CHARS),
189 )
190 )


_parent_dirs = re.compile(r"/?(\.\./)+")


def safe_download_url(
    url: str | bytes, encoding: str = "utf8", path_encoding: str = "utf8"
) -> str:
    """Make a URL for download. This will call safe_url_string
    and then strip the fragment, if one exists. The path will
    be normalised.

    If the path is outside the document root, it will be changed
    to be within the document root.
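
    For example (the fragment is stripped and parent-directory segments
    are collapsed):

    >>> safe_download_url('http://www.example.org/../page.html#fragment')
    'http://www.example.org/page.html'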
205 """
206 safe_url = safe_url_string(url, encoding, path_encoding)
207 scheme, netloc, path, query, _ = urlsplit(safe_url)
208 if path:
209 path = _parent_dirs.sub("", posixpath.normpath(path))
210 if safe_url.endswith("/") and not path.endswith("/"):
211 path += "/"
212 else:
213 path = "/"
214 return urlunsplit((scheme, netloc, path, query, ""))


def is_url(text: str) -> bool:
    return text.partition("://")[0] in ("file", "http", "https")
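
# Illustrative behaviour of is_url(): only the file, http and https schemes
# qualify:
#
#   >>> is_url("http://example.com")
#   True
#   >>> is_url("ftp://example.com")
#   False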


@overload
def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: None = None,
    keep_blank_values: bool | int = 0,
) -> str | None: ...


@overload
def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: str,
    keep_blank_values: bool | int = 0,
) -> str: ...


def url_query_parameter(
    url: str | bytes,
    parameter: str,
    default: str | None = None,
    keep_blank_values: bool | int = 0,
) -> str | None:
    """Return the value of a URL parameter, given the URL and parameter name.

    General case:

    >>> import w3lib.url
    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "id")
    '200'
    >>>

    Return a default value if the parameter is not found:

    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
    'mydefault'
    >>>

    Returns None if `keep_blank_values` is not set, or is 0 (the default):

    >>> w3lib.url.url_query_parameter("product.html?id=", "id")
    >>>

    Returns an empty string if `keep_blank_values` is set to 1:

    >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1)
    ''
    >>>

    """

    queryparams = parse_qs(
        urlsplit(str(url))[3], keep_blank_values=bool(keep_blank_values)
    )
    if parameter in queryparams:
        return queryparams[parameter][0]
    return default


def url_query_cleaner(
    url: str | bytes,
    parameterlist: str | bytes | Sequence[str | bytes] = (),
    sep: str = "&",
    kvsep: str = "=",
    remove: bool = False,
    unique: bool = True,
    keep_fragments: bool = False,
) -> str:
    """Clean URL arguments, leaving only those passed in the parameterlist
    and keeping their order.

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys:

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**:

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'

    """

    if isinstance(parameterlist, (str, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    url = cast("str", url)
    fragment = cast("str", fragment)
    base, _, query = url.partition("?")
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        if not ksv:
            continue
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        if remove and k in parameterlist:
            continue
        if not remove and k not in parameterlist:
            continue
        querylist.append(ksv)
        seen.add(k)
    url = "?".join([base, sep.join(querylist)]) if querylist else base
    if keep_fragments and fragment:
        url += "#" + fragment
    return url


def _add_or_replace_parameters(url: str, params: dict[str, str]) -> str:
    parsed = urlsplit(url)
    current_args = parse_qsl(parsed.query, keep_blank_values=True)

    new_args = []
    seen_params = set()
    for name, value in current_args:
        if name not in params:
            new_args.append((name, value))
        elif name not in seen_params:
            new_args.append((name, params[name]))
            seen_params.add(name)

    not_modified_args = [
        (name, value) for name, value in params.items() if name not in seen_params
    ]
    new_args += not_modified_args

    query = urlencode(new_args)
    return urlunsplit(parsed._replace(query=query))


def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
    """Add or replace a parameter in the given URL.

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php', 'arg', 'v')
    'http://www.example.com/index.php?arg=v'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg4', 'v4')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg3', 'v3new')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new'
    >>>

    """
    return _add_or_replace_parameters(url, {name: new_value})


def add_or_replace_parameters(url: str, new_parameters: dict[str, str]) -> str:
    """Add or replace parameters in the given URL.

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php', {'arg': 'v'})
    'http://www.example.com/index.php?arg=v'
    >>> args = {'arg4': 'v4', 'arg3': 'v3new'}
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', args)
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new&arg4=v4'
    >>>

    """
    return _add_or_replace_parameters(url, new_parameters)


def path_to_file_uri(path: str | os.PathLike[str]) -> str:
    """Convert local filesystem path to legal File URIs as described in:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    x = pathname2url(str(Path(path).absolute()))
    return f"file:///{x.lstrip('/')}"
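
# Illustrative result on POSIX (the exact output is platform-dependent):
#
#   >>> path_to_file_uri('/tmp/test.txt')
#   'file:///tmp/test.txt'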


def file_uri_to_path(uri: str) -> str:
    """Convert File URI to local filesystem path according to:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    uri_path = urlparse(uri).path
    return url2pathname(uri_path)
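
# Illustrative round trip with path_to_file_uri() on POSIX:
#
#   >>> file_uri_to_path('file:///tmp/test.txt')
#   '/tmp/test.txt'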


def any_to_uri(uri_or_path: str) -> str:
    """If given a path name, return its File URI, otherwise return it
    unmodified
    """
    if os.path.splitdrive(uri_or_path)[0]:
        return path_to_file_uri(uri_or_path)
    u = urlparse(uri_or_path)
    return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)
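
# Illustrative behaviour on POSIX: URIs pass through unchanged, bare paths
# become file URIs:
#
#   >>> any_to_uri('http://www.example.com/some/path')
#   'http://www.example.com/some/path'
#   >>> any_to_uri('/tmp/test.txt')
#   'file:///tmp/test.txt'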


# ASCII characters.
_char = set(map(chr, range(127)))

# RFC 2045 token.
_token = r"[{}]+".format(
    re.escape(
        "".join(
            _char
            -
            # Control characters.
            set(map(chr, range(32)))
            -
            # tspecials and space.
            set('()<>@,;:\\"/[]?= ')
        )
    )
)

# RFC 822 quoted-string, without surrounding quotation marks.
_quoted_string = r"(?:[{}]|(?:\\[{}]))*".format(
    re.escape("".join(_char - {'"', "\\", "\r"})), re.escape("".join(_char))
)

# Encode the regular expression strings to make them into bytes, as Python 3
# bytes have no format() method, but bytes must be passed to re.compile() in
# order to make a pattern object that can be used to match on bytes.

# RFC 2397 mediatype.
_mediatype_pattern = re.compile(rf"{_token}/{_token}".encode())
_mediatype_parameter_pattern = re.compile(
    rf';({_token})=(?:({_token})|"({_quoted_string})")'.encode()
)
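
# A sketch of how these byte patterns match:
#
#   >>> _mediatype_pattern.match(b"text/plain").group()
#   b'text/plain'
#   >>> _mediatype_parameter_pattern.match(b';charset=utf-8').groups()
#   (b'charset', b'utf-8', None)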


class ParseDataURIResult(NamedTuple):
    """Named tuple returned by :func:`parse_data_uri`."""

    #: MIME type and subtype, separated by / (e.g. ``"text/plain"``).
    media_type: str
    #: MIME type parameters (e.g. ``{"charset": "US-ASCII"}``).
    media_type_parameters: dict[str, str]
    #: Data, decoded if it was encoded in base64 format.
    data: bytes


def parse_data_uri(uri: str | bytes) -> ParseDataURIResult:
    """Parse a data: URI into :class:`ParseDataURIResult`."""
    if not isinstance(uri, bytes):
        uri = safe_url_string(uri).encode("ascii")

    try:
        scheme, uri = uri.split(b":", 1)
    except ValueError:
        raise ValueError("invalid URI")
    if scheme.lower() != b"data":
        raise ValueError("not a data URI")

    # RFC 3986 section 2.1 allows percent encoding to escape characters that
    # would be interpreted as delimiters, implying that actual delimiters
    # should not be percent-encoded.
    # Decoding before parsing will allow malformed URIs with percent-encoded
    # delimiters, but it makes parsing easier and should not affect
    # well-formed URIs, as the delimiters used in this URI scheme are not
    # allowed, percent-encoded or not, in tokens.
    uri = unquote_to_bytes(uri)

    media_type = "text/plain"
    media_type_params = {}

    m = _mediatype_pattern.match(uri)
    if m:
        media_type = m.group().decode()
        uri = uri[m.end() :]
    else:
        media_type_params["charset"] = "US-ASCII"

    while True:
        m = _mediatype_parameter_pattern.match(uri)
        if m:
            attribute, value, value_quoted = m.groups()
            if value_quoted:
                value = re.sub(rb"\\(.)", rb"\1", value_quoted)
            media_type_params[attribute.decode()] = value.decode()
            uri = uri[m.end() :]
        else:
            break

    try:
        is_base64, data = uri.split(b",", 1)
    except ValueError:
        raise ValueError("invalid data URI")
    if is_base64:
        if is_base64 != b";base64":
            raise ValueError("invalid data URI")
        data = base64.b64decode(data)

    return ParseDataURIResult(media_type, media_type_params, data)
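
# Illustrative examples, following the parsing logic above:
#
#   >>> parse_data_uri("data:,Hello")
#   ParseDataURIResult(media_type='text/plain', media_type_parameters={'charset': 'US-ASCII'}, data=b'Hello')
#   >>> parse_data_uri("data:text/plain;base64,SGVsbG8=").data
#   b'Hello'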


__all__ = [
    "add_or_replace_parameter",
    "add_or_replace_parameters",
    "any_to_uri",
    "canonicalize_url",
    "file_uri_to_path",
    "is_url",
    "parse_data_uri",
    "path_to_file_uri",
    "safe_download_url",
    "safe_url_string",
    "url_query_cleaner",
    "url_query_parameter",
]


def _safe_ParseResult(
    parts: ParseResult, encoding: str = "utf8", path_encoding: str = "utf8"
) -> tuple[str, str, str, str, str, str]:
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode("idna").decode()
    except UnicodeError:
        netloc = parts.netloc

    return (
        parts.scheme,
        netloc,
        quote(parts.path.encode(path_encoding), _path_safe_chars),
        quote(parts.params.encode(path_encoding), _safe_chars),
        quote(parts.query.encode(encoding), _safe_chars),
        quote(parts.fragment.encode(encoding), _safe_chars),
    )


def canonicalize_url(
    url: str | bytes | ParseResult,
    keep_blank_values: bool = True,
    keep_fragments: bool = False,
    encoding: str | None = None,
) -> str:
    r"""Canonicalize the given url by applying the following procedures:

    - make the URL safe
    - sort query arguments, first by key, then by value
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless `keep_blank_values` is True)
    - remove fragments (unless `keep_fragments` is True)

    The url passed can be bytes or str, while the url returned is
    always str.

    >>> import w3lib.url
    >>>
    >>> # sorting query arguments
    >>> w3lib.url.canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50')
    'http://www.example.com/do?a=50&b=2&b=5&c=3'
    >>>
    >>> # UTF-8 conversion + percent-encoding of non-ASCII characters
    >>> w3lib.url.canonicalize_url('http://www.example.com/r\u00e9sum\u00e9')
    'http://www.example.com/r%C3%A9sum%C3%A9'
    >>>

    For more examples, see the tests in `tests/test_url.py`.
    """
    # If the supplied `encoding` is not compatible with all characters in
    # `url`, fall back to UTF-8 as a safety net. UTF-8 can handle all Unicode
    # characters, so we should be covered regarding URL normalization, if not
    # for the proper URL expected by the remote website.
    if isinstance(url, str):
        url = _strip(url)
    try:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding=encoding or "utf8"
        )
    except UnicodeEncodeError:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding="utf8"
        )

    # 1. decode query-string as UTF-8 (or keep raw bytes),
    #    sort values,
    #    and percent-encode them back

    # Python's urllib.parse.parse_qsl does not work as wanted
    # for percent-encoded characters that do not match passed encoding,
    # they get lost.
    #
    # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
    # (i.e. with 'REPLACEMENT CHARACTER' (U+FFFD),
    # instead of \xa3 that you get with Python 2's parse_qsl)
    #
    # what we want here is to keep raw bytes, and percent-encode them
    # so as to preserve whatever encoding was originally used.
    #
    # See https://tools.ietf.org/html/rfc3987#section-6.4:
    #
    # For example, it is possible to have a URI reference of
    # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
    # document name is encoded in iso-8859-1 based on server settings, but
    # where the fragment identifier is encoded in UTF-8 according to
    # [XPointer]. The IRI corresponding to the above URI would be (in XML
    # notation)
    # "http://www.example.org/r%E9sum%E9.xml#résumé".
    # Similar considerations apply to query parts. The functionality of
    # IRIs (namely, to be able to include non-ASCII characters) can only be
    # used if the query part is encoded in UTF-8.
    keyvals = parse_qsl_to_bytes(query, keep_blank_values)

    keyvals.sort()
    query = urlencode(keyvals)

    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
    #    and percent-encode path again (this normalizes to upper-case %XX)
    uqp = _unquotepath(path)
    path = quote(uqp, _path_safe_chars) or "/"

    fragment = "" if not keep_fragments else fragment

    # Apply lowercase to the domain, but not to the userinfo.
    netloc_parts = netloc.split("@")
    netloc_parts[-1] = netloc_parts[-1].lower().rstrip(":")
    netloc = "@".join(netloc_parts)

    # every part should be safe already
    return urlunparse((scheme, netloc, path, params, query, fragment))


def _unquotepath(path: str) -> bytes:
    for reserved in ("2f", "2F", "3f", "3F"):
        path = path.replace("%" + reserved, "%25" + reserved.upper())

    # standard lib's unquote() does not work for non-UTF-8
    # percent-escaped characters, they get lost.
    # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
    #
    # unquote_to_bytes() returns raw bytes instead
    return unquote_to_bytes(path)
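
# A sketch of _unquotepath() behaviour: %2F/%3F escapes are kept escaped
# (re-encoded as %252F/%253F before unquoting), everything else becomes raw
# bytes:
#
#   >>> _unquotepath('/a%2fb%a3')
#   b'/a%2Fb\xa3'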


def parse_url(
    url: str | bytes | ParseResult, encoding: str | None = None
) -> ParseResult:
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))


def parse_qsl_to_bytes(
    qs: str, keep_blank_values: bool = False
) -> list[tuple[bytes, bytes]]:
    """Parse a query given as a string argument.

    Data are returned as a list of name, value pairs as bytes.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings. A
        true value indicates that blanks should be retained as blank
        strings. The default false value indicates that blank values
        are to be ignored and treated as if they were not included.
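
    Illustrative example:

    >>> parse_qsl_to_bytes('a=1&b=2')
    [(b'a', b'1'), (b'b', b'2')]

    Unlike urllib.parse.parse_qsl, percent-escaped octets that are not valid
    in the assumed encoding (e.g. '%a3') are preserved as raw bytes rather
    than replaced with U+FFFD.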
696 """
697 # This code is the same as Python3's parse_qsl()
698 # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
699 # except for the unquote(s, encoding, errors) calls replaced
700 # with unquote_to_bytes(s)
701 coerce_args = cast("Callable[..., tuple[str, Callable[..., bytes]]]", _coerce_args)
702 qs, _coerce_result = coerce_args(qs)
703 pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")]
704 r = []
705 for name_value in pairs:
706 if not name_value:
707 continue
708 nv = name_value.split("=", 1)
709 if len(nv) != 2:
710 # Handle case of a control-name with no equal sign
711 if keep_blank_values:
712 nv.append("")
713 else:
714 continue
715 if len(nv[1]) or keep_blank_values:
716 name: str | bytes = nv[0].replace("+", " ")
717 name = unquote_to_bytes(name)
718 name = _coerce_result(name)
719 value: str | bytes = nv[1].replace("+", " ")
720 value = unquote_to_bytes(value)
721 value = _coerce_result(value)
722 r.append((name, value))
723 return r