"""Parse (absolute and relative) URLs.

The urlparse module is based upon the following RFC specifications:

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, and J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine,
June 1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, and
M. McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform to it. The urlparse module is currently not
entirely compliant with this RFC: to cover de facto parsing scenarios and for
backward compatibility, some parsing quirks from older RFCs are retained. The
test cases in test_urlparse.py provide a good indicator of parsing behavior.
"""

import re
import sys
import collections

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

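# Illustrative sketch (not part of the vendored module): typical use of the
# public API listed above, assuming the vendored file is importable as
# bleach._vendor.parse. The outputs shown are what the functions defined
# below are expected to produce.
#
#   >>> from bleach._vendor import parse
#   >>> parse.urlparse('http://example.com/a;p?q=1#frag')
#   ParseResult(scheme='http', netloc='example.com', path='/a', params='p', query='q=1', fragment='frag')
#   >>> parse.urlunparse(parse.urlparse('http://example.com/a;p?q=1#frag'))
#   'http://example.com/a;p?q=1#frag'
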
# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by “urlsplit” and “urlparse”.

uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Unsafe bytes to be removed per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()

# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'

def _noop(obj):
    return obj

def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    return obj.encode(encoding, errors)

def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    return tuple(x.decode(encoding, errors) if x else '' for x in args)

def _coerce_args(*args):
    # Invokes decode if necessary to create str args
    # and returns the coerced inputs along with
    # an appropriate result coercion function
    #  - noop for str inputs
    #  - encoding function otherwise
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)

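# Illustrative sketch (not part of the vendored module): how _coerce_args
# behaves for str and bytes inputs.
#
#   >>> _coerce_args('http://example.com', '')[-1] is _noop
#   True
#   >>> _coerce_args(b'http://example.com', b'')[:2]
#   ('http://example.com', '')
#   >>> _coerce_args('http://example.com', b'ftp')
#   Traceback (most recent call last):
#       ...
#   TypeError: Cannot mix str and non-str arguments
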
# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))

class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))

class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    @property
    def username(self):
        return self._userinfo[0]

    @property
    def password(self):
        return self._userinfo[1]

    @property
    def hostname(self):
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        port = self._hostinfo[1]
        if port is not None:
            port = int(port, 10)
            if not (0 <= port <= 65535):
                raise ValueError("Port out of range 0-65535")
        return port

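# Illustrative sketch (not part of the vendored module): the derived netloc
# attributes exposed by these mixins, reached through urlsplit() defined
# further down. Note that the IPv6 zone id keeps its case.
#
#   >>> r = urlsplit('http://user:pw@Example.COM:8042/over/there')
#   >>> r.username, r.password, r.hostname, r.port
#   ('user', 'pw', 'example.com', 8042)
#   >>> urlsplit('http://[FE80::1%25ETH0]/').hostname
#   'fe80::1%25ETH0'
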
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    __slots__ = ()

    @property
    def _userinfo(self):
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            hostname, _, port = bracketed.partition(']')
            _, _, port = port.partition(':')
        else:
            hostname, _, port = hostinfo.partition(':')
        if not port:
            port = None
        return hostname, port

class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    __slots__ = ()

    @property
    def _userinfo(self):
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            hostname, _, port = bracketed.partition(b']')
            _, _, port = port.partition(b':')
        else:
            hostname, _, port = hostinfo.partition(b':')
        if not port:
            port = None
        return hostname, port

from collections import namedtuple

_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the URL without the fragment identifier and the
fragment identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from the URL, which allows indirect
identification of a secondary resource by reference to a primary resource and
additional identifying information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, which contains non-hierarchical data that, along with
data in the path component, identifies a resource in the scope of the URI's
scheme and network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, which allows indirect identification of a secondary
resource by reference to a primary resource and additional identifying
information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for the last path element, used to dereference the URI in order to
provide access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()
    def geturl(self):
        if self.fragment:
            return self.url + '#' + self.fragment
        else:
            return self.url

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        return urlunsplit(self)

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        return urlunparse(self)

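# Illustrative sketch (not part of the vendored module): geturl() rebuilds a
# URL from a structured result, via the urlunsplit()/urlunparse()/urldefrag()
# helpers defined further down.
#
#   >>> urlsplit('https://example.com/path?q=1#top').geturl()
#   'https://example.com/path?q=1#top'
#   >>> urldefrag('https://example.com/path#top').geturl()
#   'https://example.com/path#top'
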
# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        if self.fragment:
            return self.url + b'#' + self.fragment
        else:
            return self.url

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        return urlunsplit(self)

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        return urlunparse(self)

# Set up the encode/decode result pairs
def _fix_result_transcoding():
    _result_pairs = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for _decoded, _encoded in _result_pairs:
        _decoded._encoded_counterpart = _encoded
        _encoded._decoded_counterpart = _decoded

_fix_result_transcoding()
del _fix_result_transcoding

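# Illustrative sketch (not part of the vendored module): the counterpart
# wiring above lets str results encode to their bytes twins and back.
#
#   >>> urlsplit('http://example.com/a').encode()
#   SplitResultBytes(scheme=b'http', netloc=b'example.com', path=b'/a', query=b'', fragment=b'')
#   >>> urlsplit(b'http://example.com/a').decode()
#   SplitResult(scheme='http', netloc='example.com', path='/a', query='', fragment='')
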
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)

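# Illustrative sketch (not part of the vendored module): urlparse() only
# splits off params for schemes listed in uses_params, and only from the
# last path segment.
#
#   >>> urlparse('http://example.com/a/b;type=x?q=1')
#   ParseResult(scheme='http', netloc='example.com', path='/a/b', params='type=x', query='q=1', fragment='')
#   >>> urlparse('http://example.com/a;p/b')
#   ParseResult(scheme='http', netloc='example.com', path='/a;p/b', params='', query='', fragment='')
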
def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def _checknetloc(netloc):
    if not netloc or not any(ord(c) > 127 for c in netloc):
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    n = netloc.replace('@', '')   # ignore characters already included
    n = n.replace(':', '')        # but not the surrounding text
    n = n.replace('#', '')
    n = n.replace('?', '')
    netloc2 = unicodedata.normalize('NFKC', n)
    if n == netloc2:
        return
    for c in '/?#@:':
        if c in netloc2:
            raise ValueError("netloc '" + netloc + "' contains invalid " +
                             "characters under NFKC normalization")

def _remove_unsafe_bytes_from_url(url):
    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
        url = url.replace(b, "")
    return url

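# Illustrative sketch (not part of the vendored module): tab, CR and LF are
# stripped before parsing (WHATWG behavior), and netlocs whose NFKC
# normalization would introduce URL delimiters are rejected.
#
#   >>> urlsplit('http://exam\nple.com/pa\tth')
#   SplitResult(scheme='http', netloc='example.com', path='/path', query='', fragment='')
#   >>> urlsplit('http://example\u2100.com')   # '\u2100' normalizes to 'a/c'
#   Traceback (most recent call last):
#       ...
#   ValueError: netloc 'example℀.com' contains invalid characters under NFKC normalization
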
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    url = _remove_unsafe_bytes_from_url(url)
    scheme = _remove_unsafe_bytes_from_url(scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            _checknetloc(netloc)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return _coerce_result(v)
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)

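# Illustrative sketch (not part of the vendored module): urlsplit() keeps
# params in the path, distinguishes a real scheme from a "path:port"-looking
# prefix, and caches results keyed on (url, scheme, allow_fragments, types).
#
#   >>> urlsplit('http://example.com/a;p?q=1#frag')
#   SplitResult(scheme='http', netloc='example.com', path='/a;p', query='q=1', fragment='frag')
#   >>> urlsplit('path:80')                 # all-digit "rest" => not a scheme
#   SplitResult(scheme='', netloc='', path='path:80', query='', fragment='')
#   >>> urlsplit('mailto:user@example.com')
#   SplitResult(scheme='mailto', netloc='', path='user@example.com', query='', fragment='')
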
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if params:
        url = "%s;%s" % (url, params)
    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)

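# Illustrative sketch (not part of the vendored module): rebuilding drops
# redundant empty delimiters, producing an equivalent (not identical) URL.
#
#   >>> urlunsplit(urlsplit('http://example.com/path?'))
#   'http://example.com/path'
#   >>> urlunsplit(('https', 'example.com', 'path', 'q=1', 'top'))
#   'https://example.com/path?q=1#top'
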
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # per RFC 3986, ignore the base path entirely when the relative path is
    # absolute (starts with '/')
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an
                # IndexError when popped from resolved_path if resolving
                # for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))

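# Illustrative sketch (not part of the vendored module): RFC 3986-style
# relative reference resolution.
#
#   >>> urljoin('http://a/b/c/d;p?q', 'g')
#   'http://a/b/c/g'
#   >>> urljoin('http://a/b/c/d;p?q', '../g')
#   'http://a/b/g'
#   >>> urljoin('http://a/b/c/d;p?q', '//other/x')
#   'http://other/x'
#   >>> urljoin('http://a/b/c/d;p?q', '?y')
#   'http://a/b/c/d;p?y'
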
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    else:
        frag = ''
        defrag = url
    return _coerce_result(DefragResult(defrag, frag))

_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        return string
    res = [bits[0]]
    append = res.append
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
                      for a in _hexdig for b in _hexdig}
    for item in bits[1:]:
        try:
            append(_hextobyte[item[:2]])
            append(item[2:])
        except KeyError:
            append(b'%')
            append(item)
    return b''.join(res)

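# Illustrative sketch (not part of the vendored module): percent escapes are
# decoded to raw bytes; malformed escapes are passed through untouched.
#
#   >>> unquote_to_bytes('abc%20def')
#   b'abc def'
#   >>> unquote_to_bytes('caf%C3%A9')
#   b'caf\xc3\xa9'
#   >>> unquote_to_bytes('100%zz')
#   b'100%zz'
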
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)

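# Illustrative sketch (not part of the vendored module): decoding escapes to
# text; with the default errors='replace' here, invalid UTF-8 sequences are
# replaced rather than raising.
#
#   >>> unquote('abc%20def')
#   'abc def'
#   >>> unquote('caf%C3%A9')
#   'café'
#   >>> unquote('%e9')          # invalid UTF-8 byte -> U+FFFD replacement
#   '�'
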
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None,
             separator='&'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, raise a ValueError if there are
            more than max_num_fields fields read by parse_qsl().

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a dictionary.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields, separator=separator)
    for name, value in pairs:
        if name in parsed_result:
            parsed_result[name].append(value)
        else:
            parsed_result[name] = [value]
    return parsed_result

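# Illustrative sketch (not part of the vendored module): repeated names are
# collected into lists; blank values are dropped unless requested.
#
#   >>> parse_qs('a=1&a=2&b=')
#   {'a': ['1', '2']}
#   >>> parse_qs('a=1&a=2&b=', keep_blank_values=True)
#   {'a': ['1', '2'], 'b': ['']}
#   >>> parse_qs('a=1;b=2', separator=';')
#   {'a': ['1'], 'b': ['2']}
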
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None,
              separator='&'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as blank
            strings.  The default false value indicates that blank values
            are to be ignored and treated as if they were not included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, raise a ValueError if there are
            more than max_num_fields fields read by parse_qsl().

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a list, as G-d intended.
    """
    qs, _coerce_result = _coerce_args(qs)

    if not separator or (not isinstance(separator, (str, bytes))):
        raise ValueError("Separator must be of type string or bytes.")

    # If max_num_fields is defined then check that the number of fields
    # is less than max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count(separator)
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    pairs = [s1 for s1 in qs.split(separator)]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r

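# Illustrative sketch (not part of the vendored module): pair order is
# preserved, '+' decodes to a space, and the parsing guards raise ValueError.
#
#   >>> parse_qsl('a=1&b=x+y%21')
#   [('a', '1'), ('b', 'x y!')]
#   >>> parse_qsl('a=1&junk&b=2', strict_parsing=True)
#   Traceback (most recent call last):
#       ...
#   ValueError: bad query field: 'junk'
#   >>> parse_qsl('a=1&b=2&c=3', max_num_fields=2)
#   Traceback (most recent call last):
#       ...
#   ValueError: Max number of fields exceeded
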
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)

_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res

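# Illustrative sketch (not part of the vendored module): a Quoter maps byte
# values to either the literal character (safe) or a %XX escape, memoizing
# each answer on first use.
#
#   >>> q = Quoter(b'/')
#   >>> q[ord('a')], q[ord('/')], q[ord(' ')]
#   ('a', '/', '%20')
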
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
        (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])

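# Illustrative sketch (not part of the vendored module): the quoting family.
# quote() leaves '/' alone by default; quote_plus() targets form encoding.
#
#   >>> quote('abc def/ghi')
#   'abc%20def/ghi'
#   >>> quote_plus('abc def/ghi')
#   'abc+def%2Fghi'
#   >>> quote('café')
#   'caf%C3%A9'
#   >>> quote_from_bytes(b'abc def?')
#   'abc%20def%3F'
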
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    l = []
    if not doseq:
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_via(k, safe)
            else:
                k = quote_via(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_via(v, safe)
            else:
                v = quote_via(str(v), safe, encoding, errors)
            l.append(k + '=' + v)
    else:
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_via(k, safe)
            else:
                k = quote_via(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_via(v, safe)
                l.append(k + '=' + v)
            elif isinstance(v, str):
                v = quote_via(v, safe, encoding, errors)
                l.append(k + '=' + v)
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_via(str(v), safe, encoding, errors)
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        if isinstance(elt, bytes):
                            elt = quote_via(elt, safe)
                        else:
                            elt = quote_via(str(elt), safe, encoding, errors)
                        l.append(k + '=' + elt)
    return '&'.join(l)

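# Illustrative sketch (not part of the vendored module): without doseq a
# list value is quoted wholesale; with doseq each element becomes its own
# field.
#
#   >>> urlencode({'q': 'x y', 'lang': 'en'})
#   'q=x+y&lang=en'
#   >>> urlencode({'k': [1, 2]})
#   'k=%5B1%2C+2%5D'
#   >>> urlencode({'k': [1, 2]}, doseq=True)
#   'k=1&k=2'
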
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:':
        url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    match = _typeprog.match(url)
    if match:
        scheme, data = match.groups()
        return scheme.lower(), data
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and path[0] != '/':
            path = '/' + path
        return host_port, path
    return None, url

def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    user, delim, host = host.rpartition('@')
    return (user if delim else None), host

def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    user, delim, passwd = user.partition(':')
    return user, (passwd if delim else None)

_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)

    match = _portprog.match(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None

def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    host, delim, port = host.rpartition(':')
    if not delim:
        host = port
    elif port:
        try:
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    path, delim, query = url.rpartition('?')
    if delim:
        return path, query
    return url, None

def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    path, delim, tag = url.rpartition('#')
    if delim:
        return path, tag
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    attr, delim, value = attr.partition('=')
    return attr, (value if delim else None)

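# Illustrative sketch (not part of the vendored module): the legacy split*
# helpers return None for missing pieces rather than empty strings.
#
#   >>> splittype('mailto:user@example.com')
#   ('mailto', 'user@example.com')
#   >>> splithost('//www.example.com:80/index.html')
#   ('www.example.com:80', '/index.html')
#   >>> splitport('example.com')
#   ('example.com', None)
#   >>> splitnport('example.com:8080')
#   ('example.com', 8080)
#   >>> splitquery('/path')
#   ('/path', None)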