1"""
2Ported using Python-Future from the Python 3.3 standard library.
3
4Parse (absolute and relative) URLs.
5
6urlparse module is based upon the following RFC specifications.
7
8RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
9and L. Masinter, January 2005.
10
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
13
14RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
15Berners-Lee, R. Fielding, and L. Masinter, August 1998.
16
17RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
18
19RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
201995.
21
22RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
23McCahill, December 1994
24
RFC 3986 is considered the current standard and any future changes to
the urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
31"""
32from __future__ import absolute_import, division, unicode_literals
33from future.builtins import bytes, chr, dict, int, range, str
34from future.utils import raise_with_traceback
35
36import re
37import sys
38import collections
39
# Names exported by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)

# Consulted by urljoin(): a reference is only resolved against the base
# when the scheme is listed here; otherwise the second URL is returned as is.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
# Consulted by urlunsplit() (emit '//' even for an empty netloc) and by
# urljoin() (inherit the base netloc when the reference has none).
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
# Consulted by urlparse(): only for these schemes is ';params' split
# off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
# (urlsplit() only treats the text before the first ':' as a scheme when
# every character of it appears in this string.)
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
# urlsplit() memoizes its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is emptied wholesale via clear_cache().
MAX_CACHE_SIZE = 20
_parse_cache = {}
77
def clear_cache():
    """Empty the URL parse cache and the cache of safe-character quoters."""
    # Same order as always: parse results first, then the quoters.
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
82
83
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
#
# Codec and error policy used as the default arguments of
# _encode_result() and _decode_args() below.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
92
def _noop(obj):
    """Identity function: hand *obj* back untouched.

    _coerce_args() returns this as the result-coercion callable when the
    caller's arguments were already str, so no re-encoding is needed.
    """
    return obj
95
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode *obj* via its own ``encode`` method and return the result.

    *obj* may be a str or one of the result tuples defined in this module
    (those provide an ``encode`` that converts every field).
    """
    encoded = obj.encode(encoding, errors)
    return encoded
99
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Return a tuple with every item of *args* decoded to str.

    Falsy items (empty bytes, None, ...) are not decoded; they become ''.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
103
def _coerce_args(*args):
    """Normalize the arguments to str and say how to convert results back.

    Returns the (possibly decoded) arguments followed by one extra item, a
    callable to apply to the result: ``_noop`` when the first argument is a
    str, ``_encode_result`` otherwise.

    Raises TypeError when a non-empty later argument disagrees with the
    first one about being a str.
    """
    first_is_str = isinstance(args[0], str)
    for other in args[1:]:
        # Empty values are exempt so that a "scheme=''" default argument
        # can be combined with non-str input.
        if not other:
            continue
        if isinstance(other, str) != first_is_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
119
120# Result objects are more helpful than simple tuples
# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin for str-valued result tuples: adds conversion to bytes."""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and build the ``_encoded_counterpart`` type.

        ``_encoded_counterpart`` is not defined here; it is looked up on
        the instance/class at call time.
        """
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
127
128
class _ResultMixinBytes(object):
    """Mixin for bytes-valued result tuples: adds conversion to str."""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and build the ``_decoded_counterpart`` type.

        ``_decoded_counterpart`` is not defined here; it is looked up on
        the instance/class at call time.
        """
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
135
136
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    Subclasses must provide two attributes/properties:
      * ``_userinfo`` -- a ``(username, password)`` pair
      * ``_hostinfo`` -- a ``(hostname, port)`` pair, port still textual
    """
    __slots__ = ()

    @property
    def username(self):
        """First element of ``_userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second element of ``_userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        return hostname.lower()

    @property
    def port(self):
        """Port as an int, or None when absent, empty or out of range.

        Raises ValueError when the port text is not a base-10 integer.
        """
        port = self._hostinfo[1]
        if not port:
            # Either there was no ':' at all (None) or the netloc ended in a
            # bare ':' ("host:"), which leaves an empty port string.  The
            # latter used to reach int('') and raise ValueError.
            return None
        port = int(port, 10)
        # Return None on an illegal port
        if not (0 <= port <= 65535):
            return None
        return port
167
168
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Splits a str ``netloc`` into user-info and host-info pairs."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the text before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the text after the last '@'.

        A bracketed host ("[...]") is returned without its brackets.  The
        port is the raw text after ':' or None when there is no ':'.
        """
        host_part = self.netloc.rpartition('@')[2]
        _, bracket, inside = host_part.partition('[')
        if bracket:
            # Only a ':' that follows the closing bracket introduces a port.
            hostname, _, tail = inside.partition(']')
            _, colon, port = tail.partition(':')
        else:
            hostname, colon, port = host_part.partition(':')
        return hostname, (port if colon else None)
197
198
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Splits a bytes ``netloc`` into user-info and host-info pairs."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the bytes before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the bytes after the last b'@'.

        A bracketed host (b"[...]") is returned without its brackets.  The
        port is the raw bytes after b':' or None when there is no b':'.
        """
        host_part = self.netloc.rpartition(b'@')[2]
        _, bracket, inside = host_part.partition(b'[')
        if bracket:
            # Only a b':' that follows the closing bracket introduces a port.
            hostname, _, tail = inside.partition(b']')
            _, colon, port = tail.partition(b':')
        else:
            hostname, colon, port = host_part.partition(b':')
        return hostname, (port if colon else None)
227
228
from collections import namedtuple

# Plain tuple layouts shared by the str and the bytes flavour of each
# result type; the concrete classes below combine them with a mixin.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
239
240# Structured result objects for string data
# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str result of urldefrag(): ``(url, fragment)``."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with '#fragment' appended when one is present."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
248
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str result of urlsplit(): five components plus netloc accessors."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components with urlunsplit()."""
        reassembled = urlunsplit(self)
        return reassembled
253
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str result of urlparse(): six components plus netloc accessors."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components with urlunparse()."""
        reassembled = urlunparse(self)
        return reassembled
258
259# Structured result objects for bytes data
# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes counterpart of DefragResult: ``(url, fragment)``."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with b'#fragment' appended when one is present."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
267
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes counterpart of SplitResult."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components with urlunsplit()."""
        reassembled = urlunsplit(self)
        return reassembled
272
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes counterpart of ParseResult."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components with urlunparse()."""
        reassembled = urlunparse(self)
        return reassembled
277
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link each str result class with its bytes twin.

    The str class learns its ``_encoded_counterpart`` and the bytes class
    its ``_decoded_counterpart``; the encode()/decode() mixin methods rely
    on these attributes.
    """
    for str_cls, bytes_cls in (
            (DefragResult, DefragResultBytes),
            (SplitResult, SplitResultBytes),
            (ParseResult, ParseResultBytes)):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# One-shot initialisation; the helper is removed from the namespace after.
_fix_result_transcoding()
del _fix_result_transcoding
291
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    The components are not broken up further (netloc stays a single
    string) and % escapes are not expanded."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    # Parameters are only recognised for schemes listed in uses_params.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))
307
def _splitparams(url):
    """Split *url* (a path) into ``(path, params)`` at a ';'.

    When the path contains a '/', only a ';' at or after the last '/' is
    considered, and ``(url, '')`` is returned if there is none.  Without
    any '/', the first ';' is used; the caller is expected to have checked
    that one exists.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    else:
        semi = url.find(';')
    return url[:semi], url[semi + 1:]
316
def _splitnetloc(url, start=0):
    """Return ``(netloc, rest)`` for *url*, scanning from index *start*.

    The netloc runs from *start* up to the earliest '/', '?' or '#' found
    at or after *start* (or to the end of *url* when none occurs); *rest*
    is everything from that delimiter on.
    """
    hits = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    end = min(hits) if hits else len(url)
    return url[start:end], url[end:]
324
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is the default used when the URL itself carries none.  When
    *allow_fragments* is false, '#' is not treated as a delimiter and the
    fragment component is always empty.

    Raises ValueError("Invalid IPv6 URL") when the netloc contains only
    one of '[' and ']'.  Results are memoized in the module-level
    _parse_cache."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    # Normalize so that truthy/falsy variants share one cache entry.
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # NOTE: unlike the general path below, this branch does not
            # check whether the text after ':' looks like a port number.
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # Brackets must come in pairs.
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return _coerce_result(v)
        # The text before ':' is a scheme only if every character is a
        # valid scheme character (the for/else runs when no break occurs).
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Brackets must come in pairs.
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    # The fragment is split off first, so a '?' inside it stays in it.
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
380
def urlunparse(components):
    """Rebuild a URL from the six components produced by urlparse().

    The result may differ slightly from the URL originally parsed while
    remaining equivalent, e.g. when that URL had redundant delimiters such
    as a '?' followed by an empty query."""
    (scheme, netloc, path, params, query, fragment,
     _coerce_result) = _coerce_args(*components)
    if params:
        # Fold the parameters back into the path before delegating.
        path = "%s;%s" % (path, params)
    joined = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(joined)
391
def urlunsplit(components):
    """Rebuild a URL from the five components produced by urlsplit().

    *components* may be any five-item iterable.  The result may differ
    slightly from the URL originally parsed while remaining equivalent,
    for instance when that URL had unnecessary delimiters such as a '?'
    followed by an empty query."""
    (scheme, netloc, url, query, fragment,
     _coerce_result) = _coerce_args(*components)
    # An authority part is written when there is a netloc, or when the
    # scheme is one that uses a netloc and the path does not already start
    # with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and not url.startswith('/'):
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return _coerce_result(url)
410
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    *allow_fragments* is forwarded to urlparse() for both URLs."""
    if not base:
        return url
    if not url:
        return base
    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for the reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that has no relative form: the reference
    # stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        # A reference with its own netloc replaces the base from there on.
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc
    # Absolute path: nothing of the base path is kept.
    if path[:1] == '/':
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    # No path and no params: reuse the base path/params, and the base
    # query too unless the reference has one.
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    # Relative path: drop the last segment of the base path and append
    # the segments of the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment (keeps the trailing slash).
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove the first "<name>/.." pair found among the inner
    # segments (name being neither '' nor '..') until none is left.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # The inner loop never looks at the last segment, so a trailing '..'
    # is handled separately here.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments),
                                      params, query, fragment)))
463
def urldefrag(url):
    """Split the fragment off *url*.

    Returns a (defragmented URL, fragment) result tuple.  When *url* has
    no '#', the URL is returned as given and the fragment is the empty
    string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, fragment = urlparse(url)
    # Re-assemble everything except the fragment.
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(stripped, fragment))
479
_hexdig = '0123456789ABCDEFabcdef'
# Lookup table: two hex digits (as bytes, any letter case) -> the byte
# they denote.
_hextobyte = dict(((hi + lo).encode(), bytes([int(hi + lo, 16)]))
                  for hi in _hexdig for lo in _hexdig)

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: str input is encoded as UTF-8.  That only matters when it
    # contains unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Bare attribute access: raises for objects that are not
        # string-like (e.g. None) instead of silently returning b''.
        string.split
        return bytes(b'')
    if isinstance(string, str):
        string = string.encode('utf-8')
    ### For Python-Future:
    # Already a byte-string, but force it to be newbytes here on Py2:
    string = bytes(string)
    ###
    pieces = string.split(b'%')
    if len(pieces) == 1:
        return string
    out = [pieces[0]]
    for piece in pieces[1:]:
        decoded = _hextobyte.get(piece[:2])
        if decoded is None:
            # Not a valid %xx escape: keep the '%' and the text verbatim.
            out.append(b'%')
            out.append(piece)
        else:
            out.append(decoded)
            out.append(piece[2:])
    return bytes(b'').join(out)
512
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    *encoding* and *errors* say how the percent-encoded byte sequences are
    decoded into Unicode characters, with the meaning they have for
    bytes.decode().  The defaults decode as UTF-8 and substitute a
    placeholder character for invalid sequences; passing None for either
    selects its default.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        # Bare attribute access: rejects objects that are not string-like.
        string.split
        return string
    encoding = 'utf-8' if encoding is None else encoding
    errors = 'replace' if errors is None else errors
    # Splitting on a capturing group alternates non-ASCII text (even
    # indexes) with ASCII runs (odd indexes); only the latter can hold
    # escapes.
    chunks = _asciire.split(string)
    out = [chunks[0]]
    for ascii_run, other in zip(chunks[1::2], chunks[2::2]):
        out.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        out.append(other)
    return ''.join(out)
539
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping names to lists of values.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: when true, fields with a blank value are kept and
        reported with an empty string; when false (the default) they are
        dropped as if they had not been present.

    strict_parsing: when true, malformed fields raise ValueError; when
        false (the default) they are silently ignored.

    encoding and errors: how percent-encoded sequences are decoded into
        Unicode characters, as accepted by the bytes.decode() method.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors):
        # Values of a repeated name are collected in order of appearance.
        grouped.setdefault(name, []).append(value)
    return grouped
571
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: when true, fields with a blank value are kept and
        reported with an empty string; when false (the default) they are
        dropped as if they had not been present.

    strict_parsing: when true, malformed fields raise ValueError; when
        false (the default) they are silently ignored.

    encoding and errors: how percent-encoded sequences are decoded into
        Unicode characters, as accepted by the bytes.decode() method.

    Fields are separated by '&' or ';'.  Pairs are returned in the order
    in which they appear in *qs*.
    """
    qs, _coerce_result = _coerce_args(qs)
    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not field and not strict_parsing:
                continue
            name, equals, value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name without an equal sign only counts when
                # blank values are wanted (its value is then '').
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                name = unquote(name.replace('+', ' '),
                               encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = unquote(value.replace('+', ' '),
                                encoding=encoding, errors=errors)
                value = _coerce_result(value)
                result.append((name, value))
    return result
619
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are turned into spaces first, as
    needed when unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
628
# Byte values that are never percent-encoded, whatever 'safe' says.
_ALWAYS_SAFE = frozenset(bytes(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                               b'abcdefghijklmnopqrstuvwxyz'
                               b'0123456789'
                               b'_.-'))
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache used by quote_from_bytes(): safe bytes -> Quoter lookup function.
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from byte values (ints in range(0, 256)) to strings.

    A byte maps to itself (as a one-character string) when it belongs to
    the safe set -- the always-safe bytes plus those given to the
    constructor -- and to its '%XX' percent-encoding otherwise.
    """
    # The defaultdict machinery provides the cache: once a key is stored,
    # later lookups never reach Python code.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(bytes(safe))

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Cache miss: compute the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{0:02X}'.format(b)
        self[b] = quoted
        return quoted
657
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Every part of a URL (path, query, ...) has its own set of reserved
    characters that need quoting.  RFC 2396, Uniform Resource Identifiers
    (URI): Generic Syntax, lists these reserved characters:

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some component of a URL, though not
    necessarily in all.

    The defaults target the path section of a URL, so '/' is left alone:
    although reserved, the slashes of a path are normally meant as
    delimiters.

    string and safe may each be a str or a bytes object.  encoding and
    errors may only be given when string is a str; they control how
    non-ASCII characters are handled, as accepted by str.encode(), and
    default to 'utf-8' and 'strict' (unsupported characters raise
    UnicodeEncodeError).  Passing either for a bytes string raises
    TypeError.
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)
701
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but spaces become '+', as needed for quoting HTML
    form values.  Plus signs already in the string are escaped unless
    listed in safe.  Unlike quote(), safe does not default to '/'.
    """
    # string may be str or bytes; when it holds no space at all, plain
    # quote() already gives the right answer.
    if isinstance(string, str):
        has_space = ' ' in string
    elif isinstance(string, bytes):
        has_space = b' ' in string
    else:
        has_space = True
    if not has_space:
        return quote(string, safe, encoding, errors)
    # Protect the spaces from quoting, then swap them for '+'.
    space = str(' ') if isinstance(safe, str) else bytes(b' ')
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
718
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but takes a bytes object instead of a str and does
    no string-to-bytes encoding.  The result is always an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return str('')
    ### For Python-Future:
    bs = bytes(bs)
    ###
    # Normalize 'safe' to bytes holding ASCII values only.
    if isinstance(safe, str):
        safe = str(safe).encode('ascii', 'ignore')
    else:
        ### For Python-Future:
        safe = bytes(safe)
        ###
        safe = bytes([c for c in safe if c < 128])
    # Shortcut: nothing is left after stripping safe bytes, so there is
    # nothing to quote.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return str('').join([quoter(char) for char in bs])
746
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    With a true doseq, a value that is itself a sequence yields one
    parameter per element.

    For a sequence of two-element tuples, the parameters are written in
    the order of the input.

    Keys and values may be str or bytes (anything else is converted with
    str()).  For non-bytes items the safe, encoding and errors parameters
    are passed on to quote_plus(); bytes items are quoted with safe only.

    Raises TypeError when query is neither a mapping nor a sequence of
    tuples.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # len() rejects non-sequences; a non-empty string is caught by
            # the tuple test on its first item.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of any type pass.  That matches the
            # original implementation, which accepted empty dicts.
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise_with_traceback(TypeError("not a valid non-string sequence "
                                           "or mapping object"), tb)

    def _quote_item(item):
        # bytes are quoted as they are; anything else goes through str().
        if isinstance(item, bytes):
            return quote_plus(item, safe)
        return quote_plus(str(item), safe, encoding, errors)

    fields = []
    for key, value in query:
        key = _quote_item(key)
        if not doseq:
            fields.append(key + '=' + _quote_item(value))
        elif isinstance(value, bytes):
            fields.append(key + '=' + quote_plus(value, safe))
        elif isinstance(value, str):
            fields.append(key + '=' +
                          quote_plus(value, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(value)
            except TypeError:
                # not a sequence
                fields.append(key + '=' +
                              quote_plus(str(value), safe, encoding, errors))
            else:
                # one parameter per element of the sequence
                for elt in value:
                    fields.append(key + '=' + _quote_item(elt))
    return str('&').join(fields)
824
825# Utilities to parse URLs (most of these return None for missing parts):
826# unwrap('<URL:type://host/path>') --> 'type://host/path'
827# splittype('type:opaquestring') --> 'type', 'opaquestring'
828# splithost('//host[:port]/path') --> 'host[:port]', '/path'
829# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
830# splitpasswd('user:passwd') -> 'user', 'passwd'
831# splitport('host:port') --> 'host', 'port'
832# splitquery('/path?query') --> '/path', 'query'
833# splittag('/path#tag') --> '/path', 'tag'
834# splitattr('/path;attr1=value1;attr2=value2;...') ->
835# '/path', ['attr1=value1', 'attr2=value2', ...]
836# splitvalue('attr=value') --> 'attr', 'value'
837# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
839
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str is checked to be pure ASCII and returned as a str; anything that
    is not a str is returned unchanged.  Raises UnicodeError for a str
    containing non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
852
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped; one pair of
    enclosing angle brackets and a leading 'URL:' marker are removed.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
860
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned lower-cased.  Returns (None, url) when url does
    not start with a non-empty run of characters other than '/' and ':'
    followed by ':'.
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept in the module-level slot.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme = found.group(1)
    return scheme.lower(), url[len(scheme) + 1:]
874
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    Returns (None, url) when url does not start with '//'.  A non-empty
    remainder that does not begin with '/' (e.g. '?query') gets a '/'
    prepended.
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and kept in the module-level slot.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if not found:
        return None, url
    host_port, path = found.group(1, 2)
    if path and path[:1] != '/':
        path = '/' + path
    return host_port, path
891
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Returns (None, host) when the
    pattern does not match (e.g. there is no '@').
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and kept in the module-level slot.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    found = _userprog.match(host)
    if found is None:
        return None, host
    return found.group(1, 2)
903
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Returns (user, None) when there
    is no ':'.
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and kept in the module-level slot.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
915
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When host ends in a bare
    ':' (an empty port), the colon is dropped and the port is None, so
    splitport('host:') --> 'host', None.  When there is no trailing
    ':<digits>' at all, host is returned unchanged with a None port.
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and kept in the module-level slot.
        import re
        # '*' rather than '+': an empty port must still match so that the
        # dangling ':' can be removed from the host.
        _portprog = re.compile('^(.*):([0-9]*)$')

    match = _portprog.match(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None
928
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Returns (host, defport) when host contains no ':'; defport defaults
    to -1.  Otherwise the text after the last ':' is converted with int();
    the port is None when that text is empty or not a valid number.
    """
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and kept in the module-level slot.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    found = _nportprog.match(host)
    if not found:
        return host, defport
    host, port_text = found.group(1, 2)
    nport = None
    if port_text:
        try:
            nport = int(port_text)
        except ValueError:
            nport = None
    return host, nport
950
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Returns (url, None) when the
    pattern does not match (e.g. there is no '?').
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use and kept in the module-level slot.
        import re
        # Raw string: '\?' in an ordinary literal is an invalid escape
        # sequence, which newer Python versions warn about.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
962
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Returns (url, None) when the
    pattern does not match (e.g. there is no '#').
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and kept in the module-level slot.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
974
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'."""
    path, semicolon, remainder = url.partition(';')
    attrs = remainder.split(';') if semicolon else []
    return path, attrs
980
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Returns (attr, None) when the
    pattern does not match (e.g. there is no '=').
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and kept in the module-level slot.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)