Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/hyperlink/_url.py: 54%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2u"""Hyperlink provides Pythonic URL parsing, construction, and rendering.
4Usage is straightforward::
6 >>> import hyperlink
7 >>> url = hyperlink.parse(u'http://github.com/mahmoud/hyperlink?utm_source=docs')
8 >>> url.host
9 u'github.com'
10 >>> secure_url = url.replace(scheme=u'https')
11 >>> secure_url.get('utm_source')[0]
12 u'docs'
14Hyperlink's API centers on the :class:`DecodedURL` type, which wraps
15the lower-level :class:`URL`, both of which can be returned by the
16:func:`parse()` convenience function.
18""" # noqa: E501
20import re
21import sys
22import string
23import socket
24from socket import AF_INET, AF_INET6
26try:
27 from socket import AddressFamily
28except ImportError:
29 AddressFamily = int # type: ignore[assignment,misc]
30from typing import (
31 Any,
32 Callable,
33 Dict,
34 Iterable,
35 Iterator,
36 List,
37 Mapping,
38 Optional,
39 Sequence,
40 Text,
41 Tuple,
42 Type,
43 TypeVar,
44 Union,
45 cast,
46 TYPE_CHECKING,
47 overload,
48)
49from unicodedata import normalize
50from ._socket import inet_pton
52try:
53 from collections.abc import Mapping as MappingABC
54except ImportError: # Python 2
55 from collections import Mapping as MappingABC
57from idna import encode as idna_encode, decode as idna_decode
60PY2 = sys.version_info[0] == 2
61try:
62 unichr
63except NameError: # Py3
64 unichr = chr # type: Callable[[int], Text]
65NoneType = type(None) # type: Type[None]
66QueryPairs = Tuple[Tuple[Text, Optional[Text]], ...] # internal representation
67QueryParameters = Union[
68 Mapping[Text, Optional[Text]],
69 QueryPairs,
70 Iterable[Tuple[Text, Optional[Text]]],
71]
72T = TypeVar("T")
73# Literal is not available in all pythons so we only bring it in for mypy.
74if TYPE_CHECKING:
75 from typing import Literal
78# from boltons.typeutils
def make_sentinel(name="_MISSING", var_name=""):
    # type: (str, str) -> object
    """Build a one-off class and hand back its only **instance**, for use
    as a "sentinel": a marker meaning "no value given" in places where
    ``None`` is itself a valid input.

    Args:
        name: Name of the Sentinel
        var_name: Name of the module-level variable the sentinel is bound
            to. When set, it becomes the repr and the pickle reduction of
            the sentinel, which enables pickle-ability.

    >>> make_sentinel(var_name='_MISSING')
    _MISSING

    Sentinels are always falsy. Typical uses are default values for
    optional function arguments and placeholders in queues and linked
    lists.

    .. note::

        Every call defines a brand new class, so two sentinels created
        with the same values are never equal and never share a type.

        >>> make_sentinel('TEST') == make_sentinel('TEST')
        False
        >>> type(make_sentinel('TEST')) == type(make_sentinel('TEST'))
        False
    """

    class Sentinel(object):
        def __init__(self):
            # type: () -> None
            self.name = name
            self.var_name = var_name

        def __repr__(self):
            # type: () -> str
            # Prefer the variable name; otherwise show "Sentinel('name')".
            return self.var_name or "%s(%r)" % (
                self.__class__.__name__,
                self.name,
            )

        if var_name:
            # Only defined when a variable name is known. Superclass type
            # hints don't allow a str return type, but the pickle docs do.
            def __reduce__(self):
                # type: () -> str
                return self.var_name

        def __nonzero__(self):
            # type: () -> bool
            return False

        # Python 3 spelling of the truthiness hook.
        __bool__ = __nonzero__

    return Sentinel()
138_unspecified = _UNSET = make_sentinel("_UNSET") # type: Any
141# RFC 3986 Section 2.3, Unreserved URI Characters
142# https://tools.ietf.org/html/rfc3986#section-2.3
143_UNRESERVED_CHARS = frozenset(
144 "~-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"
145)
148# URL parsing regex (based on RFC 3986 Appendix B, with modifications)
149_URL_RE = re.compile(
150 r"^((?P<scheme>[^:/?#]+):)?"
151 r"((?P<_netloc_sep>//)"
152 r"(?P<authority>[^/?#]*))?"
153 r"(?P<path>[^?#]*)"
154 r"(\?(?P<query>[^#]*))?"
155 r"(#(?P<fragment>.*))?$"
156)
157_SCHEME_RE = re.compile(r"^[a-zA-Z0-9+-.]*$")
158_AUTHORITY_RE = re.compile(
159 r"^(?:(?P<userinfo>[^@/?#]*)@)?"
160 r"(?P<host>"
161 r"(?:\[(?P<ipv6_host>[^[\]/?#]*)\])"
162 r"|(?P<plain_host>[^:/?#[\]]*)"
163 r"|(?P<bad_host>.*?))?"
164 r"(?::(?P<port>.*))?$"
165)
# Maps each two-character hex pair, as ASCII bytes and built from every
# combination of ``string.hexdigits``, to the single byte it denotes.
_HEX_CHAR_MAP = {
    (hi + lo).encode("ascii"): unichr(int(hi + lo, 16)).encode("charmap")
    for hi in string.hexdigits
    for lo in string.hexdigits
}
_ASCII_RE = re.compile("([\x00-\x7f]+)")
# RFC 3986 section 2.2, Reserved Characters
# https://tools.ietf.org/html/rfc3986#section-2.2
_GEN_DELIMS = frozenset(u":/?#[]@")
_SUB_DELIMS = frozenset(u"!$&'()*+,;=")
_ALL_DELIMS = _GEN_DELIMS.union(_SUB_DELIMS)

# Per URL component: *_SAFE is the set of characters the quote maps leave
# as-is, and *_DELIMS is every reserved delimiter that is not in that set.
_USERINFO_SAFE = _UNRESERVED_CHARS | _SUB_DELIMS | frozenset(u"%")
_USERINFO_DELIMS = _ALL_DELIMS - _USERINFO_SAFE
_PATH_SAFE = _USERINFO_SAFE | frozenset(u":@")
_PATH_DELIMS = _ALL_DELIMS - _PATH_SAFE
# Same as a path segment, minus ":".
_SCHEMELESS_PATH_SAFE = _PATH_SAFE - frozenset(":")
_SCHEMELESS_PATH_DELIMS = _ALL_DELIMS - _SCHEMELESS_PATH_SAFE
_FRAGMENT_SAFE = _UNRESERVED_CHARS | _PATH_SAFE | frozenset(u"/?")
_FRAGMENT_DELIMS = _ALL_DELIMS - _FRAGMENT_SAFE
# "&" is excluded from query values; keys additionally exclude "=".
_QUERY_VALUE_SAFE = _UNRESERVED_CHARS | (_FRAGMENT_SAFE - frozenset(u"&"))
_QUERY_VALUE_DELIMS = _ALL_DELIMS - _QUERY_VALUE_SAFE
_QUERY_KEY_SAFE = _UNRESERVED_CHARS | (_QUERY_VALUE_SAFE - frozenset(u"="))
_QUERY_KEY_DELIMS = _ALL_DELIMS - _QUERY_KEY_SAFE
def _make_decode_map(delims, allow_percent=False):
    # type: (Iterable[Text], bool) -> Mapping[bytes, bytes]
    """Return a copy of ``_HEX_CHAR_MAP`` with the hex pairs of *delims*
    removed, so that percent-encoded delimiters are left undecoded.

    Args:
        delims: Characters whose percent-encodings must stay encoded.
        allow_percent: Unless true, ``%`` is added to *delims*.

    Raises:
        KeyError: if a delimiter's hex pair is not present in the map.
    """
    decode_map = dict(_HEX_CHAR_MAP)
    if not allow_percent:
        delims = set(delims) | set([u"%"])
    for delim in delims:
        upper_pair = "{0:02X}".format(ord(delim)).encode("ascii")
        lower_pair = upper_pair.lower()
        del decode_map[upper_pair]
        # Pairs made only of digits have a single spelling.
        if lower_pair != upper_pair:
            del decode_map[lower_pair]
    return decode_map
211def _make_quote_map(safe_chars):
212 # type: (Iterable[Text]) -> Mapping[Union[int, Text], Text]
213 ret = {} # type: Dict[Union[int, Text], Text]
214 # v is included in the dict for py3 mostly, because bytestrings
215 # are iterables of ints, of course!
216 for i, v in zip(range(256), range(256)):
217 c = chr(v)
218 if c in safe_chars:
219 ret[c] = ret[v] = c
220 else:
221 ret[c] = ret[v] = "%{0:02X}".format(i)
222 return ret
# Quote maps, one per URL component: see _make_quote_map.
_USERINFO_PART_QUOTE_MAP = _make_quote_map(_USERINFO_SAFE)
_PATH_PART_QUOTE_MAP = _make_quote_map(_PATH_SAFE)
_SCHEMELESS_PATH_PART_QUOTE_MAP = _make_quote_map(_SCHEMELESS_PATH_SAFE)
_QUERY_KEY_QUOTE_MAP = _make_quote_map(_QUERY_KEY_SAFE)
_QUERY_VALUE_QUOTE_MAP = _make_quote_map(_QUERY_VALUE_SAFE)
_FRAGMENT_QUOTE_MAP = _make_quote_map(_FRAGMENT_SAFE)
_UNRESERVED_QUOTE_MAP = _make_quote_map(_UNRESERVED_CHARS)

# Decode maps, one per URL component: see _make_decode_map.
_USERINFO_DECODE_MAP = _make_decode_map(_USERINFO_DELIMS)
_PATH_DECODE_MAP = _make_decode_map(_PATH_DELIMS)
_QUERY_KEY_DECODE_MAP = _make_decode_map(_QUERY_KEY_DELIMS)
# An encoded "+" in a query value is additionally kept encoded.
_QUERY_VALUE_DECODE_MAP = _make_decode_map(
    _QUERY_VALUE_DELIMS | frozenset("+")
)
_FRAGMENT_DECODE_MAP = _make_decode_map(_FRAGMENT_DELIMS)
# Only hex pairs that stand for an unreserved character.
_UNRESERVED_DECODE_MAP = {
    hexpair: byte
    for hexpair, byte in _HEX_CHAR_MAP.items()
    if byte.decode("ascii", "replace") in _UNRESERVED_CHARS
}

_ROOT_PATHS = frozenset(((), (u"",)))
def _encode_reserved(text, maximal=True):
    # type: (Text, bool) -> Text
    """A very comprehensive percent encoding for encoding all
    delimiters. Used for arguments to DecodedURL, where a % means a
    percent sign, and not the character used by URLs for escaping
    bytes.

    With *maximal* true, the text is NFC-normalized, UTF-8 encoded, and
    every byte outside the unreserved set is percent-encoded.

    NOTE(review): with *maximal* false the text comes back unchanged,
    because only unreserved characters are looked up and the quote map
    returns those as-is. Confirm whether that is intended.
    """
    quote = _UNRESERVED_QUOTE_MAP
    if not maximal:
        return u"".join(
            [quote[ch] if ch in _UNRESERVED_CHARS else ch for ch in text]
        )
    utf8_bytes = normalize("NFC", text).encode("utf8")
    return u"".join([quote[byte] for byte in utf8_bytes])
def _encode_path_part(text, maximal=True):
    # type: (Text, bool) -> Text
    """Percent-encode a single segment of a URL path.

    With *maximal* true, the text is NFC-normalized, UTF-8 encoded, and
    every byte not safe in a path segment is encoded. Otherwise only the
    characters in ``_PATH_DELIMS`` are encoded and the rest is kept.
    """
    quote = _PATH_PART_QUOTE_MAP
    if not maximal:
        return u"".join(
            [quote[ch] if ch in _PATH_DELIMS else ch for ch in text]
        )
    utf8_bytes = normalize("NFC", text).encode("utf8")
    return u"".join([quote[byte] for byte in utf8_bytes])
def _encode_schemeless_path_part(text, maximal=True):
    # type: (Text, bool) -> Text
    """Percent-encode the first segment of a URL path for a URL without a
    scheme specified.

    Works like the regular path-segment encoder but uses the schemeless
    tables, which additionally treat ":" as unsafe.
    """
    quote = _SCHEMELESS_PATH_PART_QUOTE_MAP
    if not maximal:
        return u"".join(
            [
                quote[ch] if ch in _SCHEMELESS_PATH_DELIMS else ch
                for ch in text
            ]
        )
    utf8_bytes = normalize("NFC", text).encode("utf8")
    return u"".join([quote[byte] for byte in utf8_bytes])
295def _encode_path_parts(
296 text_parts, # type: Sequence[Text]
297 rooted=False, # type: bool
298 has_scheme=True, # type: bool
299 has_authority=True, # type: bool
300 maximal=True, # type: bool
301):
302 # type: (...) -> Sequence[Text]
303 """
304 Percent-encode a tuple of path parts into a complete path.
306 Setting *maximal* to False percent-encodes only the reserved
307 characters that are syntactically necessary for serialization,
308 preserving any IRI-style textual data.
310 Leaving *maximal* set to its default True percent-encodes
311 everything required to convert a portion of an IRI to a portion of
312 a URI.
314 RFC 3986 3.3:
316 If a URI contains an authority component, then the path component
317 must either be empty or begin with a slash ("/") character. If a URI
318 does not contain an authority component, then the path cannot begin
319 with two slash characters ("//"). In addition, a URI reference
320 (Section 4.1) may be a relative-path reference, in which case the
321 first path segment cannot contain a colon (":") character.
322 """
323 if not text_parts:
324 return ()
325 if rooted:
326 text_parts = (u"",) + tuple(text_parts)
327 # elif has_authority and text_parts:
328 # raise Exception('see rfc above') # TODO: too late to fail like this?
329 encoded_parts = [] # type: List[Text]
330 if has_scheme:
331 encoded_parts = [
332 _encode_path_part(part, maximal=maximal) if part else part
333 for part in text_parts
334 ]
335 else:
336 encoded_parts = [_encode_schemeless_path_part(text_parts[0])]
337 encoded_parts.extend(
338 [
339 _encode_path_part(part, maximal=maximal) if part else part
340 for part in text_parts[1:]
341 ]
342 )
343 return tuple(encoded_parts)
def _encode_query_key(text, maximal=True):
    # type: (Text, bool) -> Text
    """
    Percent-encode a single query string key.

    With *maximal* true, the text is NFC-normalized, UTF-8 encoded, and
    every byte not safe in a query key is encoded. Otherwise only the
    characters in ``_QUERY_KEY_DELIMS`` are encoded.
    """
    quote = _QUERY_KEY_QUOTE_MAP
    if not maximal:
        return u"".join(
            [quote[ch] if ch in _QUERY_KEY_DELIMS else ch for ch in text]
        )
    utf8_bytes = normalize("NFC", text).encode("utf8")
    return u"".join([quote[byte] for byte in utf8_bytes])
def _encode_query_value(text, maximal=True):
    # type: (Text, bool) -> Text
    """
    Percent-encode a single query string value.

    With *maximal* true, the text is NFC-normalized, UTF-8 encoded, and
    every byte not safe in a query value is encoded. Otherwise only the
    characters in ``_QUERY_VALUE_DELIMS`` are encoded.
    """
    quote = _QUERY_VALUE_QUOTE_MAP
    if not maximal:
        return u"".join(
            [quote[ch] if ch in _QUERY_VALUE_DELIMS else ch for ch in text]
        )
    utf8_bytes = normalize("NFC", text).encode("utf8")
    return u"".join([quote[byte] for byte in utf8_bytes])
def _encode_fragment_part(text, maximal=True):
    # type: (Text, bool) -> Text
    """Quote the fragment part of the URL. Fragments don't have
    subdelimiters, so the whole URL fragment can be passed.

    With *maximal* true, the text is NFC-normalized, UTF-8 encoded, and
    every byte not safe in a fragment is encoded. Otherwise only the
    characters in ``_FRAGMENT_DELIMS`` are encoded.
    """
    quote = _FRAGMENT_QUOTE_MAP
    if not maximal:
        return u"".join(
            [quote[ch] if ch in _FRAGMENT_DELIMS else ch for ch in text]
        )
    utf8_bytes = normalize("NFC", text).encode("utf8")
    return u"".join([quote[byte] for byte in utf8_bytes])
def _encode_userinfo_part(text, maximal=True):
    # type: (Text, bool) -> Text
    """Quote special characters in either the username or password
    section of the URL.

    With *maximal* true, the text is NFC-normalized, UTF-8 encoded, and
    every byte not safe in userinfo is encoded. Otherwise only the
    characters in ``_USERINFO_DELIMS`` are encoded.
    """
    quote = _USERINFO_PART_QUOTE_MAP
    if not maximal:
        return u"".join(
            [quote[ch] if ch in _USERINFO_DELIMS else ch for ch in text]
        )
    utf8_bytes = normalize("NFC", text).encode("utf8")
    return u"".join([quote[byte] for byte in utf8_bytes])
# This port list painstakingly curated by hand searching through
# https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
# and
# https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml
#
# Maps each netloc-using scheme to its default port; None means the
# scheme is registered without a default port.
SCHEME_PORT_MAP = dict(
    acap=674,
    afp=548,
    dict=2628,
    dns=53,
    file=None,
    ftp=21,
    git=9418,
    gopher=70,
    http=80,
    https=443,
    imap=143,
    ipp=631,
    ipps=631,
    irc=194,
    ircs=6697,
    ldap=389,
    ldaps=636,
    mms=1755,
    msrp=2855,
    msrps=None,
    mtqp=1038,
    nfs=111,
    nntp=119,
    nntps=563,
    pop=110,
    prospero=1525,
    redis=6379,
    rsync=873,
    rtsp=554,
    rtsps=322,
    rtspu=5005,
    sftp=22,
    smb=445,
    snmp=161,
    ssh=22,
    steam=None,
    svn=3690,
    telnet=23,
    ventrilo=3784,
    vnc=5900,
    wais=210,
    ws=80,
    wss=443,
    xmpp=None,
)
# This list of schemes that don't use authorities is also from the link above.
NO_NETLOC_SCHEMES = {
    "urn",
    "about",
    "bitcoin",
    "blob",
    "data",
    "geo",
    "magnet",
    "mailto",
    "news",
    "pkcs11",
    "sip",
    "sips",
    "tel",
}
# As of Mar 11, 2017, there were 44 netloc schemes, and 13 non-netloc

# Schemes registered with query_plus_is_space=False; see register_scheme.
NO_QUERY_PLUS_SCHEMES = set()
def register_scheme(
    text, uses_netloc=True, default_port=None, query_plus_is_space=True
):
    # type: (Text, bool, Optional[int], bool) -> None
    """Registers new scheme information, resulting in correct port and
    slash behavior from the URL object. There are dozens of standard
    schemes preregistered, so this function is mostly meant for
    proprietary internal customizations or stopgaps on missing
    standards information. If a scheme seems to be missing, please
    `file an issue`_!

    Registering a scheme again replaces its previous registration: the
    scheme is moved between the netloc and non-netloc registries as
    needed, and its "+"-handling flag is reset to the new value.

    Args:
        text: A string representation of the scheme.
            (the 'http' in 'http://hatnote.com')
        uses_netloc: Does the scheme support specifying a
            network host? For instance, "http" does, "mailto" does
            not. Defaults to True.
        default_port: The default port, if any, for
            netloc-using schemes.
        query_plus_is_space: If true, a "+" in the query string should be
            decoded as a space by DecodedURL.

    Raises:
        ValueError: if *default_port* cannot be converted to an integer,
            if a default port is given for a non-netloc scheme, or if
            *uses_netloc* is not a bool. Nothing is registered then.

    .. _file an issue: https://github.com/mahmoud/hyperlink/issues
    """
    text = text.lower()
    if default_port is not None:
        try:
            default_port = int(default_port)
        except (ValueError, TypeError):
            raise ValueError(
                "default_port expected integer or None, not %r"
                % (default_port,)
            )

    if uses_netloc is True:
        SCHEME_PORT_MAP[text] = default_port
        # Drop any earlier non-netloc registration of the same scheme.
        NO_NETLOC_SCHEMES.discard(text)
    elif uses_netloc is False:
        if default_port is not None:
            raise ValueError(
                "unexpected default port while specifying"
                " non-netloc scheme: %r" % default_port
            )
        NO_NETLOC_SCHEMES.add(text)
        # scheme_uses_netloc() consults SCHEME_PORT_MAP first, so a stale
        # entry there would make this registration ineffective.
        SCHEME_PORT_MAP.pop(text, None)
    else:
        raise ValueError("uses_netloc expected bool, not: %r" % uses_netloc)

    if query_plus_is_space:
        NO_QUERY_PLUS_SCHEMES.discard(text)
    else:
        NO_QUERY_PLUS_SCHEMES.add(text)

    return
def scheme_uses_netloc(scheme, default=None):
    # type: (Text, Optional[bool]) -> Optional[bool]
    """Whether or not a URL uses :code:`:` or :code:`://` to separate the
    scheme from the rest of the URL depends on the scheme's own
    standard definition. There is no way to infer this behavior
    from other parts of the URL. A scheme either supports network
    locations or it does not.

    The URL type's approach to this is to check for explicitly
    registered schemes, with common schemes like HTTP
    preregistered. This is the same approach taken by
    :mod:`urlparse`.

    URL adds two additional heuristics if the scheme as a whole is
    not registered. First, it attempts to check the subpart of the
    scheme after the last ``+`` character. This adds intuitive
    behavior for schemes like ``git+ssh``. Second, if a URL with
    an unrecognized scheme is loaded, it will maintain the
    separator it sees.

    Returns:
        ``False`` for an empty scheme, ``True``/``False`` for a
        registered one (matched case-insensitively), and *default* when
        nothing is known about the scheme.
    """
    if not scheme:
        return False
    lowered = scheme.lower()
    if lowered in SCHEME_PORT_MAP:
        return True
    if lowered in NO_NETLOC_SCHEMES:
        return False
    # Fall back to the piece after the last "+", e.g. "ssh" in "git+ssh".
    subscheme = lowered.rpartition("+")[2]
    if subscheme in SCHEME_PORT_MAP:
        return True
    return default
class URLParseError(ValueError):
    """Exception inheriting from :exc:`ValueError`, raised when failing to
    parse a URL. Mostly raised on invalid ports and IPv6 addresses.

    Adds no behavior of its own; it exists so callers can catch URL
    parsing failures specifically.
    """
def _optional(argument, default):
    # type: (Any, Any) -> Any
    """Return *default* when *argument* is the ``_UNSET`` sentinel,
    otherwise return *argument* unchanged."""
    return default if argument is _UNSET else argument
578def _typecheck(name, value, *types):
579 # type: (Text, T, Type[Any]) -> T
580 """
581 Check that the given *value* is one of the given *types*, or raise an
582 exception describing the problem using *name*.
583 """
584 if not types:
585 raise ValueError("expected one or more types, maybe use _textcheck?")
586 if not isinstance(value, types):
587 raise TypeError(
588 "expected %s for %s, got %r"
589 % (" or ".join([t.__name__ for t in types]), name, value)
590 )
591 return value
594def _textcheck(name, value, delims=frozenset(), nullable=False):
595 # type: (Text, T, Iterable[Text], bool) -> T
596 if not isinstance(value, Text):
597 if nullable and value is None:
598 # used by query string values
599 return value # type: ignore[unreachable]
600 else:
601 str_name = "unicode" if PY2 else "str"
602 exp = str_name + " or NoneType" if nullable else str_name
603 raise TypeError("expected %s for %s, got %r" % (exp, name, value))
604 if delims and set(value) & set(delims): # TODO: test caching into regexes
605 raise ValueError(
606 "one or more reserved delimiters %s present in %s: %r"
607 % ("".join(delims), name, value)
608 )
609 return value # type: ignore[return-value] # T vs. Text
def iter_pairs(iterable):
    # type: (Iterable[Any]) -> Iterator[Any]
    """
    Iterate over the (key, value) pairs in ``iterable``.

    Mappings are iterated through their ``items()``; anything else is
    assumed to already yield (key, value) pairs and is iterated directly.
    This behaviour is similar to what Python's ``dict()`` constructor
    does.
    """
    pairs = iterable.items() if isinstance(iterable, MappingABC) else iterable
    return iter(pairs)
def _decode_unreserved(text, normalize_case=False, encode_stray_percents=False):
    # type: (Text, bool, bool) -> Text
    """Percent-decode *text* using ``_UNRESERVED_DECODE_MAP``.

    Both flags are passed through to :func:`_percent_decode`.
    """
    return _percent_decode(
        text,
        normalize_case,
        _decode_map=_UNRESERVED_DECODE_MAP,
        encode_stray_percents=encode_stray_percents,
    )
def _decode_userinfo_part(
    text, normalize_case=False, encode_stray_percents=False
):
    # type: (Text, bool, bool) -> Text
    """Percent-decode *text* using ``_USERINFO_DECODE_MAP``.

    Both flags are passed through to :func:`_percent_decode`.
    """
    return _percent_decode(
        text,
        normalize_case,
        _decode_map=_USERINFO_DECODE_MAP,
        encode_stray_percents=encode_stray_percents,
    )
def _decode_path_part(text, normalize_case=False, encode_stray_percents=False):
    # type: (Text, bool, bool) -> Text
    """Percent-decode *text* using ``_PATH_DECODE_MAP``.

    Both flags are passed through to :func:`_percent_decode`.

    >>> _decode_path_part(u'%61%77%2f%7a')
    u'aw%2fz'
    >>> _decode_path_part(u'%61%77%2f%7a', normalize_case=True)
    u'aw%2Fz'
    """
    return _percent_decode(
        text,
        normalize_case,
        _decode_map=_PATH_DECODE_MAP,
        encode_stray_percents=encode_stray_percents,
    )
def _decode_query_key(text, normalize_case=False, encode_stray_percents=False):
    # type: (Text, bool, bool) -> Text
    """Percent-decode *text* using ``_QUERY_KEY_DECODE_MAP``.

    Both flags are passed through to :func:`_percent_decode`.
    """
    return _percent_decode(
        text,
        normalize_case,
        _decode_map=_QUERY_KEY_DECODE_MAP,
        encode_stray_percents=encode_stray_percents,
    )
def _decode_query_value(
    text, normalize_case=False, encode_stray_percents=False
):
    # type: (Text, bool, bool) -> Text
    """Percent-decode *text* using ``_QUERY_VALUE_DECODE_MAP``.

    Both flags are passed through to :func:`_percent_decode`.
    """
    return _percent_decode(
        text,
        normalize_case,
        _decode_map=_QUERY_VALUE_DECODE_MAP,
        encode_stray_percents=encode_stray_percents,
    )
def _decode_fragment_part(
    text, normalize_case=False, encode_stray_percents=False
):
    # type: (Text, bool, bool) -> Text
    """Percent-decode *text* using ``_FRAGMENT_DECODE_MAP``.

    Both flags are passed through to :func:`_percent_decode`.
    """
    return _percent_decode(
        text,
        normalize_case,
        _decode_map=_FRAGMENT_DECODE_MAP,
        encode_stray_percents=encode_stray_percents,
    )
def _percent_decode(
    text,  # type: Text
    normalize_case=False,  # type: bool
    subencoding="utf-8",  # type: Text
    raise_subencoding_exc=False,  # type: bool
    encode_stray_percents=False,  # type: bool
    _decode_map=_HEX_CHAR_MAP,  # type: Mapping[bytes, bytes]
):
    # type: (...) -> Text
    """Convert percent-encoded text characters to their normal,
    human-readable equivalents.

    All characters in the input text must be encodable by
    *subencoding*. All special characters underlying the values in the
    percent-encoding must be decodable as *subencoding*. If a
    non-*subencoding*-valid string is passed, the original text is
    returned with no changes applied.

    Only called by field-tailored variants, e.g.,
    :func:`_decode_path_part`, as every percent-encodable part of the
    URL has characters which should not be percent decoded.

    >>> _percent_decode(u'abc%20def')
    u'abc def'

    Args:
        text: Text with percent-encoding present.
        normalize_case: Whether undecoded percent segments, such as encoded
            delimiters, should be uppercased, per RFC 3986 Section 2.1.
            See :func:`_decode_path_part` for an example.
        subencoding: The name of the encoding underlying the percent-encoding.
        raise_subencoding_exc: Whether an error in decoding the bytes
            underlying the percent-decoding should be raised.
        encode_stray_percents: Whether a ``%`` that is not followed by a
            valid hex pair should be emitted as ``%25`` instead of being
            left as a bare ``%``.
        _decode_map: Maps the hex pairs that may be decoded to their byte
            values; pairs missing from it are left percent-encoded.

    Returns:
        Text: The percent-decoded version of *text*, decoded by *subencoding*.
    """
    try:
        quoted_bytes = text.encode(subencoding)
    except UnicodeEncodeError:
        return text

    chunks = quoted_bytes.split(b"%")
    if len(chunks) < 2:
        # No percent sign anywhere: nothing to decode.
        return text

    pieces = [chunks[0]]
    for chunk in chunks[1:]:
        hexpair, rest = chunk[:2], chunk[2:]
        decoded = _decode_map.get(hexpair)
        if decoded is not None:
            pieces.append(decoded)
            pieces.append(rest)
            continue

        # Not decodable in this context: either a valid hex pair that must
        # stay encoded (e.g. a delimiter) or a stray percent sign.
        pair_is_hex = hexpair in _HEX_CHAR_MAP
        if encode_stray_percents and not pair_is_hex:
            # A real percent sign, which is reserved and should stay in
            # an encoded state.
            pieces.append(b"%25")
        else:
            pieces.append(b"%")

        if normalize_case and pair_is_hex:
            pieces.append(hexpair.upper())
            pieces.append(rest)
        else:
            pieces.append(chunk)

    unquoted_bytes = b"".join(pieces)

    try:
        return unquoted_bytes.decode(subencoding)
    except UnicodeDecodeError:
        if raise_subencoding_exc:
            raise
        return text
778def _decode_host(host):
779 # type: (Text) -> Text
780 """Decode a host from ASCII-encodable text to IDNA-decoded text. If
781 the host text is not ASCII, it is returned unchanged, as it is
782 presumed that it is already IDNA-decoded.
784 Some technical details: _decode_host is built on top of the "idna"
785 package, which has some quirks:
787 Capital letters are not valid IDNA2008. The idna package will
788 raise an exception like this on capital letters:
790 > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed
792 However, if a segment of a host (i.e., something in
793 url.host.split('.')) is already ASCII, idna doesn't perform its
794 usual checks. In fact, for capital letters it automatically
795 lowercases them.
797 This check and some other functionality can be bypassed by passing
798 uts46=True to idna.encode/decode. This allows a more permissive and
799 convenient interface. So far it seems like the balanced approach.
801 Example output (from idna==2.6):
803 >> idna.encode(u'mahmöud.io')
804 'xn--mahmud-zxa.io'
805 >> idna.encode(u'Mahmöud.io')
806 Traceback (most recent call last):
807 File "<stdin>", line 1, in <module>
808 File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
809 result.append(alabel(label))
810 File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
811 check_label(label)
812 File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
813 raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
814 idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
815 >> idna.encode(u'Mahmoud.io')
816 'Mahmoud.io'
818 # Similar behavior for decodes below
819 >> idna.decode(u'Mahmoud.io')
820 u'mahmoud.io
821 >> idna.decode(u'Méhmoud.io', uts46=True)
822 u'm\xe9hmoud.io'
823 """ # noqa: E501
824 if not host:
825 return u""
826 try:
827 host_bytes = host.encode("ascii")
828 except UnicodeEncodeError:
829 host_text = host
830 else:
831 try:
832 host_text = idna_decode(host_bytes, uts46=True)
833 except ValueError:
834 # only reached on "narrow" (UCS-2) Python builds <3.4, see #7
835 # NOTE: not going to raise here, because there's no
836 # ambiguity in the IDNA, and the host is still
837 # technically usable
838 host_text = host
839 return host_text
842def _resolve_dot_segments(path):
843 # type: (Sequence[Text]) -> Sequence[Text]
844 """Normalize the URL path by resolving segments of '.' and '..'. For
845 more details, see `RFC 3986 section 5.2.4, Remove Dot Segments`_.
847 Args:
848 path: sequence of path segments in text form
850 Returns:
851 A new sequence of path segments with the '.' and '..' elements removed
852 and resolved.
854 .. _RFC 3986 section 5.2.4, Remove Dot Segments: https://tools.ietf.org/html/rfc3986#section-5.2.4
855 """ # noqa: E501
856 segs = [] # type: List[Text]
858 for seg in path:
859 if seg == u".":
860 pass
861 elif seg == u"..":
862 if segs:
863 segs.pop()
864 else:
865 segs.append(seg)
867 if list(path[-1:]) in ([u"."], [u".."]):
868 segs.append(u"")
870 return segs
def parse_host(host):
    # type: (Text) -> Tuple[Optional[AddressFamily], Text]
    """Parse the host into a tuple of ``(family, host)``, where family
    is the appropriate :mod:`socket` module constant when the host is
    an IP address. Family is ``None`` when the host is not an IP.

    A host containing ``:`` is treated as an IPv6 literal and must parse
    as one. Any other host is tried as IPv4 and otherwise accepted as a
    non-IP host. A host that cannot even be encoded for the address
    parser is reported with a family of ``None``.

    Will raise :class:`URLParseError` on invalid IPv6 constants.

    Returns:
        family (socket constant or None), host (string)

    >>> import socket
    >>> parse_host('googlewebsite.com') == (None, 'googlewebsite.com')
    True
    >>> parse_host('::1') == (socket.AF_INET6, '::1')
    True
    >>> parse_host('192.168.1.1') == (socket.AF_INET, '192.168.1.1')
    True
    """
    if not host:
        return None, u""

    # Default to "not an IP" so every path below leaves *family* bound.
    family = None  # type: Optional[AddressFamily]
    if u":" in host:
        try:
            inet_pton(AF_INET6, host)
        except socket.error as se:
            raise URLParseError("invalid IPv6 host: %r (%r)" % (host, se))
        except UnicodeEncodeError:
            # Not encodable, so it cannot be an IP literal; keep family as
            # None, as the IPv4 branch does.
            pass
        else:
            family = AF_INET6
    else:
        try:
            inet_pton(AF_INET, host)
        except (socket.error, UnicodeEncodeError):
            pass  # not an IP
        else:
            family = AF_INET

    return family, host
915class URL(object):
916 r"""From blogs to billboards, URLs are so common, that it's easy to
917 overlook their complexity and power. With hyperlink's
918 :class:`URL` type, working with URLs doesn't have to be hard.
920 URLs are made of many parts. Most of these parts are officially
921 named in `RFC 3986`_ and this diagram may prove handy in identifying
922 them::
924 foo://user:pass@example.com:8042/over/there?name=ferret#nose
925 \_/ \_______/ \_________/ \__/\_________/ \_________/ \__/
926 | | | | | | |
927 scheme userinfo host port path query fragment
929 While :meth:`~URL.from_text` is used for parsing whole URLs, the
930 :class:`URL` constructor builds a URL from the individual
931 components, like so::
933 >>> from hyperlink import URL
934 >>> url = URL(scheme=u'https', host=u'example.com', path=[u'hello', u'world'])
935 >>> print(url.to_text())
936 https://example.com/hello/world
938 The constructor runs basic type checks. All strings are expected
939 to be text (:class:`str` in Python 3, :class:`unicode` in Python 2). All
940 arguments are optional, defaulting to appropriately empty values. A full
941 list of constructor arguments is below.
943 Args:
944 scheme: The text name of the scheme.
945 host: The host portion of the network location
946 port: The port part of the network location. If ``None`` or no port is
947 passed, the port will default to the default port of the scheme, if
948 it is known. See the ``SCHEME_PORT_MAP`` and
949 :func:`register_default_port` for more info.
950 path: A tuple of strings representing the slash-separated parts of the
951 path, each percent-encoded.
952 query: The query parameters, as a dictionary or as an sequence of
953 percent-encoded key-value pairs.
954 fragment: The fragment part of the URL.
955 rooted: A rooted URL is one which indicates an absolute path.
956 This is True on any URL that includes a host, or any relative URL
957 that starts with a slash.
958 userinfo: The username or colon-separated username:password pair.
959 uses_netloc: Indicates whether ``://`` (the "netloc separator") will
960 appear to separate the scheme from the *path* in cases where no
961 host is present.
962 Setting this to ``True`` is a non-spec-compliant affordance for the
963 common practice of having URIs that are *not* URLs (cannot have a
964 'host' part) but nevertheless use the common ``://`` idiom that
965 most people associate with URLs; e.g. ``message:`` URIs like
966 ``message://message-id`` being equivalent to ``message:message-id``.
967 This may be inferred based on the scheme depending on whether
968 :func:`register_scheme` has been used to register the scheme and
969 should not be passed directly unless you know the scheme works like
970 this and you know it has not been registered.
972 All of these parts are also exposed as read-only attributes of :class:`URL`
973 instances, along with several useful methods.
975 .. _RFC 3986: https://tools.ietf.org/html/rfc3986
976 .. _RFC 3987: https://tools.ietf.org/html/rfc3987
977 """ # noqa: E501
def __init__(
    self,
    scheme=None,  # type: Optional[Text]
    host=None,  # type: Optional[Text]
    path=(),  # type: Iterable[Text]
    query=(),  # type: QueryParameters
    fragment=u"",  # type: Text
    port=None,  # type: Optional[int]
    rooted=None,  # type: Optional[bool]
    userinfo=u"",  # type: Text
    uses_netloc=None,  # type: Optional[bool]
):
    # type: (...) -> None
    # Phase 1: inference that depends on which arguments were *omitted*
    # (i.e. are still None), so it must run before defaults are filled in.
    if scheme is None and host is not None:
        scheme = u"http"  # TODO: why
    if scheme is not None and port is None:
        port = SCHEME_PORT_MAP.get(scheme)
    if host and query and not path:
        # per RFC 3986 6.2.3, "a URI that uses the generic syntax
        # for authority with an empty path should be normalized to
        # a path of '/'."
        path = (u"",)

    # Phase 2: replace the "not passed" markers with real defaults.
    scheme = u"" if scheme is None else scheme
    host = u"" if host is None else host
    if rooted is None:
        rooted = bool(host)

    # Phase 3: validate each part and store it.
    self._scheme = _textcheck("scheme", scheme)
    if self._scheme and not _SCHEME_RE.match(self._scheme):
        raise ValueError(
            'invalid scheme: %r. Only alphanumeric, "+",'
            ' "-", and "." allowed. Did you meant to call'
            " %s.from_text()?" % (self._scheme, self.__class__.__name__)
        )

    # parse_host returns a pair; only the second element is kept here.
    _, self._host = parse_host(_textcheck("host", host, "/?#@"))
    if isinstance(path, Text):
        # A bare string is iterable too, but would be split per character.
        raise TypeError(
            "expected iterable of text for path, not: %r" % (path,)
        )
    self._path = tuple(
        _textcheck("path segment", segment, "/?#") for segment in path
    )
    self._query = tuple(
        (
            _textcheck("query parameter name", name, "&=#"),
            _textcheck("query parameter value", value, "&#", nullable=True),
        )
        for name, value in iter_pairs(query)
    )
    self._fragment = _textcheck("fragment", fragment)
    self._port = _typecheck("port", port, int, NoneType)
    self._rooted = _typecheck("rooted", rooted, bool)
    self._userinfo = _textcheck("userinfo", userinfo, "/?#@")

    if uses_netloc is None:
        uses_netloc = scheme_uses_netloc(self._scheme, uses_netloc)
    self._uses_netloc = _typecheck(
        "uses_netloc", uses_netloc, bool, NoneType
    )

    # Phase 4: make rooted / uses_netloc consistent with the other parts.
    has_authority = self._host or (
        self._port and self._port != SCHEME_PORT_MAP.get(scheme)
    )
    if has_authority:
        # fixup for rooted consistency; if there's any 'authority'
        # represented in the textual URL, then the path must be rooted, and
        # we're definitely using a netloc (there must be a ://).
        self._rooted = True
        self._uses_netloc = True
    if not self._rooted and self.path[:1] == (u"",):
        # A leading empty segment is just another spelling of "rooted".
        self._rooted = True
        self._path = self._path[1:]
    if not has_authority and self._path and not self._rooted:
        # If, after fixing up the path, there *is* a path and it *isn't*
        # rooted, then we are definitely not using a netloc; if we did, it
        # would make the path (erroneously) look like a hostname.
        self._uses_netloc = False
def get_decoded_url(self, lazy=False):
    # type: (bool) -> DecodedURL
    """Return the :class:`DecodedURL` wrapping this URL.

    The wrapper is built on the first call and stored on the instance;
    later calls return that stored object, so *lazy* only has an effect
    on the call that creates it.
    """
    try:
        cached = self._decoded_url
    except AttributeError:
        cached = DecodedURL(self, lazy=lazy)  # type: DecodedURL
        self._decoded_url = cached
    return cached
@property
def scheme(self):
    # type: () -> Text
    """The URL's scheme, as text: the part before the first colon of an
    absolute URL, which determines how the remainder is interpreted.
    Typical values are "http", "https", "ssh", "file" and "mailto".
    See :func:`~hyperlink.register_scheme()` for more info.
    """
    value = self._scheme
    return value
@property
def host(self):
    # type: () -> Text
    """The URL's host, as text: the second standard part of an absolute
    URL. A valid host, when present, is a domain name or an IPv4/IPv6
    address. In the text form it ends at the first slash, or at the
    second colon when a :attr:`~hyperlink.URL.port` is given.
    """
    value = self._host
    return value
@property
def port(self):
    # type: () -> Optional[int]
    """The integer port commonly used to connect to the :attr:`host`;
    it almost never appears without one.

    If the original URL carried no port, the scheme's default port is
    used. If neither a port nor a known default is available, the value
    is None.

        >>> URL.from_text(u'http://example.com/pa/th').port
        80
        >>> URL.from_text(u'foo://example.com/pa/th').port
        >>> URL.from_text(u'foo://example.com:8042/pa/th').port
        8042

    .. note::

       Per the standard, a port equal to the scheme's default port is
       omitted from the text URL.
    """
    value = self._port
    return value
@property
def path(self):
    # type: () -> Sequence[Text]
    """The hierarchical path as a tuple of text segments, obtained by
    splitting on slashes. In the text form the path begins at the first
    slash after the host and ends at a "?", which introduces the
    :attr:`~hyperlink.URL.query` string.
    """
    value = self._path
    return value
@property
def query(self):
    # type: () -> QueryPairs
    """The query as a tuple of ``(key, value)`` pairs, obtained by
    splitting the ampersand-separated query string. It carries the
    non-hierarchical data identifying the resource. Keys are always
    text; a value is text when present and None when missing.

    To work with the pairs as a mapping, see
    :meth:`~hyperlink.URL.get()`, :meth:`~hyperlink.URL.add()`,
    :meth:`~hyperlink.URL.set()`, and
    :meth:`~hyperlink.URL.delete()`.
    """
    value = self._query
    return value
@property
def fragment(self):
    # type: () -> Text
    """The fragment, as text: the final part of the URL, introduced by
    the first "#" following the :attr:`~hyperlink.URL.path` or
    :attr:`~hyperlink.URL.query`. It indirectly identifies a secondary
    resource, such as an anchor inside an HTML page.
    """
    value = self._fragment
    return value
@property
def rooted(self):
    # type: () -> bool
    """True when the path begins with a forward slash (``/``).

    The name follows the BNF grammar's "path-rootless" rule, because
    "absolute path" and "absolute URI" are somewhat ambiguous terms.
    The implicit leading ``"/"`` is not stored in :attr:`path`, since
    that is somewhat awkward to work with.
    """
    value = self._rooted
    return value
@property
def userinfo(self):
    # type: () -> Text
    """The username-password combination, as a single colon-separated
    text value.
    """
    value = self._userinfo
    return value
@property
def uses_netloc(self):
    # type: () -> Optional[bool]
    """
    Whether ``://`` (the "netloc separator") is rendered between the
    scheme and the *path* when the URL has no host.
    """
    value = self._uses_netloc
    return value
@property
def user(self):
    # type: () -> Text
    """
    The user portion of :attr:`~hyperlink.URL.userinfo`, i.e. everything
    before its first colon (the whole value when there is no colon).
    """
    name, _sep, _rest = self.userinfo.partition(u":")
    return name
def authority(self, with_password=False, **kw):
    # type: (bool, Any) -> Text
    """Build the userinfo/host/port combination for this URL.

        >>> url = URL.from_text(u'http://user:pass@localhost:8080/a/b?x=y')
        >>> url.authority()
        u'user:@localhost:8080'
        >>> url.authority(with_password=True)
        u'user:pass@localhost:8080'

    Args:
        with_password: Whether the returned text includes the password
            from the URL, if one is set.
            Defaults to False.

    Returns:
        Text: The authority (network location and user information)
        portion of the URL.
    """
    # first, a bit of twisted compat: accept the legacy keyword name
    with_password = kw.pop("includeSecrets", with_password)
    if kw:
        raise TypeError("got unexpected keyword arguments: %r" % kw.keys())

    host = self.host
    # A colon inside the host gets square brackets so it cannot be
    # confused with the host/port separator.
    netloc = ["[" + host + "]"] if ":" in host else [self.host]
    if self.port != SCHEME_PORT_MAP.get(self.scheme):
        # Only a port differing from the scheme default is rendered.
        netloc.append(Text(self.port))
    hostport = u":".join(netloc)

    userinfo = self.userinfo
    if not userinfo:
        return u"@".join([hostport])
    if not with_password and u":" in userinfo:
        # Keep the user name and the colon, drop what follows it.
        userinfo = userinfo[: userinfo.index(u":") + 1]
    return u"@".join([userinfo, hostport])
1230 def __eq__(self, other):
1231 # type: (Any) -> bool
1232 if not isinstance(other, self.__class__):
1233 return NotImplemented
1234 for attr in [
1235 "scheme",
1236 "userinfo",
1237 "host",
1238 "query",
1239 "fragment",
1240 "port",
1241 "uses_netloc",
1242 "rooted",
1243 ]:
1244 if getattr(self, attr) != getattr(other, attr):
1245 return False
1246 if self.path == other.path or (
1247 self.path in _ROOT_PATHS and other.path in _ROOT_PATHS
1248 ):
1249 return True
1250 return False
1252 def __ne__(self, other):
1253 # type: (Any) -> bool
1254 if not isinstance(other, self.__class__):
1255 return NotImplemented
1256 return not self.__eq__(other)
def __hash__(self):
    # type: () -> int
    """Hash over the same parts that :meth:`__eq__` compares.

    ``__eq__`` considers any two paths that are both members of
    ``_ROOT_PATHS`` to be equal, so those paths are collapsed to a single
    canonical value before hashing. Otherwise two URLs that compare equal
    could land in different hash buckets and break set/dict lookups.
    """
    path = self.path
    if path in _ROOT_PATHS:
        # Any fixed stand-in works; it only has to be shared by all
        # root-equivalent paths.
        path = ()
    return hash(
        (
            self.__class__,
            self.scheme,
            self.userinfo,
            self.host,
            path,
            self.query,
            self.fragment,
            self.port,
            self.rooted,
            self.uses_netloc,
        )
    )
@property
def absolute(self):
    # type: () -> bool
    """True when the URL is "absolute", i.e. complete enough to resolve
    to a network resource without a base URI to be relative to.

        >>> URL.from_text(u'http://wikipedia.org/').absolute
        True
        >>> URL.from_text(u'?a=b&c=d').absolute
        False

    Absolute URLs must have both a scheme and a host set.
    """
    if not self.scheme:
        return False
    return bool(self.host)
def replace(
    self,
    scheme=_UNSET,  # type: Optional[Text]
    host=_UNSET,  # type: Optional[Text]
    path=_UNSET,  # type: Iterable[Text]
    query=_UNSET,  # type: QueryParameters
    fragment=_UNSET,  # type: Text
    port=_UNSET,  # type: Optional[int]
    rooted=_UNSET,  # type: Optional[bool]
    userinfo=_UNSET,  # type: Text
    uses_netloc=_UNSET,  # type: Optional[bool]
):
    # type: (...) -> URL
    """Return a copy of this URL with the given parts changed.

    :class:`URL` objects are immutable: attributes are meant to be set
    once, at construction. To "modify" a URL, create a copy carrying
    the desired changes.

    Every argument that is omitted keeps the value of the current URL.

    Args:
        scheme: The text name of the scheme.
        host: The host portion of the network location.
        path: A tuple of strings representing the slash-separated parts of
            the path.
        query: The query parameters, as a dictionary or as an sequence of
            key-value pairs.
        fragment: The fragment part of the URL.
        port: The port part of the network location.
        rooted: Whether or not the path begins with a slash.
        userinfo: The username or colon-separated username:password pair.
        uses_netloc: Indicates whether ``://`` (the "netloc separator")
            will appear to separate the scheme from the *path* in cases
            where no host is present.
            Setting this to ``True`` is a non-spec-compliant affordance for
            the common practice of having URIs that are *not* URLs (cannot
            have a 'host' part) but nevertheless use the common ``://``
            idiom that most people associate with URLs; e.g. ``message:``
            URIs like ``message://message-id`` being equivalent to
            ``message:message-id``.
            This may be inferred based on the scheme depending on whether
            :func:`register_scheme` has been used to register the scheme
            and should not be passed directly unless you know the scheme
            works like this and you know it has not been registered.

    Returns:
        URL: A copy of the current :class:`URL`, with new values for
        parameters passed.
    """
    scheme_changed = scheme is not _UNSET and scheme != self.scheme
    if scheme_changed:
        # when changing schemes, reset the explicit uses_netloc preference
        # to honor the new scheme.
        uses_netloc = None

    requested = (
        ("scheme", scheme),
        ("host", host),
        ("path", path),
        ("query", query),
        ("fragment", fragment),
        ("port", port),
        ("rooted", rooted),
        ("userinfo", userinfo),
        ("uses_netloc", uses_netloc),
    )
    # Each part falls back to this URL's current value via _optional.
    merged = {}
    for part_name, new_value in requested:
        merged[part_name] = _optional(new_value, getattr(self, part_name))
    return self.__class__(**merged)
@classmethod
def from_text(cls, text):
    # type: (Text) -> URL
    """Parse a whole URL from its string form.

    The :class:`URL` constructor builds URLs from parts;
    :meth:`~URL.from_text` is its counterpart for complete strings::

        >>> URL.from_text(u'http://example.com')
        URL.from_text(u'http://example.com')
        >>> URL.from_text(u'?a=b&x=y')
        URL.from_text(u'?a=b&x=y')

    As you can see above, it's also used as the :func:`repr` of
    :class:`URL` objects. The natural counterpart to
    :func:`~URL.to_text()`. This method only accepts *text*, so be
    sure to decode those bytestrings.

    Args:
        text: A valid URL string.

    Returns:
        URL: The structured object version of the parsed string.

    .. note::

        Somewhat unexpectedly, URLs are a far more permissive
        format than most would assume. Many strings which don't
        look like URLs are still valid URLs. As a result, this
        method only raises :class:`URLParseError` on invalid port
        and IPv6 values in the host portion of the URL.
    """
    url_match = _URL_RE.match(_textcheck("text", text))
    if url_match is None:
        raise URLParseError("could not parse url: %r" % text)
    url_parts = url_match.groupdict()

    # The authority is split further by a second pattern.
    au_text = url_parts["authority"] or u""
    au_match = _AUTHORITY_RE.match(au_text)
    if au_match is None:
        raise URLParseError(
            "invalid authority %r in url: %r" % (au_text, text)
        )
    au_parts = au_match.groupdict()
    if au_parts["bad_host"]:
        raise URLParseError(
            "invalid host %r in url: %r" % (au_parts["bad_host"], text)
        )

    userinfo = au_parts["userinfo"] or u""
    host = au_parts["ipv6_host"] or au_parts["plain_host"]

    port = au_parts["port"]
    if port is not None:
        try:
            port = int(port)  # type: ignore[assignment]  # FIXME, see below
        except ValueError:
            if not port:  # TODO: excessive?
                raise URLParseError("port must not be empty: %r" % au_text)
            raise URLParseError("expected integer for port, not %r" % port)

    scheme = url_parts["scheme"] or u""
    fragment = url_parts["fragment"] or u""
    uses_netloc = bool(url_parts["_netloc_sep"])

    raw_path = url_parts["path"]
    if raw_path:
        path = tuple(raw_path.split(u"/"))
        # An empty first segment means the text started with a slash.
        rooted = not path[0]
        if rooted:
            path = path[1:]
    else:
        path = ()
        rooted = bool(au_text)

    raw_query = url_parts["query"]
    if raw_query:
        pairs = []
        for item in raw_query.split(u"&"):
            if u"=" in item:
                pairs.append(item.split(u"=", 1))
            else:
                # No "=" at all: the parameter has a name but no value.
                pairs.append((item, None))
        query = tuple(pairs)  # type: QueryPairs
    else:
        query = ()

    return cls(
        scheme,
        host,
        path,
        query,
        fragment,
        port,  # type: ignore[arg-type]  # FIXME, see above
        rooted,
        userinfo,
        uses_netloc,
    )
def normalize(
    self,
    scheme=True,
    host=True,
    path=True,
    query=True,
    fragment=True,
    userinfo=True,
    percents=True,
):
    # type: (bool, bool, bool, bool, bool, bool, bool) -> URL
    """Return a new URL object with several standard normalizations
    applied:

    * Decode unreserved characters (`RFC 3986 2.3`_)
    * Uppercase remaining percent-encoded octets (`RFC 3986 2.1`_)
    * Convert scheme and host casing to lowercase (`RFC 3986 3.2.2`_)
    * Resolve any "." and ".." references in the path (`RFC 3986 6.2.2.3`_)
    * Ensure an ending slash on URLs with an empty path (`RFC 3986 6.2.3`_)
    * Encode any stray percent signs (`%`) in percent-encoded
      fields (path, query, fragment, userinfo) (`RFC 3986 2.4`_)

    Every normalization is on by default; pass `False` for a part's
    name to leave that part untouched.

    Args:
        scheme: Convert the scheme to lowercase
        host: Convert the host to lowercase
        path: Normalize the path (see above for details)
        query: Normalize the query string
        fragment: Normalize the fragment
        userinfo: Normalize the userinfo
        percents: Encode isolated percent signs for any percent-encoded
            fields which are being normalized (defaults to `True`).

    >>> url = URL.from_text(u'Http://example.COM/a/../b/./c%2f?%61%')
    >>> print(url.normalize().to_text())
    http://example.com/b/c%2F?a%25

    .. _RFC 3986 3.2.2: https://tools.ietf.org/html/rfc3986#section-3.2.2
    .. _RFC 3986 2.3: https://tools.ietf.org/html/rfc3986#section-2.3
    .. _RFC 3986 2.1: https://tools.ietf.org/html/rfc3986#section-2.1
    .. _RFC 3986 6.2.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.2.3
    .. _RFC 3986 6.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.3
    .. _RFC 3986 2.4: https://tools.ietf.org/html/rfc3986#section-2.4
    """  # noqa: E501

    def _clean(target):
        # type: (Text) -> Text
        # Shared percent-encoding normalization for all text parts.
        return _decode_unreserved(
            target, normalize_case=True, encode_stray_percents=percents
        )

    changes = {}  # type: Dict[str, Any]
    if scheme:
        changes["scheme"] = self.scheme.lower()
    if host:
        changes["host"] = self.host.lower()
    if path:
        if not self.path:
            # An empty path becomes a single empty segment.
            changes["path"] = (u"",)
        else:
            changes["path"] = [
                _clean(segment)
                for segment in _resolve_dot_segments(self.path)
            ]
    if query:
        changes["query"] = [
            (_clean(key), _clean(val) if val else val)
            for key, val in self.query
        ]
    if fragment:
        changes["fragment"] = _clean(self.fragment)
    if userinfo:
        # Split once so a colon inside the password is left in place.
        user_parts = self.userinfo.split(":", 1)
        changes["userinfo"] = u":".join([_clean(p) for p in user_parts])

    return self.replace(**changes)
def child(self, *segments):
    # type: (Text) -> URL
    """Return a new :class:`URL` whose path is this URL's path with the
    given segments appended; every other part, including the query
    string and fragment, is preserved.

    For example::

        >>> url = URL.from_text(u'http://localhost/a/b?x=y')
        >>> child_url = url.child(u"c", u"d")
        >>> child_url.to_text()
        u'http://localhost/a/b/c/d?x=y'

    Args:
        segments: Additional parts to be joined and added to the path, like
            :func:`os.path.join`. Special characters in segments will be
            percent encoded.

    Returns:
        URL: A copy of the current URL with the extra path segments.
    """
    if not segments:
        return self

    checked = [_textcheck("path segment", seg) for seg in segments]
    base = tuple(self.path)
    if self.path and self.path[-1] == u"":
        # A trailing empty segment is replaced, not kept in front of
        # the new segments.
        base = base[:-1]
    extra = tuple(_encode_path_parts(checked, maximal=False))
    return self.replace(path=base + extra)
def sibling(self, segment):
    # type: (Text) -> URL
    """Return a new :class:`URL` whose last path segment is swapped for
    *segment*, making it a sibling of this URL's path.

    Args:
        segment: A single path segment.

    Returns:
        URL: A copy of the current URL with the last path segment
        replaced by *segment*. Special characters such as
        ``/?#`` will be percent encoded.
    """
    _textcheck("path segment", segment)
    parent = tuple(self.path)[:-1]
    encoded = _encode_path_part(segment)
    return self.replace(path=parent + (encoded,))
def click(self, href=u""):
    # type: (Union[Text, URL]) -> URL
    """Resolve the given URL relative to this URL.

    The resulting URI should match what a web browser would
    generate if you visited the current URL and clicked on *href*.

        >>> url = URL.from_text(u'http://blog.hatnote.com/')
        >>> url.click(u'/post/155074058790').to_text()
        u'http://blog.hatnote.com/post/155074058790'
        >>> url = URL.from_text(u'http://localhost/a/b/c/')
        >>> url.click(u'../d/./e').to_text()
        u'http://localhost/a/b/d/e'

    Args (Text):
        href: A string representing a clicked URL.

    Return:
        A copy of the current URL with navigation logic applied.

    Raises:
        NotImplementedError: If *href* has a scheme but a rootless path.

    For more information, see `RFC 3986 section 5`_.

    .. _RFC 3986 section 5: https://tools.ietf.org/html/rfc3986#section-5
    """
    if href:
        if isinstance(href, URL):
            clicked = href
        else:
            # TODO: This error message is not completely accurate,
            # as URL objects are now also valid, but Twisted's
            # test suite (wrongly) relies on this exact message.
            _textcheck("relative URL", href)
            clicked = URL.from_text(href)
        if clicked.absolute:
            return clicked
    else:
        clicked = self

    if clicked.scheme and not clicked.rooted:
        # Schemes with relative paths are not well-defined.  RFC 3986 calls
        # them a "loophole in prior specifications" that should be avoided,
        # or supported only for backwards compatibility.
        raise NotImplementedError(
            "absolute URI with rootless path: %r" % (href,)
        )

    query = clicked.query
    if clicked.rooted:
        path = clicked.path
    elif clicked.path:
        # Relative path: replace the last segment of the base path.
        path = tuple(self.path)[:-1] + tuple(clicked.path)
    else:
        path = self.path
        if not query:
            query = self.query

    if clicked.host:
        # The reference carries its own authority (e.g. ``//other/x``).
        # Per RFC 3986 5.2.2 the whole authority then comes from the
        # reference, so the base URL's port and userinfo must not be
        # carried over to a different host. A port of None lets the
        # constructor pick the default for the resulting scheme.
        port = clicked.port
        userinfo = clicked.userinfo
    else:
        port = clicked.port or self.port
        userinfo = self.userinfo

    return self.replace(
        scheme=clicked.scheme or self.scheme,
        host=clicked.host or self.host,
        port=port,
        userinfo=userinfo,
        path=_resolve_dot_segments(path),
        query=query,
        fragment=clicked.fragment,
    )
def to_uri(self):
    # type: () -> URL
    u"""Return a new :class:`URL` in which every non-ASCII character has
    been percent-encoded as appropriate. Useful as preparation for
    sending a :class:`URL` over a network protocol.

    For example::

        >>> URL.from_text(u'https://ايران.com/foo⇧bar/').to_uri()
        URL.from_text(u'https://xn--mgba3a4fra.com/foo%E2%87%A7bar/')

    Returns:
        URL: A new instance with its path segments, query parameters, and
        hostname encoded, so that they are all in the standard
        US-ASCII range.
    """
    # Split once so only the first colon separates user from password.
    user_parts = self.userinfo.split(":", 1)
    new_userinfo = u":".join([_encode_userinfo_part(p) for p in user_parts])

    new_path = _encode_path_parts(
        self.path, has_scheme=bool(self.scheme), rooted=False, maximal=True
    )

    if self.host:
        new_host = idna_encode(self.host, uts46=True).decode("ascii")
    else:
        # An empty host is passed through without IDNA encoding.
        new_host = self.host

    new_query = tuple(
        [
            (
                _encode_query_key(key, maximal=True),
                _encode_query_value(val, maximal=True)
                if val is not None
                else None,
            )
            for key, val in self.query
        ]
    )
    new_fragment = _encode_fragment_part(self.fragment, maximal=True)

    return self.replace(
        userinfo=new_userinfo,
        host=new_host,
        path=new_path,
        query=new_query,
        fragment=new_fragment,
    )
def to_iri(self):
    # type: () -> URL
    u"""Return a new :class:`URL` with all but a few reserved characters
    decoded into human-readable form.

    Percent-encoded Unicode and IDNA-encoded hostnames are
    decoded, like so::

        >>> url = URL.from_text(u'https://xn--mgba3a4fra.example.com/foo%E2%87%A7bar/')
        >>> print(url.to_iri().to_text())
        https://ايران.example.com/foo⇧bar/

    .. note::

        As a general Python issue, "narrow" (UCS-2) builds of
        Python may not be able to fully decode certain URLs, and
        in those cases, this method will return a best-effort,
        partially-decoded, URL which is still valid. This issue
        does not affect any Python builds 3.4+.

    Returns:
        URL: A new instance with its path segments, query parameters, and
        hostname decoded for display purposes.
    """  # noqa: E501
    # Split once so only the first colon separates user from password.
    user_parts = self.userinfo.split(":", 1)
    new_userinfo = u":".join([_decode_userinfo_part(p) for p in user_parts])
    host_text = _decode_host(self.host)

    new_path = [_decode_path_part(segment) for segment in self.path]
    new_query = tuple(
        (
            _decode_query_key(key),
            _decode_query_value(val) if val is not None else None,
        )
        for key, val in self.query
    )
    new_fragment = _decode_fragment_part(self.fragment)

    return self.replace(
        userinfo=new_userinfo,
        host=host_text,
        path=new_path,
        query=new_query,
        fragment=new_fragment,
    )
def to_text(self, with_password=False):
    # type: (bool) -> Text
    """Render this URL to its textual representation.

    By default, the URL text will *not* include a password, if one
    is set. RFC 3986 considers using URLs to represent such
    sensitive information as deprecated. Quoting from RFC 3986,
    `section 3.2.1`_:

        "Applications should not render as clear text any data after the
        first colon (":") character found within a userinfo subcomponent
        unless the data after the colon is the empty string (indicating no
        password)."

    Args (bool):
        with_password: Whether or not to include the password in the URL
            text. Defaults to False.

    Returns:
        Text: The serialized textual representation of this URL, such as
        ``u"http://example.com/some/path?some=query"``.

    The natural counterpart to :class:`URL.from_text()`.

    .. _section 3.2.1: https://tools.ietf.org/html/rfc3986#section-3.2.1
    """
    scheme = self.scheme
    authority = self.authority(with_password)
    path = "/".join(
        _encode_path_parts(
            self.path,
            rooted=self.rooted,
            has_scheme=bool(scheme),
            has_authority=bool(authority),
            maximal=False,
        )
    )

    def _render_pair(key, val):
        # type: (Text, Optional[Text]) -> Text
        # A None value renders as a bare key, with no "=".
        encoded_key = _encode_query_key(key, maximal=False)
        if val is None:
            return encoded_key
        return u"=".join(
            (encoded_key, _encode_query_value(val, maximal=False))
        )

    query_string = u"&".join([_render_pair(k, v) for k, v in self.query])
    fragment = self.fragment

    pieces = []  # type: List[Text]
    if scheme:
        pieces.extend((scheme, ":"))
    if authority:
        pieces.extend(("//", authority))
    elif scheme and path[:2] != "//" and self.uses_netloc:
        # No authority, but the netloc separator is still wanted.
        pieces.append("//")
    if path:
        if scheme and authority and path[:1] != "/":
            pieces.append("/")  # relpaths with abs authorities auto get '/'
        pieces.append(path)
    if query_string:
        pieces.extend(("?", query_string))
    if fragment:
        pieces.extend(("#", fragment))
    return u"".join(pieces)
1812 def __repr__(self):
1813 # type: () -> str
1814 """Convert this URL to an representation that shows all of its
1815 constituent parts, as well as being a valid argument to
1816 :func:`eval`.
1817 """
1818 return "%s.from_text(%r)" % (self.__class__.__name__, self.to_text())
1820 def _to_bytes(self):
1821 # type: () -> bytes
1822 """
1823 Allows for direct usage of URL objects with libraries like
1824 requests, which automatically stringify URL parameters. See
1825 issue #49.
1826 """
1827 return self.to_uri().to_text().encode("ascii")
# Pick the string-conversion hooks for the running Python major version:
# Python 3 gets text from str() and ASCII bytes from bytes(); Python 2
# gets bytes from str() and text from unicode().
if not PY2:
    __bytes__ = _to_bytes
    __str__ = to_text
else:
    __str__ = _to_bytes
    __unicode__ = to_text

# # Begin Twisted Compat Code
# camelCase aliases for the snake_case methods defined above.
asURI = to_uri
asIRI = to_iri
@classmethod
def fromText(cls, s):
    # type: (Text) -> URL
    """Twisted-compatible alias for :meth:`from_text`."""
    parsed = cls.from_text(s)
    return parsed
def asText(self, includeSecrets=False):
    # type: (bool) -> Text
    """Twisted-compatible alias for :meth:`to_text`; *includeSecrets*
    is forwarded as ``with_password``.
    """
    rendered = self.to_text(with_password=includeSecrets)
    return rendered
1849 def __dir__(self):
1850 # type: () -> Sequence[Text]
1851 try:
1852 ret = object.__dir__(self)
1853 except AttributeError:
1854 # object.__dir__ == AttributeError # pdw for py2
1855 ret = dir(self.__class__) + list(self.__dict__.keys())
1856 ret = sorted(set(ret) - set(["fromText", "asURI", "asIRI", "asText"]))
1857 return ret
1859 # # End Twisted Compat Code
def add(self, name, value=None):
    # type: (Text, Optional[Text]) -> URL
    """Return a new :class:`URL` with the query argument *name*, carrying
    *value*, appended after the existing parameters, like so::

        >>> URL.from_text(u'https://example.com/?x=y').add(u'x')
        URL.from_text(u'https://example.com/?x=y&x')
        >>> URL.from_text(u'https://example.com/?x=y').add(u'x', u'z')
        URL.from_text(u'https://example.com/?x=y&x=z')

    Args:
        name: The name of the query parameter to add.
            The part before the ``=``.
        value: The value of the query parameter to add.
            The part after the ``=``.
            Defaults to ``None``, meaning no value.

    Returns:
        URL: A new :class:`URL` instance with the parameter added.
    """
    new_pair = (name, value)
    extended = self.query + (new_pair,)
    return self.replace(query=extended)
def set(self, name, value=None):
    # type: (Text, Optional[Text]) -> URL
    """Make a new :class:`URL` instance with the query parameter *name*
    set to *value*. All existing occurrences, if any, are replaced
    by the single name-value pair, which takes the position of the first
    occurrence. If *name* is not present yet, the pair is appended after
    the existing parameters.

        >>> URL.from_text(u'https://example.com/?x=y').set(u'x')
        URL.from_text(u'https://example.com/?x')
        >>> URL.from_text(u'https://example.com/?x=y').set(u'x', u'z')
        URL.from_text(u'https://example.com/?x=z')

    Args:
        name: The name of the query parameter to set.
            The part before the ``=``.
        value: The value of the query parameter to set.
            The part after the ``=``.
            Defaults to ``None``, meaning no value.

    Returns:
        URL: A new :class:`URL` instance with the parameter set.
    """
    others = [(k, v) for (k, v) in self.query if k != name]
    # Position of the first occurrence in the original query. Everything
    # before it is kept, so the index is also valid in ``others``.
    first_idx = next(
        (i for (i, (k, _)) in enumerate(self.query) if k == name), None
    )
    if first_idx is None:
        # New key: append. (Using -1 as the "not found" marker and slicing
        # with it would insert before the last parameter instead.)
        others.append((name, value))
    else:
        # Preserve the original position of the query key in the list
        others.insert(first_idx, (name, value))
    return self.replace(query=others)
def get(self, name):
    # type: (Text) -> List[Optional[Text]]
    """Return every value of the query parameter *name*, in order::

        >>> url = URL.from_text(u'?x=1&x=2')
        >>> url.get('x')
        [u'1', u'2']
        >>> url.get('y')
        []

    A list is always returned: it is empty when *name* is not set, and
    this method raises no exceptions.

    Args:
        name: The name of the query parameter to get.

    Returns:
        List[Optional[Text]]: A list of all the values associated with the
        key, in string form.
    """
    found = []
    for key, value in self.query:
        if name == key:
            found.append(value)
    return found
def remove(
    self,
    name,  # type: Text
    value=_UNSET,  # type: Text
    limit=None,  # type: Optional[int]
):
    # type: (...) -> URL
    """Return a new :class:`URL` without the query parameters named
    *name*, or, when *value* is given, without those matching both
    *name* and *value*. Nothing is raised if no such parameter is set.

    Args:
        name: The name of the query parameter to remove.
        value: Optional value to additionally filter on.
            Setting this removes query parameters which match both name
            and value.
        limit: Optional maximum number of parameters to remove.

    Returns:
        URL: A new :class:`URL` instance with the parameter removed.
    """

    def _is_target(key, val):
        # type: (Text, Optional[Text]) -> bool
        # With no value filter, a name match alone is enough.
        return key == name and (value is _UNSET or val == value)

    if limit is None:
        kept = [(k, v) for (k, v) in self.query if not _is_target(k, v)]
    else:
        kept = []
        dropped = 0
        for k, v in self.query:
            if _is_target(k, v) and dropped < limit:
                dropped += 1  # drop it
                continue
            kept.append((k, v))  # keep it

    return self.replace(query=kept)
EncodedURL = URL  # An alias better describing what the URL really is

# A URL built with no arguments; it serves as the default ``url`` argument
# of DecodedURL.__init__.
_EMPTY_URL = URL()
1986def _replace_plus(text):
1987 # type: (Text) -> Text
1988 return text.replace("+", "%20")
1991def _no_op(text):
1992 # type: (Text) -> Text
1993 return text
class DecodedURL(object):
    """
    :class:`DecodedURL` is a type designed to act as a higher-level
    interface to :class:`URL` and the recommended type for most
    operations. By analogy, :class:`DecodedURL` is the
    :class:`unicode` to URL's :class:`bytes`.

    :class:`DecodedURL` automatically handles encoding and decoding
    all its components, such that all inputs and outputs are in a
    maximally-decoded state.  Note that this means, for some special
    cases, a URL may not "roundtrip" character-for-character, but this
    is considered a good tradeoff for the safety of automatic
    encoding.

    Otherwise, :class:`DecodedURL` has almost exactly the same API as
    :class:`URL`.

    Where applicable, a UTF-8 encoding is presumed. Be advised that
    some interactions can raise :exc:`UnicodeEncodeError` and
    :exc:`UnicodeDecodeError`, just like when working with
    bytestrings. Examples of such interactions include handling query
    strings encoding binary data, and paths containing segments with
    special characters encoded with codecs other than UTF-8.

    Args:
        url: A :class:`URL` object to wrap.
        lazy: Set to True to avoid pre-decoding all parts of the URL to
            check for validity.
            Defaults to False.
        query_plus_is_space: + characters in the query string should be
            treated as spaces when decoding.  If unspecified, the default
            is taken from the scheme.

    .. note::

        The :class:`DecodedURL` initializer takes a :class:`URL` object,
        not URL components, like :class:`URL`. To programmatically
        construct a :class:`DecodedURL`, you can use this pattern:

        >>> print(DecodedURL().replace(scheme=u'https',
        ... host=u'pypi.org', path=(u'projects', u'hyperlink')).to_text())
        https://pypi.org/projects/hyperlink

    .. versionadded:: 18.0.0
    """

    def __init__(self, url=_EMPTY_URL, lazy=False, query_plus_is_space=None):
        # type: (URL, bool, Optional[bool]) -> None
        self._url = url
        if query_plus_is_space is None:
            # No explicit choice was made: "+" means space unless the
            # scheme is listed in NO_QUERY_PLUS_SCHEMES.
            query_plus_is_space = url.scheme not in NO_QUERY_PLUS_SCHEMES
        self._query_plus_is_space = query_plus_is_space
        if not lazy:
            # cache the following, while triggering any decoding
            # issues with decodable fields
            self.host, self.userinfo, self.path, self.query, self.fragment

    @classmethod
    def from_text(cls, text, lazy=False, query_plus_is_space=None):
        # type: (Text, bool, Optional[bool]) -> DecodedURL
        """\
        Make a `DecodedURL` instance from any text string containing a URL.

        Args:
            text: Text containing the URL
            lazy: Set to True to skip pre-decoding all parts of the URL
                to check for validity.
                Defaults to False.
            query_plus_is_space: Whether + characters in the query string
                are treated as spaces when decoding.  If unspecified
                (None), the default is taken from the scheme.
        """
        _url = URL.from_text(text)
        return cls(_url, lazy=lazy, query_plus_is_space=query_plus_is_space)

    @property
    def encoded_url(self):
        # type: () -> URL
        """Access the underlying :class:`URL` object, which has any special
        characters encoded.
        """
        return self._url

    def to_text(self, with_password=False):
        # type: (bool) -> Text
        "Passthrough to :meth:`~hyperlink.URL.to_text()`"
        return self._url.to_text(with_password)

    def to_uri(self):
        # type: () -> URL
        "Passthrough to :meth:`~hyperlink.URL.to_uri()`"
        return self._url.to_uri()

    def to_iri(self):
        # type: () -> URL
        "Passthrough to :meth:`~hyperlink.URL.to_iri()`"
        return self._url.to_iri()

    def _clone(self, url):
        # type: (URL) -> DecodedURL
        """Wrap *url* in a new instance of this class, carrying over this
        instance's ``query_plus_is_space`` setting.
        """
        return self.__class__(
            url,
            # TODO: propagate laziness?
            query_plus_is_space=self._query_plus_is_space,
        )

    def click(self, href=u""):
        # type: (Union[Text, URL, DecodedURL]) -> DecodedURL
        """Return a new DecodedURL wrapping the result of
        :meth:`~hyperlink.URL.click()`
        """
        if isinstance(href, DecodedURL):
            # URL.click() is handed the wrapped URL, not the wrapper.
            href = href._url
        return self._clone(
            self._url.click(href=href),
        )

    def sibling(self, segment):
        # type: (Text) -> DecodedURL
        """Automatically encode any reserved characters in *segment* and
        return a new `DecodedURL` wrapping the result of
        :meth:`~hyperlink.URL.sibling()`
        """
        return self._clone(
            self._url.sibling(_encode_reserved(segment)),
        )

    def child(self, *segments):
        # type: (Text) -> DecodedURL
        """Automatically encode any reserved characters in *segments* and
        return a new `DecodedURL` wrapping the result of
        :meth:`~hyperlink.URL.child()`.

        When called without segments, this very instance is returned.
        """
        if not segments:
            return self
        new_segs = [_encode_reserved(s) for s in segments]
        return self._clone(self._url.child(*new_segs))

    def normalize(
        self,
        scheme=True,
        host=True,
        path=True,
        query=True,
        fragment=True,
        userinfo=True,
        percents=True,
    ):
        # type: (bool, bool, bool, bool, bool, bool, bool) -> DecodedURL
        """Return a new `DecodedURL` wrapping the result of
        :meth:`~hyperlink.URL.normalize()`
        """
        return self._clone(
            self._url.normalize(
                scheme, host, path, query, fragment, userinfo, percents
            )
        )

    @property
    def absolute(self):
        # type: () -> bool
        "Passthrough to the wrapped URL's ``absolute`` attribute."
        return self._url.absolute

    @property
    def scheme(self):
        # type: () -> Text
        "Passthrough to the wrapped URL's ``scheme`` attribute."
        return self._url.scheme

    @property
    def host(self):
        # type: () -> Text
        "The wrapped URL's host, passed through ``_decode_host``."
        return _decode_host(self._url.host)

    @property
    def port(self):
        # type: () -> Optional[int]
        "Passthrough to the wrapped URL's ``port`` attribute."
        return self._url.port

    @property
    def rooted(self):
        # type: () -> bool
        "Passthrough to the wrapped URL's ``rooted`` attribute."
        return self._url.rooted

    @property
    def path(self):
        # type: () -> Sequence[Text]
        """Tuple of percent-decoded path segments of the wrapped URL.

        Computed on first access and cached on the instance.
        """
        if not hasattr(self, "_path"):
            self._path = tuple(
                [
                    _percent_decode(p, raise_subencoding_exc=True)
                    for p in self._url.path
                ]
            )
        return self._path

    @property
    def query(self):
        # type: () -> QueryPairs
        """Tuple of percent-decoded ``(name, value)`` query pairs.

        ``None`` names/values are kept as ``None``.  When this instance
        treats ``+`` as a space, each ``+`` is rewritten to ``%20`` before
        percent-decoding.  Computed on first access and cached on the
        instance.
        """
        if not hasattr(self, "_query"):
            if self._query_plus_is_space:
                predecode = _replace_plus
            else:
                predecode = _no_op

            self._query = cast(
                QueryPairs,
                tuple(
                    tuple(
                        _percent_decode(
                            predecode(x), raise_subencoding_exc=True
                        )
                        if x is not None
                        else None
                        for x in (k, v)
                    )
                    for k, v in self._url.query
                ),
            )
        return self._query

    @property
    def fragment(self):
        # type: () -> Text
        """Percent-decoded fragment of the wrapped URL.

        Computed on first access and cached on the instance.
        """
        if not hasattr(self, "_fragment"):
            frag = self._url.fragment
            self._fragment = _percent_decode(frag, raise_subencoding_exc=True)
        return self._fragment

    @property
    def userinfo(self):
        # type: () -> Union[Tuple[str], Tuple[str, str]]
        """Percent-decoded userinfo as a 1- or 2-tuple.

        The wrapped URL's userinfo text is split on the first ``:`` only,
        so anything after it stays in the second element.  Computed on
        first access and cached on the instance.
        """
        if not hasattr(self, "_userinfo"):
            self._userinfo = cast(
                Union[Tuple[str], Tuple[str, str]],
                tuple(
                    _percent_decode(p, raise_subencoding_exc=True)
                    for p in self._url.userinfo.split(":", 1)
                ),
            )
        return self._userinfo

    @property
    def user(self):
        # type: () -> Text
        "First element of :attr:`userinfo`."
        return self.userinfo[0]

    @property
    def uses_netloc(self):
        # type: () -> Optional[bool]
        "Passthrough to the wrapped URL's ``uses_netloc`` attribute."
        return self._url.uses_netloc

    def replace(
        self,
        scheme=_UNSET,  # type: Optional[Text]
        host=_UNSET,  # type: Optional[Text]
        path=_UNSET,  # type: Iterable[Text]
        query=_UNSET,  # type: QueryParameters
        fragment=_UNSET,  # type: Text
        port=_UNSET,  # type: Optional[int]
        rooted=_UNSET,  # type: Optional[bool]
        userinfo=_UNSET,  # type: Union[Tuple[str], Tuple[str, str]]
        uses_netloc=_UNSET,  # type: Optional[bool]
    ):
        # type: (...) -> DecodedURL
        """While the signature is the same, this `replace()` differs a little
        from URL.replace. For instance, it accepts userinfo as a
        tuple, not as a string, handling the case of having a username
        containing a `:`. As with the rest of the methods on
        DecodedURL, if you pass a reserved character, it will be
        automatically encoded instead of an error being raised.

        Raises:
            ValueError: if *userinfo* has more than two elements.
        """
        if path is not _UNSET:
            path = tuple(_encode_reserved(p) for p in path)
        if query is not _UNSET:
            query = cast(
                QueryPairs,
                tuple(
                    tuple(
                        _encode_reserved(x) if x is not None else None
                        for x in (k, v)
                    )
                    for k, v in iter_pairs(query)
                ),
            )
        if userinfo is not _UNSET:
            if len(userinfo) > 2:
                raise ValueError(
                    'userinfo expected sequence of ["user"] or'
                    ' ["user", "password"], got %r' % (userinfo,)
                )
            # URL.replace() takes userinfo as a single "user:password"
            # text, so encode each part and join them.
            userinfo_text = u":".join([_encode_reserved(p) for p in userinfo])
        else:
            userinfo_text = _UNSET
        new_url = self._url.replace(
            scheme=scheme,
            host=host,
            path=path,
            query=query,
            fragment=fragment,
            port=port,
            rooted=rooted,
            userinfo=userinfo_text,
            uses_netloc=uses_netloc,
        )
        return self._clone(url=new_url)

    def get(self, name):
        # type: (Text) -> List[Optional[Text]]
        "Get the value of all query parameters whose name matches *name*"
        return [v for (k, v) in self.query if name == k]

    def add(self, name, value=None):
        # type: (Text, Optional[Text]) -> DecodedURL
        """Return a new DecodedURL with the query parameter *name* and *value*
        added."""
        return self.replace(query=self.query + ((name, value),))

    def set(self, name, value=None):
        # type: (Text, Optional[Text]) -> DecodedURL
        """Return a new DecodedURL with query parameter *name* set to *value*.

        Every existing occurrence of *name* is dropped and a single
        ``(name, value)`` pair is put where the first occurrence was.  If
        *name* was not present, the pair is appended at the end.
        """
        query = self.query
        q = [(k, v) for (k, v) in query if k != name]
        # Every pair ahead of the first match survives the filter above, so
        # the match's index in ``query`` is also valid in ``q``.  The
        # fallback must be len(q): a fallback of -1 would make the slice
        # assignment below insert *before* the last pair instead of
        # appending.
        idx = next(
            (i for (i, (k, v)) in enumerate(query) if k == name), len(q)
        )
        q[idx:idx] = [(name, value)]
        return self.replace(query=q)

    def remove(
        self,
        name,  # type: Text
        value=_UNSET,  # type: Text
        limit=None,  # type: Optional[int]
    ):
        # type: (...) -> DecodedURL
        """Return a new DecodedURL with query parameter *name* removed.

        Optionally also filter for *value*, as well as cap the number
        of parameters removed with *limit*.
        """
        if limit is None:
            if value is _UNSET:
                nq = [(k, v) for (k, v) in self.query if k != name]
            else:
                nq = [
                    (k, v)
                    for (k, v) in self.query
                    if not (k == name and v == value)
                ]
        else:
            nq, removed_count = [], 0
            for k, v in self.query:
                if (
                    k == name
                    and (value is _UNSET or v == value)
                    and removed_count < limit
                ):
                    removed_count += 1  # drop it
                else:
                    nq.append((k, v))  # keep it

        return self.replace(query=nq)

    def __repr__(self):
        # type: () -> str
        cn = self.__class__.__name__
        return "%s(url=%r)" % (cn, self._url)

    def __str__(self):
        # type: () -> str
        # TODO: the underlying URL's __str__ needs to change to make
        # this work as the URL, see #55
        return str(self._url)

    def __eq__(self, other):
        # type: (Any) -> bool
        # Equality is decided on the normalized URI forms of both sides.
        if not isinstance(other, self.__class__):
            return NotImplemented
        return self.normalize().to_uri() == other.normalize().to_uri()

    def __ne__(self, other):
        # type: (Any) -> bool
        if not isinstance(other, self.__class__):
            return NotImplemented
        return not self.__eq__(other)

    def __hash__(self):
        # type: () -> int
        # NOTE(review): __eq__ compares *normalized* URIs while this hashes
        # the un-normalized components, so two instances that are equal
        # only after normalization may hash differently -- confirm whether
        # that is intended.
        return hash(
            (
                self.__class__,
                self.scheme,
                self.userinfo,
                self.host,
                self.path,
                self.query,
                self.fragment,
                self.port,
                self.rooted,
                self.uses_netloc,
            )
        )

    # # Begin Twisted Compat Code
    asURI = to_uri
    asIRI = to_iri

    @classmethod
    def fromText(cls, s, lazy=False):
        # type: (Text, bool) -> DecodedURL
        "camelCase alias of :meth:`from_text`."
        return cls.from_text(s, lazy=lazy)

    def asText(self, includeSecrets=False):
        # type: (bool) -> Text
        "camelCase alias of :meth:`to_text`."
        return self.to_text(with_password=includeSecrets)

    def __dir__(self):
        # type: () -> Sequence[Text]
        # Hide the camelCase compatibility aliases from dir() output.
        try:
            ret = object.__dir__(self)
        except AttributeError:
            # object.__dir__ == AttributeError  # pdw for py2
            ret = dir(self.__class__) + list(self.__dict__.keys())
        ret = sorted(set(ret) - set(["fromText", "asURI", "asIRI", "asText"]))
        return ret

    # # End Twisted Compat Code
# The overloads below exist only so that type checkers can derive a more
# precise return type for parse() from the *decoded* argument.
@overload
def parse(url, decoded, lazy=False):
    # type: (Text, Literal[False], bool) -> URL
    """Passing decoded=False returns URL."""


@overload
def parse(url, decoded=True, lazy=False):
    # type: (Text, Literal[True], bool) -> DecodedURL
    """Passing decoded=True (or the default value) returns DecodedURL."""


@overload
def parse(url, decoded=True, lazy=False):
    # type: (Text, bool, bool) -> Union[URL, DecodedURL]
    """If decoded is not a literal we don't know the return type."""


def parse(url, decoded=True, lazy=False):
    # type: (Text, bool, bool) -> Union[URL, DecodedURL]
    """
    Turn a text string into a structured URL object.

        >>> url = parse(u"https://github.com/python-hyper/hyperlink")
        >>> print(url.to_text())
        https://github.com/python-hyper/hyperlink

    Args:
        url: A text string representation of a URL.

        decoded: When true (the default), return a :class:`DecodedURL`,
            which automatically handles all
            encoding/decoding/quoting/unquoting for all the various
            accessors of parts of the URL.  When false, return a
            :class:`URL`, which has the same API, but requires handling
            of special characters for different parts of the URL.

        lazy: Only relevant with `decoded=True`: controls whether the
            URL is decoded immediately or as accessed. The default,
            `lazy=False`, checks all encoded parts of the URL for
            decodability.

    .. versionadded:: 18.0.0
    """
    encoded = EncodedURL.from_text(url)
    # Wrap only on request; otherwise hand back the encoded URL as parsed.
    return DecodedURL(encoded, lazy=lazy) if decoded else encoded