Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/validators/url.py: 98%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""URL."""
3# standard
4from functools import lru_cache
5import re
6from typing import Optional
7from urllib.parse import parse_qs, unquote, urlsplit
9# local
10from .hostname import hostname
11from .utils import validator
@lru_cache
def _username_regex():
    """Return the compiled pattern used to check the username part of a URL.

    The pattern is a case-insensitive alternation of three forms:

    * a first character taken from the extended-Latin ranges
      U+0100..U+017F and U+0180..U+024F;
    * a dot-atom, i.e. runs of atom characters separated by single dots;
    * a (possibly empty) run of ASCII characters from the listed ranges,
      or backslash escapes.

    The function takes no arguments, so ``lru_cache`` hands back the same
    compiled object on every call after the first.

    NOTE(review): only the third alternative is anchored on both ends. The
    first one has no ``$``, so with ``.match()`` it accepts any string whose
    first character is extended Latin, whatever follows. The pattern text is
    kept exactly as it was; confirm whether that is intended.

    Returns:
        (re.Pattern): The compiled username pattern.
    """
    # extended latin
    extended_latin = r"(^[\u0100-\u017F\u0180-\u024F]"
    # dot-atom
    dot_atom = r"|[-!#$%&'*+/=?^_`{}|~0-9a-z]+(\.[-!#$%&'*+/=?^_`{}|~0-9a-z]+)*$"
    # non-quoted-string
    non_quoted_string = r"|^([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\011.])*$)"
    return re.compile(extended_latin + dot_atom + non_quoted_string, re.IGNORECASE)
@lru_cache
def _path_regex():
    """Return the compiled pattern that a whole URL path must match.

    The pattern is a single, fully anchored, case-insensitive character
    class repeated one or more times. The class contains ASCII letters and
    digits, the listed punctuation (including ``/`` and ``%``), and the
    Unicode ranges named in the comments below.

    The function takes no arguments, so ``lru_cache`` hands back the same
    compiled object on every call after the first.

    Returns:
        (re.Pattern): The compiled path pattern.
    """
    # allowed symbols
    allowed_symbols = r"^[\/a-z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\%"
    # symbols / pictographs
    pictographs = r"\U0001F300-\U0001F5FF"
    # emoticons / emoji
    emoticons = r"\U0001F600-\U0001F64F"
    # multilingual unicode ranges
    multilingual = r"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF]+$"
    return re.compile(
        allowed_symbols + pictographs + emoticons + multilingual,
        re.IGNORECASE,
    )
def _validate_scheme(value: str):
    """Tell whether ``value`` is one of the URL schemes accepted here.

    The comparison is an exact, case-sensitive membership test.

    Args:
        value:
            Scheme component of a URL, without the trailing ``://``.

    Returns:
        (bool): `True` when `value` is a known scheme; `False` when it is
            empty or not in the list.
    """
    # More schemes will be considered later.
    # fmt: off
    known_schemes = {
        "ftp", "ftps", "git", "http", "https",
        "irc", "rtmp", "rtmps", "rtsp", "sftp",
        "ssh", "telnet",
    }
    # fmt: on
    # Falsy input is rejected before the membership test.
    return bool(value) and value in known_schemes
def _confirm_ipv6_skip(value: str, skip_ipv6_addr: bool):
    """Decide whether the IPv6 check should be skipped for ``value``.

    Args:
        value:
            Host portion of a netloc (may still carry brackets and a port).
        skip_ipv6_addr:
            Caller's request to skip the IPv6 check outright.

    Returns:
        (bool): `True` when the caller asked to skip, when `value` has
            fewer than two colons, or when it does not start with ``[``;
            `False` otherwise.
    """
    if skip_ipv6_addr:
        return skip_ipv6_addr
    # Fewer than two colons, or no opening bracket: not treated as IPv6.
    return value.count(":") < 2 or not value.startswith("[")
def _validate_auth_segment(value: str):
    """Validate authentication segment.

    `value` is the part of the netloc that precedes ``@``. It is handled
    according to how many colons it contains:

    * none: the whole segment is checked as a username;
    * one: it is split into username and password; the username is checked
      and the password must not contain ``/``, ``?``, ``#`` or ``@``;
    * more than one: the percent-decoded segment is checked as a username.

    Args:
        value:
            Authentication segment; may be empty.

    Returns:
        (bool): `True` if the segment is empty or passes the checks above,
            `False` otherwise. Every path returns a real `bool`, never a
            match object.
    """
    if not value:
        return True
    if (colon_count := value.count(":")) > 1:
        # everything before @ is then considered as a username
        # this is a bad practice, but syntactically valid URL
        return bool(_username_regex().match(unquote(value)))
    if colon_count < 1:
        return bool(_username_regex().match(value))
    username, password = value.rsplit(":", 1)
    if not _username_regex().match(username):
        return False
    return not any(forbidden in password for forbidden in ("/", "?", "#", "@"))
def _validate_netloc(
    value: str,
    skip_ipv6_addr: bool,
    skip_ipv4_addr: bool,
    may_have_port: bool,
    simple_host: bool,
    consider_tld: bool,
    private: Optional[bool],
    rfc_1034: bool,
    rfc_2782: bool,
):
    """Validate netloc.

    The netloc is split on its last ``@`` into an optional authentication
    segment and a host. The host is passed to `hostname` and the
    authentication segment to `_validate_auth_segment`.

    Args:
        value:
            Netloc component of the URL (``[auth@]host[:port]``).
        skip_ipv6_addr:
            Caller's request to skip the IPv6 check.
        skip_ipv4_addr, may_have_port, consider_tld, private, rfc_1034, rfc_2782:
            Forwarded unchanged to `hostname`.
        simple_host:
            Forwarded to `hostname` as `maybe_simple`.

    Returns:
        `False` if `value` is empty or contains more than one ``@``.
        Otherwise the falsy result of `hostname` when the host is rejected,
        else the result of `_validate_auth_segment`.
    """
    if not value or value.count("@") > 1:
        return False

    # Without an "@" the auth segment is empty and the whole netloc is the host.
    basic_auth, _, host = value.rpartition("@")

    skip_ipv6 = _confirm_ipv6_skip(host, skip_ipv6_addr)
    # The "]:" test looks at the host only, so text in the credentials
    # cannot influence how a bracketed address is handed to `hostname`.
    if skip_ipv6 or "]:" in host:
        host_to_check = host
    else:
        # Bracketed address without a port: strip the brackets.
        host_to_check = host.lstrip("[").replace("]", "", 1)

    return hostname(
        host_to_check,
        skip_ipv6_addr=skip_ipv6,
        skip_ipv4_addr=skip_ipv4_addr,
        may_have_port=may_have_port,
        maybe_simple=simple_host,
        consider_tld=consider_tld,
        private=private,
        rfc_1034=rfc_1034,
        rfc_2782=rfc_2782,
    ) and _validate_auth_segment(basic_auth)
def _validate_optionals(path: str, query: str, fragment: str, strict_query: bool):
    """Validate path query and fragments.

    Each component is checked only when it is non-empty:

    * `path` must match the pattern from `_path_regex`;
    * `query` is run through `parse_qs`; the parsed result is not used for
      the verdict, so a query is rejected only by `parse_qs` raising;
    * `fragment` must consist solely of the characters listed in the
      fragment pattern below.

    Args:
        path:
            Path component of the URL.
        query:
            Query component of the URL, without the leading ``?``.
        fragment:
            Fragment component of the URL, without the leading ``#``.
        strict_query:
            Passed to `parse_qs` as `strict_parsing`.

    Returns:
        (bool): `True` if the path and fragment checks both pass.

    Raises:
        ValueError: Propagated from `parse_qs` on a malformed query when
            `strict_query` is true.
    """
    path_ok = bool(_path_regex().match(path)) if path else True

    if query:
        try:
            # ref: https://github.com/python/cpython/issues/117109
            # The ";" pass runs only when the "&" pass produced fields.
            if parse_qs(query, strict_parsing=strict_query, separator="&"):
                parse_qs(query, strict_parsing=strict_query, separator=";")
        except TypeError:
            # for Python < v3.9.2 (official v3.10)
            parse_qs(query, strict_parsing=strict_query)

    fragment_ok = True
    if fragment:
        # See RFC3986 Section 3.5 Fragment for allowed characters
        # Adding "#", see https://github.com/python-validators/validators/issues/403
        fragment_ok = bool(
            re.fullmatch(r"[0-9a-z?/:@\-._~%!$&'()*+,;=#]*", fragment, re.IGNORECASE)
        )

    return path_ok and fragment_ok
@validator
def url(
    value: str,
    /,
    *,
    skip_ipv6_addr: bool = False,
    skip_ipv4_addr: bool = False,
    may_have_port: bool = True,
    simple_host: bool = False,
    strict_query: bool = True,
    consider_tld: bool = False,
    private: Optional[bool] = None,  # only for ip-addresses
    rfc_1034: bool = False,
    rfc_2782: bool = False,
):
    r"""Return whether or not given value is a valid URL.

    This validator was originally inspired from [URL validator of dperini][1].
    The following diagram is from [urlly][2]::


        foo://admin:hunter1@example.com:8042/over/there?name=ferret#nose
        \_/   \___/ \_____/ \_________/ \__/\_________/ \_________/ \__/
         |      |       |       |        |       |          |         |
      scheme username password hostname port    path      query    fragment

    [1]: https://gist.github.com/dperini/729294
    [2]: https://github.com/treeform/urlly

    Examples:
        >>> url('http://duck.com')
        # Output: True
        >>> url('ftp://foobar.dk')
        # Output: True
        >>> url('http://10.0.0.1')
        # Output: True
        >>> url('http://example.com/">user@example.com')
        # Output: ValidationError(func=url, ...)

    Args:
        value:
            URL string to validate.
        skip_ipv6_addr:
            When URL string cannot contain an IPv6 address.
        skip_ipv4_addr:
            When URL string cannot contain an IPv4 address.
        may_have_port:
            URL string may contain port number.
        simple_host:
            URL string maybe only hyphens and alpha-numerals.
        strict_query:
            Fail validation on query string parsing error.
        consider_tld:
            Restrict domain to TLDs allowed by IANA.
        private:
            Embedded IP address is public if `False`, private/local if `True`.
        rfc_1034:
            Allow trailing dot in domain/host name.
            Ref: [RFC 1034](https://www.rfc-editor.org/rfc/rfc1034).
        rfc_2782:
            Domain/Host name is of type service record.
            Ref: [RFC 2782](https://www.rfc-editor.org/rfc/rfc2782).

    Returns:
        (Literal[True]): If `value` is a valid url.
        (ValidationError): If `value` is an invalid url.
    """
    if not value or re.search(r"\s", value):
        # url must not contain any white
        # spaces, they must be encoded
        return False

    try:
        scheme, netloc, path, query, fragment = urlsplit(value)
    except ValueError:
        # urlsplit could not split the value, so it is not a valid URL
        return False

    # Checks run left to right and stop at the first falsy result.
    return (
        _validate_scheme(scheme)
        and _validate_netloc(
            netloc,
            skip_ipv6_addr,
            skip_ipv4_addr,
            may_have_port,
            simple_host,
            consider_tld,
            private,
            rfc_1034,
            rfc_2782,
        )
        and _validate_optionals(path, query, fragment, strict_query)
    )