Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/validators/url.py: 96%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""URL."""
3# standard
4from functools import lru_cache
5import re
6from typing import Callable, Optional
7from urllib.parse import parse_qs, unquote, urlsplit
9# local
10from .hostname import hostname
11from .utils import validator
@lru_cache
def _username_regex():
    """Build the compiled pattern used to check the username part of a URL.

    The pattern is compiled on the first call and memoised by `lru_cache`,
    so later calls return the very same pattern object.

    Returns:
        (re.Pattern[str]): Case-insensitive pattern made of three alternatives:
            extended latin, dot-atom and non-quoted-string.
    """
    # NOTE(review): this alternative has no end anchor, so with `.match()` it
    # only constrains the first character - confirm that this is intended.
    extended_latin = r"(^[\u0100-\u017F\u0180-\u024F]"
    dot_atom = r"|[-!#$%&'*+/=?^_`{}|~0-9a-z]+(\.[-!#$%&'*+/=?^_`{}|~0-9a-z]+)*$"
    non_quoted_string = r"|^([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\011.])*$)"
    return re.compile(extended_latin + dot_atom + non_quoted_string, re.IGNORECASE)
@lru_cache
def _path_regex():
    """Build the compiled pattern used to check the path component of a URL.

    The pattern is a single character class that must cover the whole path
    (one or more characters). It is compiled once and memoised by `lru_cache`.

    Returns:
        (re.Pattern[str]): Case-insensitive, fully anchored pattern.
    """
    allowed = (
        # allowed symbols
        r"\/a-z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\%",
        # symbols / pictographs
        r"\U0001F300-\U0001F5FF",
        # emoticons / emoji
        r"\U0001F600-\U0001F64F",
        # multilingual unicode ranges
        r"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF",
    )
    return re.compile("^[" + "".join(allowed) + "]+$", re.IGNORECASE)
def _validate_scheme(value: str):
    """Check a URL scheme against the built-in list of supported schemes.

    Args:
        value: Scheme exactly as extracted from the URL; the comparison is
            case-sensitive.

    Returns:
        (bool): `True` when `value` is one of the supported schemes,
            `False` otherwise (including when `value` is empty).
    """
    if not value:
        return False
    # More schemes will be considered later.
    # fmt: off
    supported_schemes = {
        "ftp",
        "ftps",
        "git",
        "http",
        "https",
        "irc",
        "rtmp",
        "rtmps",
        "rtsp",
        "sftp",
        "ssh",
        "telnet",
    }
    # fmt: on
    return value in supported_schemes
def _confirm_ipv6_skip(value: str, skip_ipv6_addr: bool):
    """Decide whether the IPv6 check should be skipped for a host string.

    Args:
        value: Host portion being inspected.
        skip_ipv6_addr: The caller's request to skip the IPv6 check.

    Returns:
        `skip_ipv6_addr` itself when it is truthy. Otherwise `True` when
        `value` has fewer than two colons or does not start with ``[``,
        and `False` in the remaining case.
    """
    if skip_ipv6_addr:
        return skip_ipv6_addr
    if value.count(":") < 2:
        return True
    return not value.startswith("[")
def _validate_auth_segment(value: str):
    """Validate the authentication segment (the text before ``@``) of a netloc.

    Args:
        value: Authentication segment; may be empty.

    Returns:
        A truthy value when the segment is acceptable and a falsy one
        otherwise. Depending on the branch taken this is `True`, a `bool`,
        an `re.Match` object or `None`.
    """
    if not value:
        # nothing to validate
        return True

    colons = value.count(":")
    if colons == 0:
        # username only
        return _username_regex().match(value)
    if colons > 1:
        # everything before @ is then considered as a username
        # this is a bad practice, but syntactically valid URL
        return _username_regex().match(unquote(value))

    # exactly one colon: "username:password"
    username, password = value.rsplit(":", 1)
    return _username_regex().match(username) and not any(
        forbidden in password for forbidden in "/?#@"
    )
def _validate_netloc(
    value: str,
    skip_ipv6_addr: bool,
    skip_ipv4_addr: bool,
    may_have_port: bool,
    simple_host: bool,
    consider_tld: bool,
    private: Optional[bool],
    rfc_1034: bool,
    rfc_2782: bool,
):
    """Validate the netloc (``[userinfo@]host[:port]``) part of a URL.

    Args:
        value: Netloc exactly as returned by `urlsplit`.
        skip_ipv6_addr: Forwarded to `hostname` after `_confirm_ipv6_skip`.
        skip_ipv4_addr: Forwarded to `hostname`.
        may_have_port: Forwarded to `hostname`.
        simple_host: Forwarded to `hostname` as `maybe_simple`.
        consider_tld: Forwarded to `hostname`.
        private: Forwarded to `hostname`.
        rfc_1034: Forwarded to `hostname`.
        rfc_2782: Forwarded to `hostname`.

    Returns:
        `False` when `value` is empty or holds more than one ``@``. Without
        an ``@`` the result of `hostname` is returned as is; otherwise the
        result of `hostname` and-ed with `_validate_auth_segment`.
    """
    if not value or value.count("@") > 1:
        return False

    # Without an "@" the separator is empty and the whole netloc is the host.
    basic_auth, at_sign, host = value.rpartition("@")

    skip_ipv6 = _confirm_ipv6_skip(host, skip_ipv6_addr)
    # Strip the brackets of a port-less IPv6 literal. The port marker "]:" is
    # looked up in the host only: looking it up in the whole netloc let a "]:"
    # inside the credentials suppress the stripping.
    if not skip_ipv6 and "]:" not in host:
        host = host.lstrip("[").replace("]", "", 1)

    host_result = hostname(
        host,
        skip_ipv6_addr=skip_ipv6,
        skip_ipv4_addr=skip_ipv4_addr,
        may_have_port=may_have_port,
        maybe_simple=simple_host,
        consider_tld=consider_tld,
        private=private,
        rfc_1034=rfc_1034,
        rfc_2782=rfc_2782,
    )
    if not at_sign:
        return host_result
    return host_result and _validate_auth_segment(basic_auth)
def _validate_optionals(path: str, query: str, fragment: str, strict_query: bool):
    """Validate the optional path, query and fragment components.

    Args:
        path: Path component; checked only when non-empty.
        query: Query string; parsed only when non-empty.
        fragment: Fragment component; checked only when non-empty.
        strict_query: Forwarded to `parse_qs` as `strict_parsing`.

    Returns:
        (bool): `True` when every non-empty one of `path` and `fragment`
            matches its pattern, `False` otherwise.

    Note:
        A successfully parsed query never changes the returned value. The
        query is parsed only so that an exception raised by `parse_qs`
        (other than the `TypeError` handled below) reaches the caller.
    """
    path_ok = not path or bool(_path_regex().match(path))

    if query:
        try:
            # ref: https://github.com/python/cpython/issues/117109
            for sep in ("&", ";"):
                if not parse_qs(query, strict_parsing=strict_query, separator=sep):
                    # an empty result stops further parsing
                    break
        except TypeError:
            # for Python < v3.9.2 (official v3.10)
            parse_qs(query, strict_parsing=strict_query)

    # See RFC3986 Section 3.5 Fragment for allowed characters
    # Adding "#", see https://github.com/python-validators/validators/issues/403
    fragment_ok = not fragment or bool(
        re.fullmatch(r"[0-9a-z?/:@\-._~%!$&'()*+,;=#]*", fragment, re.IGNORECASE)
    )
    return path_ok and fragment_ok
@validator
def url(
    value: str,
    /,
    *,
    skip_ipv6_addr: bool = False,
    skip_ipv4_addr: bool = False,
    may_have_port: bool = True,
    simple_host: bool = False,
    strict_query: bool = True,
    consider_tld: bool = False,
    private: Optional[bool] = None,  # only for ip-addresses
    rfc_1034: bool = False,
    rfc_2782: bool = False,
    validate_scheme: Callable[[str], bool] = _validate_scheme,
):
    r"""Check whether the given value is a valid URL.

    The value is split with `urllib.parse.urlsplit`; its scheme, netloc and
    optional path, query and fragment are then validated in that order.

    This validator was originally inspired from [URL validator of dperini][1].
    The following diagram is from [urlly][2]::

        foo://admin:hunter1@example.com:8042/over/there?name=ferret#nose
        \_/   \___/ \_____/ \_________/ \__/\_________/ \_________/ \__/
         |      |       |       |        |       |          |         |
      scheme username password hostname port   path      query    fragment

    [1]: https://gist.github.com/dperini/729294
    [2]: https://github.com/treeform/urlly

    Examples:
        >>> url('http://duck.com')
        True
        >>> url('ftp://foobar.dk')
        True
        >>> url('http://10.0.0.1')
        True
        >>> url('http://example.com/">user@example.com')
        ValidationError(func=url, args={'value': 'http://example.com/">user@example.com'})

    Args:
        value:
            URL string to validate.
        skip_ipv6_addr:
            When URL string cannot contain an IPv6 address.
        skip_ipv4_addr:
            When URL string cannot contain an IPv4 address.
        may_have_port:
            URL string may contain port number.
        simple_host:
            URL string maybe only hyphens and alpha-numerals.
        strict_query:
            Fail validation on query string parsing error.
        consider_tld:
            Restrict domain to TLDs allowed by IANA.
        private:
            Embedded IP address is public if `False`, private/local if `True`.
        rfc_1034:
            Allow trailing dot in domain/host name.
            Ref: [RFC 1034](https://www.rfc-editor.org/rfc/rfc1034).
        rfc_2782:
            Domain/Host name is of type service record.
            Ref: [RFC 2782](https://www.rfc-editor.org/rfc/rfc2782).
        validate_scheme:
            Function that validates URL scheme.

    Returns:
        (Literal[True]): If `value` is a valid url.
        (ValidationError): If `value` is an invalid url.
    """
    if not value:
        return False
    if re.search(r"\s", value):
        # url must not contain any white
        # spaces, they must be encoded
        return False

    try:
        parts = urlsplit(value)
    except ValueError:
        return False

    return (
        validate_scheme(parts.scheme)
        and _validate_netloc(
            parts.netloc,
            skip_ipv6_addr,
            skip_ipv4_addr,
            may_have_port,
            simple_host,
            consider_tld,
            private,
            rfc_1034,
            rfc_2782,
        )
        and _validate_optionals(parts.path, parts.query, parts.fragment, strict_query)
    )