Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/validators/url.py: 96%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

55 statements  

1"""URL.""" 

2 

3# standard 

4from functools import lru_cache 

5import re 

6from typing import Callable, Optional 

7from urllib.parse import parse_qs, unquote, urlsplit 

8 

9# local 

10from .hostname import hostname 

11from .utils import validator 

12 

13 

@lru_cache
def _username_regex():
    """Return the compiled, case-insensitive pattern for a URL user name.

    The pattern is a single group of three alternatives, tried in order:

    1. one leading character in the range U+0100-U+024F (extended latin);
       this alternative has no end anchor, so only the first character of
       the candidate is inspected,
    2. a dot-atom: runs of atom characters separated by single dots,
    3. a non-quoted-string built from the listed control/printable ASCII
       ranges or a backslash followed by a TAB or a dot.

    The compiled object is memoised by ``lru_cache``, so compilation
    happens on the first call only.
    """
    atom = r"[-!#$%&'*+/=?^_`{}|~0-9a-z]+"
    alternatives = (
        # extended latin
        r"^[\u0100-\u017F\u0180-\u024F]",
        # dot-atom
        atom + r"(\." + atom + r")*$",
        # non-quoted-string
        r"^([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\011.])*$",
    )
    return re.compile("(" + "|".join(alternatives) + ")", re.IGNORECASE)

25 

26 

@lru_cache
def _path_regex():
    """Return the compiled, case-insensitive pattern for a URL path.

    A path matches when it is a non-empty run of characters drawn from one
    character class, assembled below from four groups. The compiled object
    is memoised by ``lru_cache``.
    """
    char_class = "".join(
        (
            # allowed symbols
            r"\/a-z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\%",
            # symbols / pictographs
            r"\U0001F300-\U0001F5FF",
            # emoticons / emoji
            r"\U0001F600-\U0001F64F",
            # multilingual unicode ranges
            r"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF",
        )
    )
    return re.compile("^[" + char_class + "]+$", re.IGNORECASE)

40 

41 

def _validate_scheme(value: str):
    """Tell whether ``value`` is one of the URL schemes accepted by default.

    The comparison is an exact, case-sensitive set membership test; an
    empty (falsy) ``value`` is rejected up front.
    """
    if not value:
        return False
    # More schemes will be considered later.
    # fmt: off
    known_schemes = {
        "ftp", "ftps", "git", "http", "https", "irc",
        "rtmp", "rtmps", "rtsp", "sftp", "ssh", "telnet",
    }
    # fmt: on
    return value in known_schemes

66 

67 

def _confirm_ipv6_skip(value: str, skip_ipv6_addr: bool):
    """Decide whether the IPv6 check should be skipped for ``value``.

    The check is skipped when the caller asked for it, when ``value`` has
    fewer than two colons, or when ``value`` does not start with ``[``.
    """
    if skip_ipv6_addr:
        # hand back the caller's own flag, exactly as an ``or`` chain would
        return skip_ipv6_addr
    if value.count(":") < 2:
        return True
    return not value.startswith("[")

71 

72 

def _validate_auth_segment(value: str):
    """Validate the user-info segment (the text before ``@`` in a netloc).

    An empty segment is accepted. Otherwise the outcome depends on how many
    colons the segment holds:

    * none: the whole segment must match the user-name pattern;
    * more than one: the percent-decoded segment is matched as a whole;
    * exactly one: the part before the colon must match the user-name
      pattern and the part after it must not contain ``/``, ``?``, ``#``
      or ``@``.

    The result is truthy/falsy rather than strictly boolean: it may be a
    match object, ``None``, ``True`` or ``False``.
    """
    if not value:
        return True

    matches_username = _username_regex().match
    separators = value.count(":")

    if separators == 0:
        return matches_username(value)
    if separators > 1:
        # everything before @ is then considered as a username
        # this is a bad practice, but syntactically valid URL
        return matches_username(unquote(value))

    username, _, password = value.rpartition(":")
    return matches_username(username) and not any(
        forbidden in password for forbidden in "/?#@"
    )

87 

88 

def _validate_netloc(
    value: str,
    skip_ipv6_addr: bool,
    skip_ipv4_addr: bool,
    may_have_port: bool,
    simple_host: bool,
    consider_tld: bool,
    private: Optional[bool],
    rfc_1034: bool,
    rfc_2782: bool,
):
    """Validate netloc.

    The netloc is split at its (single) ``@`` into an optional user-info
    segment and a host segment. The host segment is handed to ``hostname``
    and, when an ``@`` is present, the user-info segment is handed to
    ``_validate_auth_segment``.

    Args:
        value: Network-location part of the URL.
        skip_ipv6_addr: Caller's request to skip the IPv6 check; combined
            with the shape of the host by ``_confirm_ipv6_skip``.
        skip_ipv4_addr: Forwarded to ``hostname``.
        may_have_port: Forwarded to ``hostname``.
        simple_host: Forwarded to ``hostname`` as ``maybe_simple``.
        consider_tld: Forwarded to ``hostname``.
        private: Forwarded to ``hostname``.
        rfc_1034: Forwarded to ``hostname``.
        rfc_2782: Forwarded to ``hostname``.

    Returns:
        ``False`` for an empty netloc or one with more than one ``@``;
        otherwise the (truthy or falsy) result of the host check, combined
        with the user-info check when user-info is present.
    """
    if not value or value.count("@") > 1:
        return False

    # With no "@" this yields ("", "", value); ``has_auth`` keeps "no
    # user-info at all" apart from "empty user-info before the @".
    basic_auth, has_auth, host = value.rpartition("@")

    skip_ipv6 = _confirm_ipv6_skip(host, skip_ipv6_addr)
    # A bracketed literal followed by a port ("[...]:80") is passed through
    # with its brackets. The "]:" probe looks at the host only: the same
    # two characters inside the credentials say nothing about the host.
    if not skip_ipv6 and "]:" not in host:
        # ``skip_ipv6`` being falsy implies ``host`` starts with "[", so
        # drop exactly that one bracket (a doubled "[[" must not be
        # silently normalised) and the first closing bracket.
        host = host[1:].replace("]", "", 1)

    host_ok = hostname(
        host,
        skip_ipv6_addr=skip_ipv6,
        skip_ipv4_addr=skip_ipv4_addr,
        may_have_port=may_have_port,
        maybe_simple=simple_host,
        consider_tld=consider_tld,
        private=private,
        rfc_1034=rfc_1034,
        rfc_2782=rfc_2782,
    )
    if not has_auth:
        return host_ok
    return host_ok and _validate_auth_segment(basic_auth)

135 

136 

def _validate_optionals(path: str, query: str, fragment: str, strict_query: bool):
    """Validate path query and fragments.

    Each segment is optional; an empty one is not checked.

    * ``path`` must match the path pattern.
    * ``query`` is run through ``parse_qs``. A successful parse never
      changes the result; the query can only fail validation through an
      exception raised by ``parse_qs``, which is left to propagate (only
      ``TypeError`` is intercepted, to retry without ``separator``).
    * ``fragment`` must consist solely of the characters listed in the
      fragment pattern.

    Returns:
        ``True`` when the path and fragment checks both pass, else ``False``.
    """
    path_ok = bool(_path_regex().match(path)) if path else True

    if query:
        try:
            # ref: https://github.com/python/cpython/issues/117109
            # The ";" pass only runs when the "&" pass produced something.
            if parse_qs(query, strict_parsing=strict_query, separator="&"):
                parse_qs(query, strict_parsing=strict_query, separator=";")
        except TypeError:
            # for Python < v3.9.2 (official v3.10)
            parse_qs(query, strict_parsing=strict_query)

    fragment_ok = True
    if fragment:
        # See RFC3986 Section 3.5 Fragment for allowed characters
        # Adding "#", see https://github.com/python-validators/validators/issues/403
        fragment_ok = bool(
            re.fullmatch(r"[0-9a-z?/:@\-._~%!$&'()*+,;=#]*", fragment, re.IGNORECASE)
        )

    return path_ok and fragment_ok

161 

162 

@validator
def url(
    value: str,
    /,
    *,
    skip_ipv6_addr: bool = False,
    skip_ipv4_addr: bool = False,
    may_have_port: bool = True,
    simple_host: bool = False,
    strict_query: bool = True,
    consider_tld: bool = False,
    private: Optional[bool] = None,  # only for ip-addresses
    rfc_1034: bool = False,
    rfc_2782: bool = False,
    validate_scheme: Callable[[str], bool] = _validate_scheme,
):
    r"""Return whether or not given value is a valid URL.

    The value is rejected outright when it is empty or contains any
    whitespace. Otherwise it is split with `urllib.parse.urlsplit` and the
    scheme, the netloc, and finally the path/query/fragment are checked in
    that order, stopping at the first part that fails.

    This validator was originally inspired from [URL validator of dperini][1].
    The following diagram is from [urlly][2]::


        foo://admin:hunter1@example.com:8042/over/there?name=ferret#nose
        \_/   \___/ \_____/ \_________/ \__/\_________/ \_________/ \__/
         |      |       |       |        |       |          |         |
      scheme username password hostname port   path      query    fragment

    [1]: https://gist.github.com/dperini/729294
    [2]: https://github.com/treeform/urlly

    Examples:
        >>> url('http://duck.com')
        True
        >>> url('ftp://foobar.dk')
        True
        >>> url('http://10.0.0.1')
        True
        >>> url('http://example.com/">user@example.com')
        ValidationError(func=url, args={'value': 'http://example.com/">user@example.com'})

    Args:
        value:
            URL string to validate.
        skip_ipv6_addr:
            When URL string cannot contain an IPv6 address.
        skip_ipv4_addr:
            When URL string cannot contain an IPv4 address.
        may_have_port:
            URL string may contain port number.
        simple_host:
            URL string maybe only hyphens and alpha-numerals.
        strict_query:
            Fail validation on query string parsing error.
        consider_tld:
            Restrict domain to TLDs allowed by IANA.
        private:
            Embedded IP address is public if `False`, private/local if `True`.
        rfc_1034:
            Allow trailing dot in domain/host name.
            Ref: [RFC 1034](https://www.rfc-editor.org/rfc/rfc1034).
        rfc_2782:
            Domain/Host name is of type service record.
            Ref: [RFC 2782](https://www.rfc-editor.org/rfc/rfc2782).
        validate_scheme:
            Function that validates URL scheme.

    Returns:
        (Literal[True]): If `value` is a valid url.
        (ValidationError): If `value` is an invalid url.
    """
    if not value:
        return False
    if re.search(r"\s", value):
        # url must not contain any white
        # spaces, they must be encoded
        return False

    try:
        parts = urlsplit(value)
    except ValueError:
        return False

    # Each stage returns its own falsy result untouched, mirroring what a
    # chained ``and`` expression would yield.
    scheme_ok = validate_scheme(parts.scheme)
    if not scheme_ok:
        return scheme_ok

    netloc_ok = _validate_netloc(
        parts.netloc,
        skip_ipv6_addr,
        skip_ipv4_addr,
        may_have_port,
        simple_host,
        consider_tld,
        private,
        rfc_1034,
        rfc_2782,
    )
    if not netloc_ok:
        return netloc_ok

    return _validate_optionals(parts.path, parts.query, parts.fragment, strict_query)