Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/validators/url.py: 98%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

54 statements  

1"""URL.""" 

2 

3# standard 

4from functools import lru_cache 

5import re 

6from typing import Optional 

7from urllib.parse import parse_qs, unquote, urlsplit 

8 

9# local 

10from .hostname import hostname 

11from .utils import validator 

12 

13 

@lru_cache
def _username_regex():
    """Return the compiled, case-insensitive pattern used for URL usernames.

    The pattern is an alternation of three forms (extended latin, dot-atom
    and non-quoted-string). The function takes no arguments and is wrapped
    in `lru_cache`, so the pattern is compiled once and then reused.
    """
    alternatives = (
        # extended latin
        r"(^[\u0100-\u017F\u0180-\u024F]",
        # dot-atom
        r"|[-!#$%&'*+/=?^_`{}|~0-9a-z]+(\.[-!#$%&'*+/=?^_`{}|~0-9a-z]+)*$",
        # non-quoted-string
        r"|^([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\011.])*$)",
    )
    return re.compile("".join(alternatives), flags=re.IGNORECASE)

25 

26 

@lru_cache
def _path_regex():
    """Return the compiled, case-insensitive pattern a URL path has to match.

    The pattern is a single anchored character class repeated one or more
    times. Wrapped in `lru_cache`, so it is compiled once and then reused.
    """
    pattern = (
        # allowed symbols
        r"^[\/a-z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\%"
        # symbols / pictographs
        r"\U0001F300-\U0001F5FF"
        # emoticons / emoji
        r"\U0001F600-\U0001F64F"
        # multilingual unicode ranges
        r"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF]+$"
    )
    return re.compile(pattern, flags=re.IGNORECASE)

40 

41 

def _validate_scheme(value: str):
    """Tell whether `value` is one of the supported URL schemes.

    An empty (falsy) scheme is rejected without consulting the scheme set.
    The comparison is exact, so the caller is expected to pass the scheme
    in lower case.
    """
    if not value:
        return False
    # More schemes will be considered later.
    # fmt: off
    supported_schemes = {
        "ftp", "ftps", "git", "http", "https",
        "irc", "rtmp", "rtmps", "rtsp", "sftp",
        "ssh", "telnet",
    }
    # fmt: on
    return value in supported_schemes

57 

58 

def _confirm_ipv6_skip(value: str, skip_ipv6_addr: bool):
    """Decide whether the IPv6 check should be skipped for `value`.

    The check is skipped when the caller asked for it (`skip_ipv6_addr`),
    or when `value` does not both start with "[" and contain at least
    two colons.
    """
    if skip_ipv6_addr:
        return skip_ipv6_addr
    bracketed_with_colons = value.startswith("[") and value.count(":") >= 2
    return not bracketed_with_colons

62 

63 

def _validate_auth_segment(value: str):
    """Validate the authentication segment (the text before "@" in a netloc).

    Returns a truthy value when the segment is acceptable and a falsy one
    otherwise. An empty segment is accepted. The result may be `True`,
    a regex match object, `None` or `False`, so callers should rely on
    truthiness only.
    """
    if not value:
        return True

    username_pattern = _username_regex()
    colons = value.count(":")

    if colons == 0:
        # no password part: the whole segment is the username
        return username_pattern.match(value)

    if colons > 1:
        # everything before @ is then considered as a username
        # this is a bad practice, but syntactically valid URL
        return username_pattern.match(unquote(value))

    # exactly one colon: "username:password"
    username, _, password = value.rpartition(":")
    return username_pattern.match(username) and not any(
        forbidden in password for forbidden in ("/", "?", "#", "@")
    )

78 

79 

def _validate_netloc(
    value: str,
    skip_ipv6_addr: bool,
    skip_ipv4_addr: bool,
    may_have_port: bool,
    simple_host: bool,
    consider_tld: bool,
    private: Optional[bool],
    rfc_1034: bool,
    rfc_2782: bool,
):
    """Validate netloc.

    The netloc is split at its (single) "@" into an optional authentication
    segment and a host. The host is checked with `hostname` and the
    authentication segment, when an "@" is present, with
    `_validate_auth_segment`.

    Args:
        value:
            The netloc part of a URL, e.g. `user:pass@example.com:8042`.
        skip_ipv6_addr:
            When the host cannot be an IPv6 address.
        skip_ipv4_addr:
            When the host cannot be an IPv4 address.
        may_have_port:
            Host may contain a port number.
        simple_host:
            Forwarded to `hostname` as `maybe_simple`.
        consider_tld:
            Forwarded to `hostname` unchanged.
        private:
            Forwarded to `hostname` unchanged.
        rfc_1034:
            Forwarded to `hostname` unchanged.
        rfc_2782:
            Forwarded to `hostname` unchanged.

    Returns:
        `False` when `value` is empty or contains more than one "@".
        Otherwise the result of `hostname`, and-ed with the result of
        `_validate_auth_segment` when an "@" is present. Callers should
        rely on truthiness only.
    """
    if not value or value.count("@") > 1:
        return False

    # At most one "@" remains: text before it is the auth segment, text
    # after it is the host. Without "@" the whole netloc is the host.
    has_auth = "@" in value
    basic_auth, _, host = value.rpartition("@")

    skip_ipv6 = _confirm_ipv6_skip(host, skip_ipv6_addr)

    # A bracketed IPv6 literal without a port ("[::1]") is handed to
    # `hostname` without its brackets; "[::1]:8080" keeps them. The "]:"
    # port marker is looked up in `host` only, so that a "]:" occurring in
    # the auth segment cannot influence how the host is prepared.
    if not skip_ipv6 and "]:" not in host:
        host = host.lstrip("[").replace("]", "", 1)

    host_result = hostname(
        host,
        skip_ipv6_addr=skip_ipv6,
        skip_ipv4_addr=skip_ipv4_addr,
        may_have_port=may_have_port,
        maybe_simple=simple_host,
        consider_tld=consider_tld,
        private=private,
        rfc_1034=rfc_1034,
        rfc_2782=rfc_2782,
    )
    if not has_auth:
        return host_result
    return host_result and _validate_auth_segment(basic_auth)

126 

127 

def _validate_optionals(path: str, query: str, fragment: str, strict_query: bool):
    """Validate path query and fragments.

    Empty segments are accepted. A non-empty path must match
    `_path_regex()`, and a non-empty fragment must consist only of the
    characters listed in the fragment pattern below. A non-empty query is
    run through `parse_qs`; the parsed result itself does not change the
    return value, so the query can only fail validation through an
    exception raised by `parse_qs` (which is not caught here, except for
    the `TypeError` fallback).

    Returns:
        `True` when path and fragment are acceptable, `False` otherwise.
    """
    path_ok = not path or bool(_path_regex().match(path))

    if query:
        try:
            # ref: https://github.com/python/cpython/issues/117109
            # the ";" pass only runs when the "&" pass produced fields
            if parse_qs(query, strict_parsing=strict_query, separator="&"):
                parse_qs(query, strict_parsing=strict_query, separator=";")
        except TypeError:
            # for Python < v3.9.2 (official v3.10)
            parse_qs(query, strict_parsing=strict_query)

    # See RFC3986 Section 3.5 Fragment for allowed characters
    # Adding "#", see https://github.com/python-validators/validators/issues/403
    fragment_ok = not fragment or bool(
        re.fullmatch(r"[0-9a-z?/:@\-._~%!$&'()*+,;=#]*", fragment, re.IGNORECASE)
    )

    return path_ok and fragment_ok

152 

153 

@validator
def url(
    value: str,
    /,
    *,
    skip_ipv6_addr: bool = False,
    skip_ipv4_addr: bool = False,
    may_have_port: bool = True,
    simple_host: bool = False,
    strict_query: bool = True,
    consider_tld: bool = False,
    private: Optional[bool] = None,  # only for ip-addresses
    rfc_1034: bool = False,
    rfc_2782: bool = False,
):
    r"""Return whether or not given value is a valid URL.

    The value is split with `urllib.parse.urlsplit`; the scheme, the netloc
    and the optional path, query and fragment are then validated in that
    order, stopping at the first part that fails.

    This validator was originally inspired from [URL validator of dperini][1].
    The following diagram is from [urlly][2]::


        foo://admin:hunter1@example.com:8042/over/there?name=ferret#nose
        \_/   \___/ \_____/ \_________/ \__/\_________/ \_________/ \__/
         |      |       |       |        |       |          |         |
      scheme username password hostname port   path      query    fragment

    [1]: https://gist.github.com/dperini/729294
    [2]: https://github.com/treeform/urlly

    Examples:
        >>> url('http://duck.com')
        # Output: True
        >>> url('ftp://foobar.dk')
        # Output: True
        >>> url('http://10.0.0.1')
        # Output: True
        >>> url('http://example.com/">user@example.com')
        # Output: ValidationError(func=url, ...)

    Args:
        value:
            URL string to validate.
        skip_ipv6_addr:
            When URL string cannot contain an IPv6 address.
        skip_ipv4_addr:
            When URL string cannot contain an IPv4 address.
        may_have_port:
            URL string may contain port number.
        simple_host:
            URL string maybe only hyphens and alpha-numerals.
        strict_query:
            Fail validation on query string parsing error.
        consider_tld:
            Restrict domain to TLDs allowed by IANA.
        private:
            Embedded IP address is public if `False`, private/local if `True`.
        rfc_1034:
            Allow trailing dot in domain/host name.
            Ref: [RFC 1034](https://www.rfc-editor.org/rfc/rfc1034).
        rfc_2782:
            Domain/Host name is of type service record.
            Ref: [RFC 2782](https://www.rfc-editor.org/rfc/rfc2782).

    Returns:
        (Literal[True]): If `value` is a valid url.
        (ValidationError): If `value` is an invalid url.
    """
    if not value:
        return False
    if re.search(r"\s", value):
        # url must not contain any white
        # spaces, they must be encoded
        return False

    try:
        parts = urlsplit(value)
    except ValueError:
        # urlsplit itself rejected the value
        return False

    return (
        _validate_scheme(parts.scheme)
        and _validate_netloc(
            parts.netloc,
            skip_ipv6_addr=skip_ipv6_addr,
            skip_ipv4_addr=skip_ipv4_addr,
            may_have_port=may_have_port,
            simple_host=simple_host,
            consider_tld=consider_tld,
            private=private,
            rfc_1034=rfc_1034,
            rfc_2782=rfc_2782,
        )
        and _validate_optionals(parts.path, parts.query, parts.fragment, strict_query)
    )