Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/validators/url.py: 100%

52 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:08 +0000

1"""URL.""" 

2# -*- coding: utf-8 -*- 

3 

4# standard 

5from urllib.parse import urlsplit, unquote 

6from functools import lru_cache 

7import re 

8 

9# local 

10from .hostname import hostname 

11from .utils import validator 

12 

13 

14@lru_cache 

15def _username_regex(): 

16 return re.compile( 

17 # dot-atom 

18 r"(^[-!#$%&'*+/=?^_`{}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*$" 

19 # non-quoted-string 

20 + r"|^([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\001-\011\013\014\016-\177])*$)", 

21 re.IGNORECASE, 

22 ) 

23 

24 

25@lru_cache 

26def _path_regex(): 

27 return re.compile( 

28 # allowed symbols 

29 r"^[\/a-zA-Z0-9\-\.\_\~\!\$\&\'\(\)\*\+\,\;\=\:\@\%" 

30 # emoticons / emoji 

31 + r"\U0001F600-\U0001F64F" 

32 # multilingual unicode ranges 

33 + r"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF]+$", 

34 re.IGNORECASE, 

35 ) 

36 

37 

38@lru_cache 

39def _query_regex(): 

40 return re.compile(r"&?(\w+=?[^\s&]*)", re.IGNORECASE) 

41 

42 

43def _validate_scheme(value: str): 

44 """Validate scheme.""" 

45 # More schemes will be considered later. 

46 return ( 

47 value in {"ftp", "ftps", "git", "http", "https", "rtsp", "sftp", "ssh", "telnet"} 

48 if value 

49 else False 

50 ) 

51 

52 

53def _confirm_ipv6_skip(value: str, skip_ipv6_addr: bool): 

54 """Confirm skip IPv6 check.""" 

55 return skip_ipv6_addr or value.count(":") < 2 or not value.startswith("[") 

56 

57 

58def _validate_auth_segment(value: str): 

59 """Validate authentication segment.""" 

60 if not value: 

61 return True 

62 if (colon_count := value.count(":")) > 1: 

63 # everything before @ is then considered as a username 

64 # this is a bad practice, but syntactically valid URL 

65 return _username_regex().match(unquote(value)) 

66 if colon_count < 1: 

67 return _username_regex().match(value) 

68 username, password = value.rsplit(":", 1) 

69 return _username_regex().match(username) and all( 

70 char_to_avoid not in password for char_to_avoid in ("/", "?", "#", "@") 

71 ) 

72 

73 

74def _validate_netloc( 

75 value: str, 

76 skip_ipv6_addr: bool, 

77 skip_ipv4_addr: bool, 

78 may_have_port: bool, 

79 simple_host: bool, 

80 rfc_1034: bool, 

81 rfc_2782: bool, 

82): 

83 """Validate netloc.""" 

84 if not value or value.count("@") > 1: 

85 return False 

86 if value.count("@") < 1: 

87 return hostname( 

88 value 

89 if _confirm_ipv6_skip(value, skip_ipv6_addr) or "]:" in value 

90 else value.lstrip("[").replace("]", "", 1), 

91 skip_ipv6_addr=_confirm_ipv6_skip(value, skip_ipv6_addr), 

92 skip_ipv4_addr=skip_ipv4_addr, 

93 may_have_port=may_have_port, 

94 maybe_simple=simple_host, 

95 rfc_1034=rfc_1034, 

96 rfc_2782=rfc_2782, 

97 ) 

98 basic_auth, host = value.rsplit("@", 1) 

99 return hostname( 

100 host 

101 if _confirm_ipv6_skip(host, skip_ipv6_addr) or "]:" in value 

102 else host.lstrip("[").replace("]", "", 1), 

103 skip_ipv6_addr=_confirm_ipv6_skip(host, skip_ipv6_addr), 

104 skip_ipv4_addr=skip_ipv4_addr, 

105 may_have_port=may_have_port, 

106 maybe_simple=simple_host, 

107 rfc_1034=rfc_1034, 

108 rfc_2782=rfc_2782, 

109 ) and _validate_auth_segment(basic_auth) 

110 

111 

112def _validate_optionals(path: str, query: str, fragment: str): 

113 """Validate path query and fragments.""" 

114 optional_segments = True 

115 if path: 

116 optional_segments &= bool(_path_regex().match(path)) 

117 if query: 

118 optional_segments &= bool(_query_regex().match(query)) 

119 if fragment: 

120 optional_segments &= all(char_to_avoid not in fragment for char_to_avoid in ("/", "?")) 

121 return optional_segments 

122 

123 

@validator
def url(
    value: str,
    /,
    *,
    skip_ipv6_addr: bool = False,
    skip_ipv4_addr: bool = False,
    may_have_port: bool = True,
    simple_host: bool = False,
    rfc_1034: bool = False,
    rfc_2782: bool = False,
):
    r"""Return whether or not given value is a valid URL.

    This validator was inspired from [URL validator of dperini][1].
    The following diagram is from [urlly][2].

        foo://admin:hunter1@example.com:8042/over/there?name=ferret#nose
        \_/   \___/ \_____/ \_________/ \__/\_________/ \_________/ \__/
         |      |       |       |        |       |          |         |
      scheme username password hostname port   path      query    fragment

    [1]: https://gist.github.com/dperini/729294
    [2]: https://github.com/treeform/urlly

    The value is rejected outright when it is empty, contains whitespace, or
    cannot be split by `urllib.parse.urlsplit`. Otherwise the scheme, the
    netloc and the optional path/query/fragment must each validate.

    Examples:
        >>> url('http://duck.com')
        # Output: True
        >>> url('ftp://foobar.dk')
        # Output: True
        >>> url('http://10.0.0.1')
        # Output: True
        >>> url('http://example.com/">user@example.com')
        # Output: ValidationFailure(func=url, ...)

    Args:
        value:
            URL string to validate.
        skip_ipv6_addr:
            When URL string cannot contain an IPv6 address.
        skip_ipv4_addr:
            When URL string cannot contain an IPv4 address.
        may_have_port:
            URL string may contain port number.
        simple_host:
            Host name may consist of only hyphens and alpha-numerals.
        rfc_1034:
            Allow trailing dot in domain/host name.
            Ref: [RFC 1034](https://www.rfc-editor.org/rfc/rfc1034).
        rfc_2782:
            Domain/Host name is of type service record.
            Ref: [RFC 2782](https://www.rfc-editor.org/rfc/rfc2782).

    Returns:
        (Literal[True]):
            If `value` is a valid URL.
        (ValidationFailure):
            If `value` is an invalid URL.

    Note:
        - *In version 0.11.3*:
            - Added support for URLs containing localhost.
        - *In version 0.11.0*:
            - Made the regular expression case insensitive.
        - *In version 0.10.3*:
            - Added a `public` parameter.
        - *In version 0.10.2*:
            - Added support for various exotic URLs.
            - Fixed various false positives.

    > *New in version 0.2.0*.
    """
    if not value or re.search(r"\s", value):
        # a URL must not contain any whitespace;
        # such characters have to be percent-encoded
        return False

    try:
        scheme, netloc, path, query, fragment = urlsplit(value)
    except ValueError:
        # urlsplit could not split the value
        return False

    return (
        _validate_scheme(scheme)
        and _validate_netloc(
            netloc,
            skip_ipv6_addr,
            skip_ipv4_addr,
            may_have_port,
            simple_host,
            rfc_1034,
            rfc_2782,
        )
        and _validate_optionals(path, query, fragment)
    )

218 )