Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scrapy/utils/url.py: 35%

55 statements  

coverage.py v7.4.1, created at 2024-02-07 06:38 +0000

1""" 

2This module contains general purpose URL functions not found in the standard 

3library. 

4 

5Some of the functions that used to be imported from this module have been moved 

6to the w3lib.url module. Always import those from there instead. 

7""" 

8import re 

9from typing import TYPE_CHECKING, Iterable, Optional, Type, Union, cast 

10from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse 

11 

12# scrapy.utils.url was moved to w3lib.url and import * ensures this 

13# move doesn't break old code 

14from w3lib.url import * 

15from w3lib.url import _safe_chars, _unquotepath # noqa: F401 

16 

17from scrapy.utils.python import to_unicode 

18 

19if TYPE_CHECKING: 

20 from scrapy import Spider 

21 

22 

23UrlT = Union[str, bytes, ParseResult] 


def url_is_from_any_domain(url: UrlT, domains: Iterable[str]) -> bool:
    """Return True if the url belongs to any of the given domains"""
    host = parse_url(url).netloc.lower()
    if not host:
        return False
    domains = [d.lower() for d in domains]
    return any((host == d) or (host.endswith(f".{d}")) for d in domains)
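# Illustrative usage (not part of the original module; example URLs are made up):
#     >>> url_is_from_any_domain("http://www.example.com/some/page", ["example.com"])
#     True
#     >>> url_is_from_any_domain("http://www.example.org/some/page", ["example.com"])
#     False
# Matching is done against the netloc, so subdomains of a listed domain also match.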


def url_is_from_spider(url: UrlT, spider: Type["Spider"]) -> bool:
    """Return True if the url belongs to the given spider"""
    return url_is_from_any_domain(
        url, [spider.name] + list(getattr(spider, "allowed_domains", []))
    )
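# Illustrative usage (not part of the original module); "MySpider" is a
# hypothetical spider class:
#
#     class MySpider(Spider):
#         name = "myspider"
#         allowed_domains = ["example.com"]
#
#     url_is_from_spider("http://sub.example.com/page", MySpider)  # -> True
#
# The spider's name is prepended to the allowed domains, which matters for
# spiders named after the site they crawl (e.g. name = "example.com").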


def url_has_any_extension(url: UrlT, extensions: Iterable[str]) -> bool:
    """Return True if the url ends with one of the extensions provided"""
    lowercase_path = parse_url(url).path.lower()
    return any(lowercase_path.endswith(ext) for ext in extensions)
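# Illustrative usage (not part of the original module; example URL is made up):
#     >>> url_has_any_extension("http://example.com/reports/summary.PDF?dl=1", [".pdf", ".doc"])
#     True
# Only the path component is checked, so query strings and fragments are ignored,
# and the comparison is case-insensitive on the path side.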


def parse_url(url: UrlT, encoding: Optional[str] = None) -> ParseResult:
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return cast(ParseResult, urlparse(to_unicode(url, encoding)))
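# Illustrative usage (not part of the original module):
#     >>> parse_url("http://example.com/path?q=1").netloc
#     'example.com'
# A ParseResult passed in is returned unchanged, so callers can accept either
# raw strings/bytes or already-parsed URLs.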


def escape_ajax(url: str) -> str:
    """
    Return the crawlable url according to:
    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) are returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    if not frag.startswith("!"):
        return url
    return add_or_replace_parameter(defrag, "_escaped_fragment_", frag[1:])


def add_http_if_no_scheme(url: str) -> str:
    """Add http as the default scheme if it is missing from the url."""
    match = re.match(r"^\w+://", url, flags=re.I)
    if not match:
        parts = urlparse(url)
        scheme = "http:" if parts.netloc else "http://"
        url = scheme + url

    return url
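# Illustrative usage (not part of the original module; example URLs are made up):
#     >>> add_http_if_no_scheme("example.com/page")
#     'http://example.com/page'
#     >>> add_http_if_no_scheme("//example.com/page")
#     'http://example.com/page'
#     >>> add_http_if_no_scheme("https://example.com/page")
#     'https://example.com/page'
# Scheme-relative URLs ("//host/...") only need "http:" prepended, which is why
# the netloc check selects between "http:" and "http://".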


def _is_posix_path(string: str) -> bool:
    return bool(
        re.match(
            r"""
            ^                     # start with...
            (
                \.                # ...a single dot,
                (
                    \. | [^/\.]+  # optionally followed by
                )?                # either a second dot or some characters
                |
                ~                 # $HOME
            )?                    # optional match of ".", ".." or ".blabla"
            /                     # at least one "/" for a file path,
            .                     # and something after the "/"
            """,
            string,
            flags=re.VERBOSE,
        )
    )


def _is_windows_path(string: str) -> bool:
    return bool(
        re.match(
            r"""
            ^
            (
                [a-z]:\\
                | \\\\
            )
            """,
            string,
            flags=re.IGNORECASE | re.VERBOSE,
        )
    )


def _is_filesystem_path(string: str) -> bool:
    return _is_posix_path(string) or _is_windows_path(string)
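# Illustrative behaviour of the path helpers (not part of the original module):
#     >>> _is_filesystem_path("/tmp/index.html")
#     True
#     >>> _is_filesystem_path("./pages/index.html")
#     True
#     >>> _is_filesystem_path("C:\\pages\\index.html")
#     True
#     >>> _is_filesystem_path("www.example.com/index")
#     False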


def guess_scheme(url: str) -> str:
    """Add a URL scheme if missing: file:// for filepath-like input or
    http:// otherwise."""
    if _is_filesystem_path(url):
        return any_to_uri(url)
    return add_http_if_no_scheme(url)
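# Illustrative usage (not part of the original module; the file:// result assumes
# an absolute POSIX path, for which w3lib's any_to_uri yields a file:// URI):
#     >>> guess_scheme("www.example.com/index.html")
#     'http://www.example.com/index.html'
#     >>> guess_scheme("/tmp/index.html")
#     'file:///tmp/index.html'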


def strip_url(
    url: str,
    strip_credentials: bool = True,
    strip_default_port: bool = True,
    origin_only: bool = False,
    strip_fragment: bool = True,
) -> str:
    """Strip a URL string of some of its components:

    - ``strip_credentials`` removes "user:password@"
    - ``strip_default_port`` removes ":80" (resp. ":443", ":21")
      from http:// (resp. https://, ftp://) URLs
    - ``origin_only`` replaces the path component with "/", also dropping
      the query and fragment components; it also strips credentials
    - ``strip_fragment`` drops any #fragment component
    """

    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    if (strip_credentials or origin_only) and (
        parsed_url.username or parsed_url.password
    ):
        netloc = netloc.split("@")[-1]
    if strip_default_port and parsed_url.port:
        if (parsed_url.scheme, parsed_url.port) in (
            ("http", 80),
            ("https", 443),
            ("ftp", 21),
        ):
            netloc = netloc.replace(f":{parsed_url.port}", "")
    return urlunparse(
        (
            parsed_url.scheme,
            netloc,
            "/" if origin_only else parsed_url.path,
            "" if origin_only else parsed_url.params,
            "" if origin_only else parsed_url.query,
            "" if strip_fragment else parsed_url.fragment,
        )
    )
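# Illustrative usage (not part of the original module; example URL is made up):
#     >>> strip_url("http://user:pass@www.example.com:80/index.html?q=1#top")
#     'http://www.example.com/index.html?q=1'
#     >>> strip_url("http://user:pass@www.example.com:80/index.html?q=1#top", origin_only=True)
#     'http://www.example.com/'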