Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scrapy/http/request/__init__.py: 37%

90 statements  

« prev     ^ index     » next       coverage.py v7.4.1, created at 2024-02-07 06:38 +0000

1""" 

2This module implements the Request class which is used to represent HTTP 

3requests in Scrapy. 

4 

5See documentation in docs/topics/request-response.rst 

6""" 

7import inspect 

8from typing import ( 

9 Any, 

10 AnyStr, 

11 Callable, 

12 Dict, 

13 Iterable, 

14 List, 

15 Mapping, 

16 NoReturn, 

17 Optional, 

18 Tuple, 

19 Type, 

20 TypeVar, 

21 Union, 

22 cast, 

23) 

24 

25from w3lib.url import safe_url_string 

26 

27import scrapy 

28from scrapy.http.headers import Headers 

29from scrapy.utils.curl import curl_to_request_kwargs 

30from scrapy.utils.python import to_bytes 

31from scrapy.utils.trackref import object_ref 

32from scrapy.utils.url import escape_ajax 

33 

# Generic request type variable: lets classmethods such as ``from_curl``
# declare that they return the subclass they were invoked on, not the
# base ``Request``.
RequestTypeVar = TypeVar("RequestTypeVar", bound="Request")

35 

36 

def NO_CALLBACK(*args: Any, **kwargs: Any) -> NoReturn:
    """Sentinel callback: assign it to the ``callback`` parameter of
    :class:`~scrapy.http.Request` to state that the request is not meant
    to have a spider callback at all.

    For example:

    .. code-block:: python

        Request("https://example.com", callback=NO_CALLBACK)

    This value should be used by :ref:`components <topics-components>` that
    create and handle their own requests, e.g. through
    :meth:`scrapy.core.engine.ExecutionEngine.download`, so that downloader
    middlewares handling such requests can treat them differently from requests
    intended for the :meth:`~scrapy.Spider.parse` callback.

    Calling it is always an error, hence the unconditional exception below.
    """
    message = (
        "The NO_CALLBACK callback has been called. This is a special callback "
        "value intended for requests whose callback is never meant to be "
        "called."
    )
    raise RuntimeError(message)

59 

60 

class Request(object_ref):
    """An HTTP request, typically created in a Spider and executed by the
    Downloader, which turns it into a :class:`Response`.
    """

    attributes: Tuple[str, ...] = (
        "url",
        "callback",
        "method",
        "headers",
        "body",
        "cookies",
        "meta",
        "encoding",
        "priority",
        "dont_filter",
        "errback",
        "flags",
        "cb_kwargs",
    )
    """A tuple of :class:`str` objects containing the name of all public
    attributes of the class that are also keyword parameters of the
    ``__init__`` method.

    Currently used by :meth:`Request.replace`, :meth:`Request.to_dict` and
    :func:`~scrapy.utils.request.request_from_dict`.
    """

    def __init__(
        self,
        url: str,
        callback: Optional[Callable] = None,
        method: str = "GET",
        headers: Union[Mapping[AnyStr, Any], Iterable[Tuple[AnyStr, Any]], None] = None,
        body: Optional[Union[bytes, str]] = None,
        cookies: Optional[Union[dict, List[dict]]] = None,
        meta: Optional[Dict[str, Any]] = None,
        encoding: str = "utf-8",
        priority: int = 0,
        dont_filter: bool = False,
        errback: Optional[Callable] = None,
        flags: Optional[List[str]] = None,
        cb_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        # Encoding must be set before anything else: URL and body
        # normalization below both read ``self.encoding``.
        self._encoding: str = encoding
        self.method: str = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        if not isinstance(priority, int):
            raise TypeError(f"Request priority not an integer: {priority!r}")
        self.priority: int = priority

        # Reject non-callable callbacks/errbacks up front (None is allowed).
        if callback is not None and not callable(callback):
            raise TypeError(
                f"callback must be a callable, got {type(callback).__name__}"
            )
        if errback is not None and not callable(errback):
            raise TypeError(f"errback must be a callable, got {type(errback).__name__}")
        self.callback: Optional[Callable] = callback
        self.errback: Optional[Callable] = errback

        self.cookies: Union[dict, List[dict]] = cookies or {}
        self.headers: Headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter: bool = dont_filter

        # ``meta`` and ``cb_kwargs`` are stored as None until needed; the
        # matching properties create the dicts lazily on first access.
        self._meta: Optional[Dict[str, Any]] = dict(meta) if meta else None
        self._cb_kwargs: Optional[Dict[str, Any]] = dict(cb_kwargs) if cb_kwargs else None
        self.flags: List[str] = list(flags) if flags is not None else []

    @property
    def cb_kwargs(self) -> Dict[str, Any]:
        # Lazily materialize the keyword-arguments dict for the callback.
        if self._cb_kwargs is None:
            self._cb_kwargs = {}
        return self._cb_kwargs

    @property
    def meta(self) -> Dict[str, Any]:
        # Lazily materialize the per-request metadata dict.
        if self._meta is None:
            self._meta = {}
        return self._meta

    @property
    def url(self) -> str:
        return self._url

    def _set_url(self, url: str) -> None:
        """Normalize and validate *url*, storing the result in ``self._url``."""
        if not isinstance(url, str):
            raise TypeError(f"Request url must be str, got {type(url).__name__}")

        safe = safe_url_string(url, self.encoding)
        self._url = escape_ajax(safe)

        # ``about:`` and ``data:`` URLs legitimately lack "://".
        if "://" not in self._url and not self._url.startswith(("about:", "data:")):
            raise ValueError(f"Missing scheme in request url: {self._url}")

    @property
    def body(self) -> bytes:
        return self._body

    def _set_body(self, body: Optional[Union[str, bytes]]) -> None:
        """Store *body* as bytes, encoding text with ``self.encoding``."""
        self._body = b"" if body is None else to_bytes(body, self.encoding)

    @property
    def encoding(self) -> str:
        return self._encoding

    def __repr__(self) -> str:
        return f"<{self.method} {self.url}>"

    def copy(self) -> "Request":
        """Return a new Request that is a copy of this one."""
        return self.replace()

    def replace(self, *args: Any, **kwargs: Any) -> "Request":
        """Create a new Request with the same attributes except for those given new values"""
        for attr in self.attributes:
            kwargs.setdefault(attr, getattr(self, attr))
        # ``cls`` allows replacing into a different Request subclass.
        cls = kwargs.pop("cls", self.__class__)
        return cast(Request, cls(*args, **kwargs))

    @classmethod
    def from_curl(
        cls: Type[RequestTypeVar],
        curl_command: str,
        ignore_unknown_options: bool = True,
        **kwargs: Any,
    ) -> RequestTypeVar:
        """Create a Request object from a string containing a `cURL
        <https://curl.haxx.se/>`_ command. It populates the HTTP method, the
        URL, the headers, the cookies and the body. It accepts the same
        arguments as the :class:`Request` class, taking preference and
        overriding the values of the same arguments contained in the cURL
        command.

        Unrecognized options are ignored by default. To raise an error when
        finding unknown options call this method by passing
        ``ignore_unknown_options=False``.

        .. caution:: Using :meth:`from_curl` from :class:`~scrapy.http.Request`
                     subclasses, such as :class:`~scrapy.http.JsonRequest`, or
                     :class:`~scrapy.http.XmlRpcRequest`, as well as having
                     :ref:`downloader middlewares <topics-downloader-middleware>`
                     and
                     :ref:`spider middlewares <topics-spider-middleware>`
                     enabled, such as
                     :class:`~scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware`,
                     :class:`~scrapy.downloadermiddlewares.useragent.UserAgentMiddleware`,
                     or
                     :class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`,
                     may modify the :class:`~scrapy.http.Request` object.

        To translate a cURL command into a Scrapy request,
        you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
        """
        parsed = curl_to_request_kwargs(curl_command, ignore_unknown_options)
        # Explicit keyword arguments win over values parsed from the command.
        parsed.update(kwargs)
        return cls(**parsed)

    def to_dict(self, *, spider: Optional["scrapy.Spider"] = None) -> Dict[str, Any]:
        """Return a dictionary containing the Request's data.

        Use :func:`~scrapy.utils.request.request_from_dict` to convert back into a :class:`~scrapy.Request` object.

        If a spider is given, this method will try to find out the name of the spider methods used as callback
        and errback and include them in the output dict, raising an exception if they cannot be found.
        """

        def serialize_callable(func: Optional[Callable]) -> Any:
            # Bound spider methods are replaced by their name so the dict
            # is picklable; non-callables (e.g. None) pass through untouched.
            return _find_method(spider, func) if callable(func) else func

        d: Dict[str, Any] = {
            "url": self.url,  # urls are safe (safe_string_url)
            "callback": serialize_callable(self.callback),
            "errback": serialize_callable(self.errback),
            "headers": dict(self.headers),
        }
        for attr in self.attributes:
            d.setdefault(attr, getattr(self, attr))
        if type(self) is not Request:  # pylint: disable=unidiomatic-typecheck
            # Record the subclass so deserialization can rebuild the same type.
            d["_class"] = f"{self.__module__}.{self.__class__.__name__}"
        return d

247 

248 

249def _find_method(obj: Any, func: Callable) -> str: 

250 """Helper function for Request.to_dict""" 

251 # Only instance methods contain ``__func__`` 

252 if obj and hasattr(func, "__func__"): 

253 members = inspect.getmembers(obj, predicate=inspect.ismethod) 

254 for name, obj_func in members: 

255 # We need to use __func__ to access the original function object because instance 

256 # method objects are generated each time attribute is retrieved from instance. 

257 # 

258 # Reference: The standard type hierarchy 

259 # https://docs.python.org/3/reference/datamodel.html 

260 if obj_func.__func__ is func.__func__: 

261 return name 

262 raise ValueError(f"Function {func} is not an instance method in: {obj}")