Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scrapy/http/request/form.py: 3%

131 statements  

« prev     ^ index     » next       coverage.py v7.4.1, created at 2024-02-07 06:38 +0000

1""" 

2This module implements the FormRequest class which is a more convenient class 

3(than Request) to generate Requests based on form data. 

4 

5See documentation in docs/topics/request-response.rst 

6""" 

7 

8from __future__ import annotations 

9 

10from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union, cast 

11from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit 

12 

13from lxml.html import ( 

14 FormElement, 

15 HTMLParser, 

16 InputElement, 

17 MultipleSelectOptions, 

18 SelectElement, 

19 TextareaElement, 

20) 

21from parsel.selector import create_root_node 

22from w3lib.html import strip_html5_whitespace 

23 

24from scrapy.http.request import Request 

25from scrapy.http.response.text import TextResponse 

26from scrapy.utils.python import is_listlike, to_bytes 

27from scrapy.utils.response import get_base_url 

28 

29if TYPE_CHECKING: 

30 # typing.Self requires Python 3.11 

31 from typing_extensions import Self 

32 

33 

# One form field: a name paired with either a single string value or an
# iterable of string values.
FormdataKVType = Tuple[str, Union[str, Iterable[str]]]

# Form data as accepted by FormRequest: a dict, a list of key-value pairs,
# or None.
FormdataType = Optional[Union[dict, List[FormdataKVType]]]

36 

37 

class FormRequest(Request):
    """Request that carries URL-encoded form data.

    When ``formdata`` is given it is URL-encoded with the request encoding
    and placed in the body (POST) or in the URL query string (any other
    method).
    """

    valid_form_methods = ["GET", "POST"]

    def __init__(
        self, *args: Any, formdata: FormdataType = None, **kwargs: Any
    ) -> None:
        """Create the request and attach ``formdata`` to it, if any.

        ``formdata`` may be a dict or a list of key-value pairs. If it is
        non-empty and no ``method`` keyword was supplied, the method
        defaults to ``"POST"``. All other arguments are forwarded to the
        parent class unchanged.
        """
        # Non-empty form data implies POST unless the caller chose a method.
        if formdata and kwargs.get("method") is None:
            kwargs["method"] = "POST"

        super().__init__(*args, **kwargs)

        if not formdata:
            return

        if isinstance(formdata, dict):
            pairs = formdata.items()
        else:
            pairs = formdata
        encoded = _urlencode(pairs, self.encoding)

        if self.method != "POST":
            # The encoded data replaces the query component of the URL.
            split_url = urlsplit(self.url)
            self._set_url(urlunsplit(split_url._replace(query=encoded)))
            return

        # An explicitly provided Content-Type header is left untouched.
        self.headers.setdefault(
            b"Content-Type", b"application/x-www-form-urlencoded"
        )
        self._set_body(encoded)

    @classmethod
    def from_response(
        cls,
        response: TextResponse,
        formname: Optional[str] = None,
        formid: Optional[str] = None,
        formnumber: int = 0,
        formdata: FormdataType = None,
        clickdata: Optional[dict] = None,
        dont_click: bool = False,
        formxpath: Optional[str] = None,
        formcss: Optional[str] = None,
        **kwargs: Any,
    ) -> Self:
        """Build a request from a ``<form>`` element found in ``response``.

        The form is located with ``formname``, ``formid``, ``formxpath`` and
        ``formnumber``. When ``formcss`` is given it is translated to XPath
        and used in place of ``formxpath``. The form's fields are combined
        with ``formdata``; unless ``dont_click`` is true, a clickable control
        (optionally selected through ``clickdata``) is included as well.

        Keyword arguments ``url`` and ``method`` override the form's own
        target and method; ``encoding`` defaults to the response encoding.
        A method outside ``valid_form_methods`` falls back to ``"GET"``.
        Remaining keyword arguments are passed to the constructor.
        """
        kwargs.setdefault("encoding", response.encoding)

        if formcss is not None:
            from parsel.csstranslator import HTMLTranslator

            # The CSS selector takes the place of any XPath passed in.
            formxpath = HTMLTranslator().css_to_xpath(formcss)

        form_element = _get_form(response, formname, formid, formnumber, formxpath)
        submitted = _get_inputs(form_element, formdata, dont_click, clickdata)
        target_url = _get_form_url(form_element, kwargs.pop("url", None))

        http_method = kwargs.pop("method", form_element.method)
        if http_method is not None:
            http_method = http_method.upper()
            if http_method not in cls.valid_form_methods:
                http_method = "GET"

        return cls(
            url=target_url, method=http_method, formdata=submitted, **kwargs
        )

94 

95 

96def _get_form_url(form: FormElement, url: Optional[str]) -> str: 

97 assert form.base_url is not None # typing 

98 if url is None: 

99 action = form.get("action") 

100 if action is None: 

101 return form.base_url 

102 return urljoin(form.base_url, strip_html5_whitespace(action)) 

103 return urljoin(form.base_url, url) 

104 

105 

106def _urlencode(seq: Iterable[FormdataKVType], enc: str) -> str: 

107 values = [ 

108 (to_bytes(k, enc), to_bytes(v, enc)) 

109 for k, vs in seq 

110 for v in (cast(Iterable[str], vs) if is_listlike(vs) else [cast(str, vs)]) 

111 ] 

112 return urlencode(values, doseq=True) 

113 

114 

def _get_form(
    response: TextResponse,
    formname: Optional[str],
    formid: Optional[str],
    formnumber: int,
    formxpath: Optional[str],
) -> FormElement:
    """Find the wanted form element within the given response.

    Criteria are tried in this order: ``formname``, ``formid``,
    ``formxpath``, ``formnumber``. A name or id that matches nothing falls
    through to the next criterion. For ``formxpath``, the first matched node
    or its nearest ``<form>`` ancestor is returned.

    Raises:
        ValueError: if the response contains no ``<form>`` element, or if
            ``formxpath`` is given and neither its first match nor any of
            that node's ancestors is a ``<form>``.
        IndexError: if ``formnumber`` is out of range.
    """
    root = create_root_node(response.text, HTMLParser, base_url=get_base_url(response))
    forms = root.xpath("//form")
    if not forms:
        raise ValueError(f"No <form> element found in {response}")

    # The name and id are passed as XPath variables instead of being pasted
    # into the expression, so values containing quotes cannot break or alter
    # the query. str() keeps the same rendering the f-string used to apply.
    if formname is not None:
        by_name = root.xpath("//form[@name=$formname]", formname=str(formname))
        if by_name:
            return cast(FormElement, by_name[0])

    if formid is not None:
        by_id = root.xpath("//form[@id=$formid]", formid=str(formid))
        if by_id:
            return cast(FormElement, by_id[0])

    # Get form element from xpath, if not found, go up
    if formxpath is not None:
        nodes = root.xpath(formxpath)
        if nodes:
            el = nodes[0]
            while el is not None:
                if el.tag == "form":
                    return cast(FormElement, el)
                el = el.getparent()
        raise ValueError(f"No <form> element found with {formxpath}")

    # If we get here, it means that either formname was None or invalid
    try:
        form = forms[formnumber]
    except IndexError:
        raise IndexError(f"Form number {formnumber} not found in {response}")
    else:
        return cast(FormElement, form)

158 

159 

160def _get_inputs( 

161 form: FormElement, 

162 formdata: FormdataType, 

163 dont_click: bool, 

164 clickdata: Optional[dict], 

165) -> List[FormdataKVType]: 

166 """Return a list of key-value pairs for the inputs found in the given form.""" 

167 try: 

168 formdata_keys = dict(formdata or ()).keys() 

169 except (ValueError, TypeError): 

170 raise ValueError("formdata should be a dict or iterable of tuples") 

171 

172 if not formdata: 

173 formdata = [] 

174 inputs = form.xpath( 

175 "descendant::textarea" 

176 "|descendant::select" 

177 "|descendant::input[not(@type) or @type[" 

178 ' not(re:test(., "^(?:submit|image|reset)$", "i"))' 

179 " and (../@checked or" 

180 ' not(re:test(., "^(?:checkbox|radio)$", "i")))]]', 

181 namespaces={"re": "http://exslt.org/regular-expressions"}, 

182 ) 

183 values: List[FormdataKVType] = [ 

184 (k, "" if v is None else v) 

185 for k, v in (_value(e) for e in inputs) 

186 if k and k not in formdata_keys 

187 ] 

188 

189 if not dont_click: 

190 clickable = _get_clickable(clickdata, form) 

191 if clickable and clickable[0] not in formdata and not clickable[0] is None: 

192 values.append(clickable) 

193 

194 if isinstance(formdata, dict): 

195 formdata = formdata.items() # type: ignore[assignment] 

196 

197 values.extend((k, v) for k, v in formdata if v is not None) 

198 return values 

199 

200 

201def _value( 

202 ele: Union[InputElement, SelectElement, TextareaElement] 

203) -> Tuple[Optional[str], Union[None, str, MultipleSelectOptions]]: 

204 n = ele.name 

205 v = ele.value 

206 if ele.tag == "select": 

207 return _select_value(cast(SelectElement, ele), n, v) 

208 return n, v 

209 

210 

211def _select_value( 

212 ele: SelectElement, n: Optional[str], v: Union[None, str, MultipleSelectOptions] 

213) -> Tuple[Optional[str], Union[None, str, MultipleSelectOptions]]: 

214 multiple = ele.multiple 

215 if v is None and not multiple: 

216 # Match browser behaviour on simple select tag without options selected 

217 # And for select tags without options 

218 o = ele.value_options 

219 return (n, o[0]) if o else (None, None) 

220 return n, v 

221 

222 

223def _get_clickable( 

224 clickdata: Optional[dict], form: FormElement 

225) -> Optional[Tuple[str, str]]: 

226 """ 

227 Returns the clickable element specified in clickdata, 

228 if the latter is given. If not, it returns the first 

229 clickable element found 

230 """ 

231 clickables = list( 

232 form.xpath( 

233 'descendant::input[re:test(@type, "^(submit|image)$", "i")]' 

234 '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]', 

235 namespaces={"re": "http://exslt.org/regular-expressions"}, 

236 ) 

237 ) 

238 if not clickables: 

239 return None 

240 

241 # If we don't have clickdata, we just use the first clickable element 

242 if clickdata is None: 

243 el = clickables[0] 

244 return (el.get("name"), el.get("value") or "") 

245 

246 # If clickdata is given, we compare it to the clickable elements to find a 

247 # match. We first look to see if the number is specified in clickdata, 

248 # because that uniquely identifies the element 

249 nr = clickdata.get("nr", None) 

250 if nr is not None: 

251 try: 

252 el = list(form.inputs)[nr] 

253 except IndexError: 

254 pass 

255 else: 

256 return (el.get("name"), el.get("value") or "") 

257 

258 # We didn't find it, so now we build an XPath expression out of the other 

259 # arguments, because they can be used as such 

260 xpath = ".//*" + "".join(f'[@{k}="{v}"]' for k, v in clickdata.items()) 

261 el = form.xpath(xpath) 

262 if len(el) == 1: 

263 return (el[0].get("name"), el[0].get("value") or "") 

264 if len(el) > 1: 

265 raise ValueError( 

266 f"Multiple elements found ({el!r}) matching the " 

267 f"criteria in clickdata: {clickdata!r}" 

268 ) 

269 else: 

270 raise ValueError(f"No clickable element matching clickdata: {clickdata!r}")