Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scrapy/http/request/form.py: 3%
131 statements
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-07 06:38 +0000
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-07 06:38 +0000
1"""
2This module implements the FormRequest class which is a more convenient class
3(than Request) to generate Requests based on form data.
5See documentation in docs/topics/request-response.rst
6"""
8from __future__ import annotations
10from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union, cast
11from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit
13from lxml.html import (
14 FormElement,
15 HTMLParser,
16 InputElement,
17 MultipleSelectOptions,
18 SelectElement,
19 TextareaElement,
20)
21from parsel.selector import create_root_node
22from w3lib.html import strip_html5_whitespace
24from scrapy.http.request import Request
25from scrapy.http.response.text import TextResponse
26from scrapy.utils.python import is_listlike, to_bytes
27from scrapy.utils.response import get_base_url
29if TYPE_CHECKING:
30 # typing.Self requires Python 3.11
31 from typing_extensions import Self
# One form field: a field name paired with either a single string value or
# an iterable of string values (for fields that repeat).
FormdataKVType = Tuple[str, Union[str, Iterable[str]]]
# Accepted ``formdata`` arguments: a dict, a list of field pairs, or None.
FormdataType = Union[dict, List[FormdataKVType], None]
class FormRequest(Request):
    """A Request whose body or query string is generated from form data.

    When ``formdata`` is given, it is URL-encoded and either sent as the
    request body (for POST requests) or written into the query string of the
    request URL (for any other method).
    """

    # Methods accepted when building a request from an HTML form; anything
    # else found by ``from_response`` is replaced with GET.
    valid_form_methods = ["GET", "POST"]

    def __init__(
        self, *args: Any, formdata: FormdataType = None, **kwargs: Any
    ) -> None:
        """Create the request and apply ``formdata`` to its body or URL.

        :param formdata: dict or list of ``(name, value)`` pairs to encode.
            When non-empty and no ``method`` keyword was supplied, the
            request method is set to POST.
        """
        # Form data without an explicit method means a form submission.
        if formdata and kwargs.get("method") is None:
            kwargs["method"] = "POST"

        super().__init__(*args, **kwargs)

        if not formdata:
            return

        pairs = formdata.items() if isinstance(formdata, dict) else formdata
        encoded = _urlencode(pairs, self.encoding)

        if self.method != "POST":
            # Non-POST: the encoded data replaces the URL's query component.
            rewritten = urlsplit(self.url)._replace(query=encoded)
            self._set_url(urlunsplit(rewritten))
            return

        # POST: send the encoded data as the body; an already-present
        # Content-Type header is left untouched.
        self.headers.setdefault(
            b"Content-Type", b"application/x-www-form-urlencoded"
        )
        self._set_body(encoded)

    @classmethod
    def from_response(
        cls,
        response: TextResponse,
        formname: Optional[str] = None,
        formid: Optional[str] = None,
        formnumber: int = 0,
        formdata: FormdataType = None,
        clickdata: Optional[dict] = None,
        dont_click: bool = False,
        formxpath: Optional[str] = None,
        formcss: Optional[str] = None,
        **kwargs: Any,
    ) -> Self:
        """Build a request pre-filled from a ``<form>`` found in ``response``.

        The form is located via ``formname``, ``formid``, ``formxpath`` /
        ``formcss`` or ``formnumber``; its input values are merged with
        ``formdata``; and, unless ``dont_click`` is true, a clickable
        control (optionally selected through ``clickdata``) is included.
        ``url`` and ``method`` keywords override the values taken from the
        form. Remaining keyword arguments are passed to the constructor.
        """
        kwargs.setdefault("encoding", response.encoding)

        # A CSS selector is translated to XPath and replaces ``formxpath``.
        if formcss is not None:
            from parsel.csstranslator import HTMLTranslator

            formxpath = HTMLTranslator().css_to_xpath(formcss)

        form_el = _get_form(response, formname, formid, formnumber, formxpath)
        fields = _get_inputs(form_el, formdata, dont_click, clickdata)
        target = _get_form_url(form_el, kwargs.pop("url", None))

        verb = kwargs.pop("method", form_el.method)
        if verb is not None:
            verb = verb.upper()
            # Unsupported form methods fall back to GET.
            if verb not in cls.valid_form_methods:
                verb = "GET"

        return cls(url=target, method=verb, formdata=fields, **kwargs)
def _get_form_url(form: FormElement, url: Optional[str]) -> str:
    """Return the absolute URL the form should be submitted to.

    An explicit ``url`` is resolved against the form's base URL. Without
    one, the form's ``action`` attribute is used (surrounding HTML5
    whitespace stripped); a form with no ``action`` submits to its base URL.
    """
    base = form.base_url
    assert base is not None  # typing

    if url is not None:
        return urljoin(base, url)

    action = form.get("action")
    if action is None:
        return base
    return urljoin(base, strip_html5_whitespace(action))
def _urlencode(seq: Iterable[FormdataKVType], enc: str) -> str:
    """URL-encode ``(name, value)`` pairs, converting both to bytes in ``enc``.

    A list-like value is expanded into one pair per item, so the field name
    is repeated in the resulting query string.
    """
    encoded_pairs = []
    for key, raw in seq:
        if is_listlike(raw):
            candidates = cast(Iterable[str], raw)
        else:
            candidates = [cast(str, raw)]
        for item in candidates:
            encoded_pairs.append((to_bytes(key, enc), to_bytes(item, enc)))
    return urlencode(encoded_pairs, doseq=True)
def _get_form(
    response: TextResponse,
    formname: Optional[str],
    formid: Optional[str],
    formnumber: int,
    formxpath: Optional[str],
) -> FormElement:
    """Find the wanted form element within the given response.

    Selectors are tried in this order: ``formname``, ``formid``,
    ``formxpath``, and finally ``formnumber`` (an index into all forms of the
    document). A ``formname`` or ``formid`` that matches nothing is skipped
    and the next selector is tried.

    Raises:
        ValueError: the response contains no ``<form>`` at all, or
            ``formxpath`` was given but neither its first match nor any of
            that node's ancestors is a ``<form>``.
        IndexError: ``formnumber`` is out of range.
    """
    root = create_root_node(response.text, HTMLParser, base_url=get_base_url(response))
    forms = root.xpath("//form")
    if not forms:
        raise ValueError(f"No <form> element found in {response}")

    if formname is not None:
        # The value is bound as an XPath variable rather than interpolated
        # into the expression, so names containing quotes cannot break (or
        # alter) the query.
        f = root.xpath("//form[@name=$formname]", formname=formname)
        if f:
            return cast(FormElement, f[0])

    if formid is not None:
        f = root.xpath("//form[@id=$formid]", formid=formid)
        if f:
            return cast(FormElement, f[0])

    # Get form element from xpath, if not found, go up
    if formxpath is not None:
        nodes = root.xpath(formxpath)
        if nodes:
            el = nodes[0]
            while el is not None:
                if el.tag == "form":
                    return cast(FormElement, el)
                el = el.getparent()
        raise ValueError(f"No <form> element found with {formxpath}")

    # If we get here, it means that either formname was None or invalid
    try:
        form = forms[formnumber]
    except IndexError:
        raise IndexError(f"Form number {formnumber} not found in {response}")
    else:
        return cast(FormElement, form)
def _get_inputs(
    form: FormElement,
    formdata: FormdataType,
    dont_click: bool,
    clickdata: Optional[dict],
) -> List[FormdataKVType]:
    """Return a list of key-value pairs for the inputs found in the given form.

    The result is built from three sources, in this order:

    1. the form's own ``textarea``, ``select`` and ``input`` controls,
       excluding submit/image/reset inputs and unchecked checkboxes/radios,
       and excluding any control whose name is a key of ``formdata``;
    2. unless ``dont_click`` is true, the clickable control chosen by
       ``_get_clickable`` (skipped when it has no name or when its name is
       a key of ``formdata``);
    3. the ``formdata`` entries themselves, dropping entries whose value is
       ``None``.

    Raises:
        ValueError: ``formdata`` is neither a dict nor an iterable of pairs.
    """
    try:
        formdata_keys = dict(formdata or ()).keys()
    except (ValueError, TypeError):
        raise ValueError("formdata should be a dict or iterable of tuples")

    if not formdata:
        formdata = []
    # Inside the ``@type[...]`` predicate the context node is the attribute
    # itself, hence ``../@checked`` to reach the owning element's attribute.
    inputs = form.xpath(
        "descendant::textarea"
        "|descendant::select"
        "|descendant::input[not(@type) or @type["
        ' not(re:test(., "^(?:submit|image|reset)$", "i"))'
        " and (../@checked or"
        ' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
        namespaces={"re": "http://exslt.org/regular-expressions"},
    )
    values: List[FormdataKVType] = [
        (k, "" if v is None else v)
        for k, v in (_value(e) for e in inputs)
        if k and k not in formdata_keys
    ]

    if not dont_click:
        clickable = _get_clickable(clickdata, form)
        # Compare against the key view, not ``formdata`` itself: when
        # ``formdata`` is a list of (key, value) tuples, a membership test
        # on the list would never match a bare name.
        if (
            clickable
            and clickable[0] is not None
            and clickable[0] not in formdata_keys
        ):
            values.append(clickable)

    if isinstance(formdata, dict):
        formdata = formdata.items()  # type: ignore[assignment]

    values.extend((k, v) for k, v in formdata if v is not None)
    return values
def _value(
    ele: Union[InputElement, SelectElement, TextareaElement]
) -> Tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
    """Return the ``(name, value)`` pair of a form control.

    ``select`` elements are delegated to ``_select_value``; every other
    element yields its ``name`` and ``value`` attributes unchanged.
    """
    name, current = ele.name, ele.value
    if ele.tag != "select":
        return name, current
    return _select_value(cast(SelectElement, ele), name, current)
def _select_value(
    ele: SelectElement, n: Optional[str], v: Union[None, str, MultipleSelectOptions]
) -> Tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
    """Return the ``(name, value)`` pair to submit for a ``select`` element.

    A single-choice select with nothing selected submits its first option,
    matching browser behaviour; if it has no options at all, ``(None, None)``
    is returned. In every other case ``(n, v)`` is returned as given.
    """
    is_multiple = ele.multiple
    if is_multiple or v is not None:
        return n, v

    options = ele.value_options
    if options:
        return n, options[0]
    return None, None
def _get_clickable(
    clickdata: Optional[dict], form: FormElement
) -> Optional[Tuple[str, str]]:
    """
    Returns the clickable element specified in clickdata,
    if the latter is given. If not, it returns the first
    clickable element found

    The result is a ``(name, value)`` pair; a missing ``value`` attribute
    becomes ``""``. The name is whatever the element's ``name`` attribute
    holds, so it is ``None`` for an unnamed control. ``None`` is returned
    when the form has no submit/image input and no submit button.

    Raises:
        ValueError: ``clickdata`` matches no element, or more than one.
    """
    clickables = list(
        form.xpath(
            'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
            '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
            namespaces={"re": "http://exslt.org/regular-expressions"},
        )
    )
    if not clickables:
        return None

    # If we don't have clickdata, we just use the first clickable element
    if clickdata is None:
        el = clickables[0]
        return (el.get("name"), el.get("value") or "")

    # If clickdata is given, we compare it to the clickable elements to find a
    # match. We first look to see if the number is specified in clickdata,
    # because that uniquely identifies the element
    nr = clickdata.get("nr", None)
    if nr is not None:
        try:
            el = list(form.inputs)[nr]
        except IndexError:
            pass
        else:
            return (el.get("name"), el.get("value") or "")

    # We didn't find it, so now we build an XPath expression out of the other
    # arguments, because they can be used as such. Attribute names have to be
    # part of the expression, but the values are bound as XPath variables so
    # that quotes inside a value cannot produce an invalid or altered query.
    predicates = []
    variables = {}
    for index, (k, v) in enumerate(clickdata.items()):
        variable = f"clickdata{index}"
        predicates.append(f"[@{k}=${variable}]")
        # Values are compared as strings, as when they were interpolated.
        variables[variable] = f"{v}"
    el = form.xpath(".//*" + "".join(predicates), **variables)

    if len(el) == 1:
        return (el[0].get("name"), el[0].get("value") or "")
    if len(el) > 1:
        raise ValueError(
            f"Multiple elements found ({el!r}) matching the "
            f"criteria in clickdata: {clickdata!r}"
        )
    raise ValueError(f"No clickable element matching clickdata: {clickdata!r}")