1# encoding: utf-8
2from __future__ import annotations
3
4# Use of this source code is governed by the MIT license.
5__license__ = "MIT"
6
7__all__ = [
8 "LXMLTreeBuilderForXML",
9 "LXMLTreeBuilder",
10]
11
12
13from typing import (
14 Any,
15 Dict,
16 Iterable,
17 List,
18 Optional,
19 Set,
20 Tuple,
21 Type,
22 TYPE_CHECKING,
23 Union,
24)
25from typing_extensions import TypeAlias
26
27from io import BytesIO
28from io import StringIO
29from lxml import etree
30from bs4.element import (
31 AttributeDict,
32 XMLAttributeDict,
33 Comment,
34 Doctype,
35 NamespacedAttribute,
36 ProcessingInstruction,
37 XMLProcessingInstruction,
38)
39from bs4.builder import (
40 DetectsXMLParsedAsHTML,
41 FAST,
42 HTML,
43 HTMLTreeBuilder,
44 PERMISSIVE,
45 TreeBuilder,
46 XML,
47)
48from bs4.dammit import EncodingDetector
49from bs4.exceptions import ParserRejectedMarkup
50
51if TYPE_CHECKING:
52 from bs4._typing import (
53 _Encoding,
54 _Encodings,
55 _NamespacePrefix,
56 _NamespaceURL,
57 _NamespaceMapping,
58 _InvertedNamespaceMapping,
59 _RawMarkup,
60 )
61 from bs4 import BeautifulSoup
62
# Feature name shared by both builders defined in this module.
LXML: str = "lxml"
64
65
66def _invert(d: dict[Any, Any]) -> dict[Any, Any]:
67 "Invert a dictionary."
68 return dict((v, k) for k, v in list(d.items()))
69
70
# An instantiated lxml parser of either flavor.
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
# What default_parser()/parser_for() may return: either a ready parser
# object, or a parser class to be instantiated with default arguments.
_ParserOrParserClass: TypeAlias = Union[
    _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
]
75
76
class LXMLTreeBuilderForXML(TreeBuilder):
    """A `TreeBuilder` that uses lxml's XML parser.

    An instance of this class is passed to ``etree.XMLParser`` as its
    ``target``: lxml then invokes the event-handler methods defined
    below (``start``, ``end``, ``pi``, ``data``, ``doctype``,
    ``comment``, ``close``) as it parses, and each handler forwards
    the event to the associated `BeautifulSoup` object.
    """

    # Parser class instantiated by default_parser() when no parser (or
    # parser factory) was supplied to the constructor.
    DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser

    is_xml: bool = True

    # Set in __init__: XMLProcessingInstruction when is_xml is True,
    # plain ProcessingInstruction otherwise.
    processing_instruction_class: Type[ProcessingInstruction]

    NAME: str = "lxml-xml"
    ALTERNATE_NAMES: Iterable[str] = ["xml"]

    # Well, it's permissive by XML parser standards.
    features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]

    # feed() pushes markup into lxml in chunks of this many
    # characters/bytes.
    CHUNK_SIZE: int = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")

    DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)

    # A stack of inverted (URL -> prefix) namespace mappings. Once
    # non-default namespaces are in play, start() pushes one entry per
    # open tag (None when the tag introduces no new namespaces) and
    # end() pops it; see those methods.
    nsmaps: List[Optional[_InvertedNamespaceMapping]]
    empty_element_tags: Set[str]
    # The underlying lxml parser object; (re)created in feed().
    parser: Any
    _default_parser: Optional[etree.XMLParser]

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        # Beyond this point, self.soup is set, so we can assume (and
        # assert) it's not None whenever necessary.
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping: Dict[str, str]) -> None:
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        assert self.soup is not None
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Find the default parser for the given encoding.

        :param encoding: The encoding the parser should assume, or
            None to let lxml detect it.
        :return: Either a parser object or a class, which
            will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return self.DEFAULT_PARSER_CLASS(target=self, recover=True, encoding=encoding)

    def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, recover=True, encoding=encoding)
        return parser

    def __init__(
        self,
        parser: Optional[etree.XMLParser] = None,
        empty_element_tags: Optional[Set[str]] = None,
        **kwargs: Any,
    ):
        """Constructor.

        :param parser: An lxml parser object -- or a callable that
            creates one -- to use instead of the default.
        :param empty_element_tags: NOTE(review): accepted here but
            never referenced in this body and not passed to super();
            confirm whether callers rely on it.
        :param kwargs: Passed through to `TreeBuilder`.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        if self.is_xml:
            self.processing_instruction_class = XMLProcessingInstruction
        else:
            self.processing_instruction_class = ProcessingInstruction

        # XML documents get XML-style attribute handling unless the
        # caller asked for something else.
        if "attribute_dict_class" not in kwargs:
            kwargs["attribute_dict_class"] = XMLAttributeDict
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == "{":
            namespace, name = tag[1:].split("}", 1)
            return (namespace, name)
        else:
            return (None, tag)

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[
        Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
    ]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

        Each 4-tuple represents a strategy for converting the
        document to Unicode and parsing it. Each strategy will be tried
        in turn.
        """
        if not self.is_xml:
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)

            # Since the document was Unicode in the first place, there
            # is no need to try any more strategies; we know this will
            # work.
            return

        known_definite_encodings: List[_Encoding] = []
        if user_specified_encoding:
            # This was provided by the end-user; treat it as a known
            # definite encoding per the algorithm laid out in the
            # HTML5 spec. (See the EncodingDetector class for
            # details.)
            known_definite_encodings.append(user_specified_encoding)

        user_encodings: List[_Encoding] = []
        if document_declared_encoding:
            # This was found in the document; treat it as a slightly
            # lower-priority user encoding.
            user_encodings.append(document_declared_encoding)

        detector = EncodingDetector(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=not self.is_xml,
            exclude_encodings=exclude_encodings,
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup: _RawMarkup) -> None:
        """Feed the markup to a freshly created lxml parser, in
        CHUNK_SIZE pieces.

        :param markup: A string or bytestring.
        :raises ParserRejectedMarkup: If lxml cannot make sense of the
            markup, or decoding/encoding-lookup fails.
        """
        io: Union[BytesIO, StringIO]
        if isinstance(markup, bytes):
            io = BytesIO(markup)
        elif isinstance(markup, str):
            io = StringIO(markup)
        # NOTE(review): if markup is neither bytes nor str, `io` is
        # never bound and io.read() below raises NameError; callers
        # presumably guarantee one of the two types -- confirm.

        # initialize_soup is called before feed, so we know this
        # is not None.
        assert self.soup is not None

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = io.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = io.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def close(self) -> None:
        """Reset the namespace stack to the default mapping."""
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(
        self,
        tag: str | bytes,
        attrs: Dict[str | bytes, str | bytes],
        nsmap: _NamespaceMapping = {},
    ) -> None:
        """Called by lxml for each opening tag.

        :param tag: The tag name, possibly in lxml's
            "{namespace-url}localname" notation.
        :param attrs: The tag's attributes.
        :param nsmap: Namespace prefix->URL mappings newly declared on
            this tag. (The shared ``{}`` default is only ever read
            here, never mutated, so it is safe.)
        """
        # This is called by lxml code as a result of calling
        # BeautifulSoup.feed(), and we know self.soup is set by the time feed()
        # is called.
        assert self.soup is not None
        assert isinstance(tag, str)

        # We need to recreate the attribute dict for three
        # reasons. First, for type checking, so we can assert there
        # are no bytestrings in the keys or values. Second, because we
        # need a mutable dict--lxml might send us an immutable
        # dictproxy. Third, so we can handle namespaced attribute
        # names by converting the keys to NamespacedAttributes.
        new_attrs: Dict[Union[str, NamespacedAttribute], str] = (
            self.attribute_dict_class()
        )
        for k, v in attrs.items():
            assert isinstance(k, str)
            assert isinstance(v, str)
            new_attrs[k] = v

        nsprefix: Optional[_NamespacePrefix] = None
        namespace: Optional[_NamespaceURL] = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if "" in current_mapping:
                del current_mapping[""]
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
                )
                new_attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        final_attrs: AttributeDict = self.attribute_dict_class()
        for attr, value in list(new_attrs.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                final_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                final_attrs[attr] = value

        namespace, tag = self._getNsTag(tag)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            tag,
            namespace,
            nsprefix,
            final_attrs,
            namespaces=self.active_namespace_prefixes[-1],
        )

    def _prefix_for_namespace(
        self, namespace: Optional[_NamespaceURL]
    ) -> Optional[_NamespacePrefix]:
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        # Search innermost scope first.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name: str | bytes) -> None:
        """Called by lxml for each closing tag.

        :param name: The tag name, possibly in lxml's
            "{namespace-url}localname" notation.
        """
        assert self.soup is not None
        assert isinstance(name, str)
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            # Find the prefix for this namespace, innermost scope first.
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target: str, data: str) -> None:
        """Called by lxml for each processing instruction.

        The PI's target and data are stored as a single string,
        "target data", wrapped in processing_instruction_class.
        """
        assert self.soup is not None
        self.soup.endData()
        data = target + " " + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, data: str | bytes) -> None:
        """Called by lxml when textual data is encountered."""
        assert self.soup is not None
        assert isinstance(data, str)
        self.soup.handle_data(data)

    def doctype(self, name: str, pubid: str, system: str) -> None:
        """Called by lxml when a doctype declaration is encountered;
        stores it as a Doctype object."""
        assert self.soup is not None
        self.soup.endData()
        doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
        self.soup.handle_data(doctype_string)
        self.soup.endData(containerClass=Doctype)

    def comment(self, text: str | bytes) -> None:
        "Handle comments as Comment objects."
        assert self.soup is not None
        assert isinstance(text, str)
        self.soup.endData()
        self.soup.handle_data(text)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
466
467
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """The lxml-based HTML builder: identical to the XML builder above
    except that it drives lxml's HTMLParser and feeds the markup in a
    single call rather than in chunks."""

    NAME: str = LXML
    ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]

    is_xml: bool = False
    features: Iterable[str] = [*ALTERNATE_NAMES, NAME, HTML, FAST, PERMISSIVE]

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Always hand back lxml's HTML parser class; parser_for() will
        instantiate it with the appropriate arguments."""
        return etree.HTMLParser

    def feed(self, markup: _RawMarkup) -> None:
        """Parse the whole document in one shot.

        :raises ParserRejectedMarkup: If lxml cannot make sense of the
            markup, or decoding/encoding-lookup fails.
        """
        # We know self.soup is set by the time feed() is called.
        assert self.soup is not None
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as cause:
            raise ParserRejectedMarkup(cause)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return "<html><body>%s</body></html>" % fragment