1# encoding: utf-8
2from __future__ import annotations
3
4# Use of this source code is governed by the MIT license.
5__license__ = "MIT"
6
7__all__ = [
8 "LXMLTreeBuilderForXML",
9 "LXMLTreeBuilder",
10]
11
12
13from typing import (
14 Any,
15 Dict,
16 Iterable,
17 List,
18 Optional,
19 Set,
20 Tuple,
21 Type,
22 TYPE_CHECKING,
23 Union,
24)
25
26from io import BytesIO
27from io import StringIO
28
29from typing_extensions import TypeAlias
30
31from lxml import etree # type:ignore
32from bs4.element import (
33 AttributeDict,
34 XMLAttributeDict,
35 Comment,
36 Doctype,
37 NamespacedAttribute,
38 ProcessingInstruction,
39 XMLProcessingInstruction,
40)
41from bs4.builder import (
42 DetectsXMLParsedAsHTML,
43 FAST,
44 HTML,
45 HTMLTreeBuilder,
46 PERMISSIVE,
47 TreeBuilder,
48 XML,
49)
50from bs4.dammit import EncodingDetector
51from bs4.exceptions import ParserRejectedMarkup
52
53if TYPE_CHECKING:
54 from bs4._typing import (
55 _Encoding,
56 _Encodings,
57 _NamespacePrefix,
58 _NamespaceURL,
59 _NamespaceMapping,
60 _InvertedNamespaceMapping,
61 _RawMarkup,
62 )
63 from bs4 import BeautifulSoup
64
# Feature name shared by both builders in this module; passing
# features="lxml" to BeautifulSoup selects one of them.
LXML: str = "lxml"
66
67
68def _invert(d: dict[Any, Any]) -> dict[Any, Any]:
69 "Invert a dictionary."
70 return dict((v, k) for k, v in list(d.items()))
71
72
# Type aliases for what default_parser() may hand back: either an
# already-instantiated lxml parser, or a parser class that
# parser_for() will instantiate on demand.
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
_ParserOrParserClass: TypeAlias = Union[
    _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
]
77
78
class LXMLTreeBuilderForXML(TreeBuilder):
    """A TreeBuilder that uses lxml's ``etree.XMLParser`` in event-driven
    (target) mode to parse documents as XML.
    """

    DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser

    is_xml: bool = True

    # Assigned in __init__ based on is_xml, so the HTML subclass gets
    # plain ProcessingInstruction objects instead.
    processing_instruction_class: Type[ProcessingInstruction]

    NAME: str = "lxml-xml"
    ALTERNATE_NAMES: Iterable[str] = ["xml"]

    # Well, it's permissive by XML parser standards.
    features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Markup is fed to the underlying lxml parser in chunks of this
    # many bytes/characters (see feed()).
    CHUNK_SIZE: int = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")

    DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)

    # A stack of inverted (URL -> prefix) namespace mappings, pushed in
    # start() and popped in end(); a None entry marks a tag that
    # introduced no new namespaces.
    nsmaps: List[Optional[_InvertedNamespaceMapping]]
    empty_element_tags: Optional[Set[str]]
    # The lxml parser object currently in use; created in feed().
    parser: Any
    # A parser (or parser factory) supplied by the caller of __init__,
    # or None to use DEFAULT_PARSER_CLASS.
    _default_parser: Optional[etree.XMLParser]

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        # Beyond this point, self.soup is set, so we can assume (and
        # assert) it's not None whenever necessary.
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping: Dict[str, str]) -> None:
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        assert self.soup is not None
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Find the default parser for the given encoding.

        :param encoding: The encoding lxml should assume for the document,
            or None to let lxml figure it out.
        :return: Either a parser object or a class, which
            will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return self.DEFAULT_PARSER_CLASS(target=self, recover=True, encoding=encoding)

    def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, recover=True, encoding=encoding)
        return parser

    def __init__(
        self,
        parser: Optional[etree.XMLParser] = None,
        empty_element_tags: Optional[Set[str]] = None,
        **kwargs: Any,
    ):
        """Constructor.

        :param parser: An lxml parser (or parser class) to use instead
            of DEFAULT_PARSER_CLASS.
        :param empty_element_tags: Tags to treat as empty-element tags.
        :param kwargs: Passed through to the TreeBuilder constructor.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        if self.is_xml:
            self.processing_instruction_class = XMLProcessingInstruction
        else:
            self.processing_instruction_class = ProcessingInstruction

        if "attribute_dict_class" not in kwargs:
            kwargs["attribute_dict_class"] = XMLAttributeDict
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
        """Split the namespace URL out of a fully-qualified lxml tag
        name, e.g. '{http://ns}name' -> ('http://ns', 'name').

        Adapted from lxml's src/lxml/sax.py.
        """
        # startswith() is safe on an empty tag name, unlike tag[0],
        # which would raise IndexError.
        if tag.startswith("{") and "}" in tag:
            namespace, name = tag[1:].split("}", 1)
            return (namespace, name)
        return (None, tag)

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[
        Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
    ]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

            Each 4-tuple represents a strategy for converting the
            document to Unicode and parsing it. Each strategy will be tried
            in turn.
        """
        if not self.is_xml:
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)

            # Since the document was Unicode in the first place, there
            # is no need to try any more strategies; we know this will
            # work.
            return

        known_definite_encodings: List[_Encoding] = []
        if user_specified_encoding:
            # This was provided by the end-user; treat it as a known
            # definite encoding per the algorithm laid out in the
            # HTML5 spec. (See the EncodingDetector class for
            # details.)
            known_definite_encodings.append(user_specified_encoding)

        user_encodings: List[_Encoding] = []
        if document_declared_encoding:
            # This was found in the document; treat it as a slightly
            # lower-priority user encoding.
            user_encodings.append(document_declared_encoding)

        detector = EncodingDetector(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=not self.is_xml,
            exclude_encodings=exclude_encodings,
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup: _RawMarkup) -> None:
        """Run incoming markup through the lxml parser, chunk by chunk.

        :param markup: A string or bytestring containing the document.
        :raises ParserRejectedMarkup: If lxml cannot parse the markup.
        :raises TypeError: If markup is neither str nor bytes.
        """
        io: Union[BytesIO, StringIO]
        if isinstance(markup, bytes):
            io = BytesIO(markup)
        elif isinstance(markup, str):
            io = StringIO(markup)
        else:
            # Previously this fell through and raised a confusing
            # UnboundLocalError on the first use of `io`; fail with a
            # clear message instead.
            raise TypeError(
                "Expected markup as str or bytes, got %s." % type(markup).__name__
            )

        # initialize_soup is called before feed, so we know this
        # is not None.
        assert self.soup is not None

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = io.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = io.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def close(self) -> None:
        """lxml event handler: the document is finished.

        Reset the namespace stack to its initial state.
        """
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(
        self,
        tag: str | bytes,
        attrib: Dict[str | bytes, str | bytes],
        nsmap: Optional[_NamespaceMapping] = None,
    ) -> None:
        """lxml event handler: an opening tag was encountered.

        :param tag: The (possibly namespace-qualified) tag name.
        :param attrib: The tag's attributes.
        :param nsmap: Namespace prefixes newly introduced by this tag.
        """
        # Avoid a shared mutable default argument; lxml may call this
        # method without an nsmap. (The dict is never mutated here, so
        # this is purely defensive and behavior-compatible.)
        if nsmap is None:
            nsmap = {}

        # This is called by lxml code as a result of calling
        # BeautifulSoup.feed(), and we know self.soup is set by the time feed()
        # is called.
        assert self.soup is not None
        assert isinstance(tag, str)

        # We need to recreate the attribute dict for three
        # reasons. First, for type checking, so we can assert there
        # are no bytestrings in the keys or values. Second, because we
        # need a mutable dict--lxml might send us an immutable
        # dictproxy. Third, so we can handle namespaced attribute
        # names by converting the keys to NamespacedAttributes.
        new_attrib: Dict[Union[str, NamespacedAttribute], str] = (
            self.attribute_dict_class()
        )
        for k, v in attrib.items():
            assert isinstance(k, str)
            assert isinstance(v, str)
            new_attrib[k] = v

        nsprefix: Optional[_NamespacePrefix] = None
        namespace: Optional[_NamespaceURL] = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if "" in current_mapping:
                del current_mapping[""]
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
                )
                new_attrib[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        final_attrib: AttributeDict = self.attribute_dict_class()
        for attr, value in list(new_attrib.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                final_attrib[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                final_attrib[attr] = value

        namespace, tag = self._getNsTag(tag)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            tag,
            namespace,
            nsprefix,
            final_attrib,
            namespaces=self.active_namespace_prefixes[-1],
        )

    def _prefix_for_namespace(
        self, namespace: Optional[_NamespaceURL]
    ) -> Optional[_NamespacePrefix]:
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        # Search from the innermost (most recently pushed) scope out.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, tag: str | bytes) -> None:
        """lxml event handler: a closing tag was encountered.

        :param tag: The (possibly namespace-qualified) tag name.
        """
        assert self.soup is not None
        assert isinstance(tag, str)
        # Flush any pending text into a NavigableString before closing
        # the tag.
        self.soup.endData()
        namespace, tag = self._getNsTag(tag)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(tag, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target: str, data: str) -> None:
        """lxml event handler: a processing instruction was encountered."""
        assert self.soup is not None
        self.soup.endData()
        data = target + " " + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, data: str | bytes) -> None:
        """lxml event handler: textual data was encountered."""
        assert self.soup is not None
        assert isinstance(data, str)
        self.soup.handle_data(data)

    def doctype(self, name: str, pubid: str, system: str) -> None:
        """lxml event handler: a DOCTYPE declaration was encountered."""
        assert self.soup is not None
        self.soup.endData()
        doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
        self.soup.handle_data(doctype_string)
        self.soup.endData(containerClass=Doctype)

    def comment(self, text: str | bytes) -> None:
        "Handle comments as Comment objects."
        assert self.soup is not None
        assert isinstance(text, str)
        self.soup.endData()
        self.soup.handle_data(text)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
467
468
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """A TreeBuilder that uses lxml's HTMLParser to parse documents
    as HTML rather than XML.
    """

    NAME: str = LXML
    ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]

    features: Iterable[str] = [*ALTERNATE_NAMES, NAME, HTML, FAST, PERMISSIVE]
    is_xml: bool = False

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Always use lxml's HTMLParser class; parser_for() will
        instantiate it with the appropriate encoding."""
        return etree.HTMLParser

    def feed(self, markup: _RawMarkup) -> None:
        """Run the markup through a freshly instantiated HTML parser.

        :param markup: A string or bytestring containing the document.
        :raises ParserRejectedMarkup: If lxml cannot parse the markup.
        """
        # feed() is only called after initialize_soup(), so self.soup
        # must be set by now.
        assert self.soup is not None
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return f"<html><body>{fragment}</body></html>"