# encoding: utf-8
from __future__ import annotations

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    "LXMLTreeBuilderForXML",
    "LXMLTreeBuilder",
]

from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Set,
    Tuple,
    Type,
    TYPE_CHECKING,
    Union,
)

from io import BytesIO
from io import StringIO

from typing_extensions import TypeAlias

from lxml import etree  # type:ignore
from bs4.element import (
    AttributeDict,
    XMLAttributeDict,
    Comment,
    Doctype,
    NamespacedAttribute,
    ProcessingInstruction,
    XMLProcessingInstruction,
)
from bs4.builder import (
    DetectsXMLParsedAsHTML,
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    TreeBuilder,
    XML,
)
from bs4.dammit import EncodingDetector
from bs4.exceptions import ParserRejectedMarkup

if TYPE_CHECKING:
    from bs4._typing import (
        _Encoding,
        _Encodings,
        _NamespacePrefix,
        _NamespaceURL,
        _NamespaceMapping,
        _InvertedNamespaceMapping,
        _RawMarkup,
    )
    from bs4 import BeautifulSoup

LXML: str = "lxml"


def _invert(d: dict[Any, Any]) -> dict[Any, Any]:
    "Invert a dictionary."
    return dict((v, k) for k, v in list(d.items()))
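# Illustrative sketch (not part of the original module): _invert turns a
# prefix -> URL mapping into a URL -> prefix mapping, which is how the
# builder resolves a namespace URL back to its prefix during parsing.
#
#   _invert({"xml": "http://www.w3.org/XML/1998/namespace"})
#   # -> {"http://www.w3.org/XML/1998/namespace": "xml"}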
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
_ParserOrParserClass: TypeAlias = Union[
    _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
]


class LXMLTreeBuilderForXML(TreeBuilder):
    DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser

    is_xml: bool = True

    #: Set this to true (probably by passing huge_tree=True into the
    #: BeautifulSoup constructor) to enable the lxml feature "disable security
    #: restrictions and support very deep trees and very long text
    #: content".
    huge_tree: bool

    processing_instruction_class: Type[ProcessingInstruction]

    NAME: str = "lxml-xml"
    ALTERNATE_NAMES: Iterable[str] = ["xml"]

    # Well, it's permissive by XML parser standards.
    features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]

    CHUNK_SIZE: int = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS: _NamespaceMapping = dict(
        xml="http://www.w3.org/XML/1998/namespace"
    )

    DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)

    nsmaps: List[Optional[_InvertedNamespaceMapping]]
    empty_element_tags: Optional[Set[str]]
    parser: Any
    _default_parser: Optional[etree.XMLParser]

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        # Beyond this point, self.soup is set, so we can assume (and
        # assert) it's not None whenever necessary.
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping: Dict[str, str]) -> None:
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        assert self.soup is not None
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value
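    # Illustrative sketch (not part of the original module): after parsing
    # markup such as '<root xmlns:dc="http://purl.org/dc/elements/1.1/"/>'
    # with this builder, soup._namespaces would hold the standard "xml"
    # entry registered by initialize_soup plus
    # {"dc": "http://purl.org/dc/elements/1.1/"}, which can later be used
    # for namespaced CSS selectors.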
    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Find the default parser for the given encoding.

        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return self.DEFAULT_PARSER_CLASS(
            target=self, recover=True, huge_tree=self.huge_tree, encoding=encoding
        )

    def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if callable(parser):
            # Instantiate the parser with default arguments.
            parser = parser(
                target=self, recover=True, huge_tree=self.huge_tree, encoding=encoding
            )
        return parser

    def __init__(
        self,
        parser: Optional[etree.XMLParser] = None,
        empty_element_tags: Optional[Set[str]] = None,
        huge_tree: bool = False,
        **kwargs: Any,
    ):
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        if self.is_xml:
            self.processing_instruction_class = XMLProcessingInstruction
        else:
            self.processing_instruction_class = ProcessingInstruction

        if "attribute_dict_class" not in kwargs:
            kwargs["attribute_dict_class"] = XMLAttributeDict
        self.huge_tree = huge_tree

        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == "{" and "}" in tag:
            namespace, name = tag[1:].split("}", 1)
            return (namespace, name)
        return (None, tag)
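    # Illustrative sketch (not part of the original module): lxml reports
    # namespaced names in "Clark notation", {namespace-url}localname, which
    # _getNsTag splits apart:
    #
    #   _getNsTag("{http://www.w3.org/1999/xhtml}body")
    #   # -> ("http://www.w3.org/1999/xhtml", "body")
    #   _getNsTag("body")
    #   # -> (None, "body")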
    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[
        Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
    ]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
          in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
          these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
          has undergone character replacement)

          Each 4-tuple represents a strategy for converting the
          document to Unicode and parsing it. Each strategy will be tried
          in turn.
        """
        if not self.is_xml:
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)

            # Since the document was Unicode in the first place, there
            # is no need to try any more strategies; we know this will
            # work.
            return

        known_definite_encodings: List[_Encoding] = []
        if user_specified_encoding:
            # This was provided by the end-user; treat it as a known
            # definite encoding per the algorithm laid out in the
            # HTML5 spec. (See the EncodingDetector class for
            # details.)
            known_definite_encodings.append(user_specified_encoding)

        user_encodings: List[_Encoding] = []
        if document_declared_encoding:
            # This was found in the document; treat it as a slightly
            # lower-priority user encoding.
            user_encodings.append(document_declared_encoding)

        detector = EncodingDetector(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=not self.is_xml,
            exclude_encodings=exclude_encodings,
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)
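    # Illustrative sketch (not part of the original module): for a bytestring
    # document parsed with user_specified_encoding="utf-8", this generator
    # would yield something like
    #
    #   (b"<root/>", "utf-8", None, False)    # user-specified encoding first
    #   (b"<root/>", "iso-8859-1", None, False)  # then detector guesses
    #
    # and the caller (the BeautifulSoup constructor) tries each strategy in
    # turn, moving on whenever the parser raises ParserRejectedMarkup.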
    def feed(self, markup: _RawMarkup) -> None:
        io: Union[BytesIO, StringIO]
        if isinstance(markup, bytes):
            io = BytesIO(markup)
        elif isinstance(markup, str):
            io = StringIO(markup)

        # initialize_soup is called before feed, so we know this
        # is not None.
        assert self.soup is not None

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = io.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = io.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def close(self) -> None:
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(
        self,
        tag: str | bytes,
        attrib: Dict[str | bytes, str | bytes],
        nsmap: _NamespaceMapping = {},
    ) -> None:
        # This is called by lxml code as a result of calling
        # BeautifulSoup.feed(), and we know self.soup is set by the time
        # feed() is called.
        assert self.soup is not None
        assert isinstance(tag, str)

        # We need to recreate the attribute dict for three
        # reasons. First, for type checking, so we can assert there
        # are no bytestrings in the keys or values. Second, because we
        # need a mutable dict--lxml might send us an immutable
        # dictproxy. Third, so we can handle namespaced attribute
        # names by converting the keys to NamespacedAttributes.
        new_attrib: Dict[Union[str, NamespacedAttribute], str] = (
            self.attribute_dict_class()
        )
        for k, v in attrib.items():
            assert isinstance(k, str)
            assert isinstance(v, str)
            new_attrib[k] = v

        nsprefix: Optional[_NamespacePrefix] = None
        namespace: Optional[_NamespaceURL] = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold
            # one, and it will be recognized as the default namespace by
            # soupsieve, which may be confusing in some situations.
            if "" in current_mapping:
                del current_mapping[""]
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
                )
                new_attrib[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        final_attrib: AttributeDict = self.attribute_dict_class()
        for attr, value in list(new_attrib.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                final_attrib[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                final_attrib[attr] = value

        namespace, tag = self._getNsTag(tag)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            tag,
            namespace,
            nsprefix,
            final_attrib,
            namespaces=self.active_namespace_prefixes[-1],
        )

    def _prefix_for_namespace(
        self, namespace: Optional[_NamespaceURL]
    ) -> Optional[_NamespacePrefix]:
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None
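    # Illustrative sketch (not part of the original module): self.nsmaps is a
    # stack of inverted mappings (URL -> prefix), with None entries marking
    # tags that introduced no new namespaces. Searching it in reverse finds
    # the innermost prefix currently bound to a URL. For example, given
    #
    #   self.nsmaps == [
    #       {"http://www.w3.org/XML/1998/namespace": "xml"},
    #       None,
    #       {"http://example.com/ns": "ex"},
    #   ]
    #
    # _prefix_for_namespace("http://example.com/ns") returns "ex".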
    def end(self, tag: str | bytes) -> None:
        assert self.soup is not None
        assert isinstance(tag, str)
        self.soup.endData()
        namespace, tag = self._getNsTag(tag)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(tag, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target: str, data: str) -> None:
        assert self.soup is not None
        self.soup.endData()
        data = target + " " + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, data: str | bytes) -> None:
        assert self.soup is not None
        assert isinstance(data, str)
        self.soup.handle_data(data)

    def doctype(self, name: str, pubid: str, system: str) -> None:
        assert self.soup is not None
        self.soup.endData()
        doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
        self.soup.handle_data(doctype_string)
        self.soup.endData(containerClass=Doctype)

    def comment(self, text: str | bytes) -> None:
        "Handle comments as Comment objects."
        assert self.soup is not None
        assert isinstance(text, str)
        self.soup.endData()
        self.soup.handle_data(text)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
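# Illustrative usage sketch (not part of the original module): this builder
# is normally selected through the BeautifulSoup constructor rather than
# instantiated directly.
#
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup(
#       b"<doc xmlns:x='http://example.com/'><x:a/></doc>",
#       "xml",  # "xml" and "lxml-xml" both select LXMLTreeBuilderForXML
#   )
#   # soup._namespaces now maps "x" to "http://example.com/" in addition to
#   # the standard "xml" entry registered by initialize_soup.
#
# Passing huge_tree=True to the constructor relaxes lxml's security limits
# for very deep trees and very long text content, as documented on the
# huge_tree attribute above.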
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    NAME: str = LXML
    ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]

    features: Iterable[str] = list(ALTERNATE_NAMES) + [NAME, HTML, FAST, PERMISSIVE]
    is_xml: bool = False

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        return etree.HTMLParser

    def feed(self, markup: _RawMarkup) -> None:
        # We know self.soup is set by the time feed() is called.
        assert self.soup is not None
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return "<html><body>%s</body></html>" % fragment