Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/builder/_lxml.py: 28%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# encoding: utf-8
2from __future__ import annotations
4# Use of this source code is governed by the MIT license.
5__license__ = "MIT"
# Public API of this module: the two lxml-backed tree builders.
__all__ = [
    "LXMLTreeBuilderForXML",
    "LXMLTreeBuilder",
]
13from typing import (
14 Any,
15 Dict,
16 Iterable,
17 List,
18 Optional,
19 Set,
20 Tuple,
21 Type,
22 TYPE_CHECKING,
23 Union,
24)
26from io import BytesIO
27from io import StringIO
29from typing_extensions import TypeAlias
31from lxml import etree # type:ignore
32from bs4.element import (
33 AttributeDict,
34 XMLAttributeDict,
35 Comment,
36 Doctype,
37 NamespacedAttribute,
38 ProcessingInstruction,
39 XMLProcessingInstruction,
40)
41from bs4.builder import (
42 DetectsXMLParsedAsHTML,
43 FAST,
44 HTML,
45 HTMLTreeBuilder,
46 PERMISSIVE,
47 TreeBuilder,
48 XML,
49)
50from bs4.dammit import EncodingDetector
51from bs4.exceptions import ParserRejectedMarkup
53if TYPE_CHECKING:
54 from bs4._typing import (
55 _Encoding,
56 _Encodings,
57 _NamespacePrefix,
58 _NamespaceURL,
59 _NamespaceMapping,
60 _InvertedNamespaceMapping,
61 _RawMarkup,
62 )
63 from bs4 import BeautifulSoup
# Feature name shared by both lxml-based builders defined below.
LXML: str = "lxml"
68def _invert(d: dict[Any, Any]) -> dict[Any, Any]:
69 "Invert a dictionary."
70 return dict((v, k) for k, v in list(d.items()))
# An instantiated lxml parser of either flavor.
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
# Either a parser instance, or a parser class that parser_for() will
# instantiate with default arguments.
_ParserOrParserClass: TypeAlias = Union[
    _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
]
class LXMLTreeBuilderForXML(TreeBuilder):
    """A TreeBuilder that uses lxml's XML parser.

    This object acts as the *target* of an ``etree.XMLParser``: lxml
    calls ``start``/``end``/``data``/``comment``/``pi``/``doctype`` on
    it as parse events occur, and those handlers forward the events to
    the associated `BeautifulSoup` object.
    """

    # Parser class used when no parser (or parser factory) was passed
    # to the constructor.
    DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser

    is_xml: bool = True

    # Chosen in __init__: XMLProcessingInstruction when is_xml is True,
    # plain ProcessingInstruction otherwise.
    processing_instruction_class: Type[ProcessingInstruction]

    NAME: str = "lxml-xml"
    ALTERNATE_NAMES: Iterable[str] = ["xml"]

    # Well, it's permissive by XML parser standards.
    features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Markup is streamed into lxml in chunks of this size (see feed()).
    CHUNK_SIZE: int = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")

    DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)

    # Stack of inverted (URL -> prefix) namespace mappings. start()
    # pushes an entry per tag once non-default namespaces are in play
    # (None when the tag introduced no new mapping) and end() pops it.
    nsmaps: List[Optional[_InvertedNamespaceMapping]]
    empty_element_tags: Optional[Set[str]]
    # The lxml parser currently in use; created lazily in feed().
    parser: Any
    _default_parser: Optional[etree.XMLParser]

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        # Beyond this point, self.soup is set, so we can assume (and
        # assert) it's not None whenever necessary.
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping: Dict[str, str]) -> None:
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        assert self.soup is not None
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Find the default parser for the given encoding.

        :return: Either a parser object or a class, which
            will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return self.DEFAULT_PARSER_CLASS(target=self, recover=True, encoding=encoding)

    def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, recover=True, encoding=encoding)
        return parser

    def __init__(
        self,
        parser: Optional[etree.XMLParser] = None,
        empty_element_tags: Optional[Set[str]] = None,
        **kwargs: Any,
    ):
        """Constructor.

        :param parser: An lxml parser (or parser class) to use instead
            of DEFAULT_PARSER_CLASS.
        :param empty_element_tags: NOTE(review): accepted but never read
            in this body -- presumably handled elsewhere or vestigial;
            TODO confirm against the base TreeBuilder.
        :param kwargs: Passed through to TreeBuilder.__init__, with
            attribute_dict_class defaulted to XMLAttributeDict.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        # Parallel stack of *forward* (prefix -> URL) mappings; the top
        # entry is attached to every Tag created while it is in scope.
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        if self.is_xml:
            self.processing_instruction_class = XMLProcessingInstruction
        else:
            self.processing_instruction_class = ProcessingInstruction

        if "attribute_dict_class" not in kwargs:
            kwargs["attribute_dict_class"] = XMLAttributeDict
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
        # Split the namespace URL out of a fully-qualified lxml tag
        # name ("{url}local" -> (url, local)). Copied from lxml's
        # src/lxml/sax.py. Returns (None, tag) for unqualified names.
        if tag[0] == "{" and "}" in tag:
            namespace, name = tag[1:].split("}", 1)
            return (namespace, name)
        return (None, tag)

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[
        Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
    ]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

            Each 4-tuple represents a strategy for converting the
            document to Unicode and parsing it. Each strategy will be tried
            in turn.
        """
        if not self.is_xml:
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)

            # Since the document was Unicode in the first place, there
            # is no need to try any more strategies; we know this will
            # work.
            return

        known_definite_encodings: List[_Encoding] = []
        if user_specified_encoding:
            # This was provided by the end-user; treat it as a known
            # definite encoding per the algorithm laid out in the
            # HTML5 spec. (See the EncodingDetector class for
            # details.)
            known_definite_encodings.append(user_specified_encoding)

        user_encodings: List[_Encoding] = []
        if document_declared_encoding:
            # This was found in the document; treat it as a slightly
            # lower-priority user encoding.
            user_encodings.append(document_declared_encoding)

        detector = EncodingDetector(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=not self.is_xml,
            exclude_encodings=exclude_encodings,
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup: _RawMarkup) -> None:
        """Run incoming markup through the lxml parser, chunk by chunk.

        :param markup: A bytestring or Unicode string.
        :raises ParserRejectedMarkup: If lxml (or encoding lookup)
            rejects the markup.
        """
        io: Union[BytesIO, StringIO]
        # NOTE(review): markup that is neither bytes nor str would leave
        # `io` unbound and raise UnboundLocalError below -- presumably
        # callers guarantee one of the two; TODO confirm.
        if isinstance(markup, bytes):
            io = BytesIO(markup)
        elif isinstance(markup, str):
            io = StringIO(markup)

        # initialize_soup is called before feed, so we know this
        # is not None.
        assert self.soup is not None

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = io.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = io.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def close(self) -> None:
        # Called by lxml when parsing is finished; reset the namespace
        # stack to its default state for any subsequent parse.
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(
        self,
        tag: str | bytes,
        attrib: Dict[str | bytes, str | bytes],
        nsmap: _NamespaceMapping = {},
    ) -> None:
        """lxml event handler for an opening tag.

        :param tag: The (possibly namespace-qualified) tag name.
        :param attrib: The tag's attributes, as provided by lxml.
        :param nsmap: Namespace prefix->URL mappings newly introduced by
            this tag. (The {} default is safe: nsmap is only read, never
            mutated, in this method.)
        """
        # This is called by lxml code as a result of calling
        # BeautifulSoup.feed(), and we know self.soup is set by the time feed()
        # is called.
        assert self.soup is not None
        assert isinstance(tag, str)

        # We need to recreate the attribute dict for three
        # reasons. First, for type checking, so we can assert there
        # are no bytestrings in the keys or values. Second, because we
        # need a mutable dict--lxml might send us an immutable
        # dictproxy. Third, so we can handle namespaced attribute
        # names by converting the keys to NamespacedAttributes.
        new_attrib: Dict[Union[str, NamespacedAttribute], str] = (
            self.attribute_dict_class()
        )
        for k, v in attrib.items():
            assert isinstance(k, str)
            assert isinstance(v, str)
            new_attrib[k] = v

        nsprefix: Optional[_NamespacePrefix] = None
        namespace: Optional[_NamespaceURL] = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if "" in current_mapping:
                del current_mapping[""]
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
                )
                new_attrib[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        final_attrib: AttributeDict = self.attribute_dict_class()
        for attr, value in list(new_attrib.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                final_attrib[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                final_attrib[attr] = value

        namespace, tag = self._getNsTag(tag)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            tag,
            namespace,
            nsprefix,
            final_attrib,
            namespaces=self.active_namespace_prefixes[-1],
        )

    def _prefix_for_namespace(
        self, namespace: Optional[_NamespaceURL]
    ) -> Optional[_NamespacePrefix]:
        """Find the currently active prefix for the given namespace.

        Searches the namespace stack innermost-scope-first; returns None
        if the namespace is None or no active mapping mentions it.
        """
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, tag: str | bytes) -> None:
        """lxml event handler for a closing tag.

        Forwards the event to the BeautifulSoup object and pops this
        tag's entry off the namespace stacks pushed by start().
        """
        assert self.soup is not None
        assert isinstance(tag, str)
        self.soup.endData()
        namespace, tag = self._getNsTag(tag)
        nsprefix = None
        if namespace is not None:
            # Resolve the prefix the same way _prefix_for_namespace
            # does: innermost scope first.
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(tag, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target: str, data: str) -> None:
        """lxml event handler for a processing instruction.

        Stores "<target> <data>" as a processing-instruction node.
        """
        assert self.soup is not None
        self.soup.endData()
        data = target + " " + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, data: str | bytes) -> None:
        """lxml event handler for character data."""
        assert self.soup is not None
        assert isinstance(data, str)
        self.soup.handle_data(data)

    def doctype(self, name: str, pubid: str, system: str) -> None:
        """lxml event handler for a DOCTYPE declaration."""
        assert self.soup is not None
        self.soup.endData()
        doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
        self.soup.handle_data(doctype_string)
        self.soup.endData(containerClass=Doctype)

    def comment(self, text: str | bytes) -> None:
        """Handle comments as Comment objects."""
        assert self.soup is not None
        assert isinstance(text, str)
        self.soup.endData()
        self.soup.handle_data(text)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """The HTML flavor of the lxml tree builder.

    Inherits all the event-handler machinery from LXMLTreeBuilderForXML
    but drives lxml's HTML parser instead, and feeds the markup in a
    single call rather than in chunks.
    """

    NAME: str = LXML
    ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]

    features: Iterable[str] = list(ALTERNATE_NAMES) + [NAME, HTML, FAST, PERMISSIVE]
    is_xml: bool = False

    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
        """Always use lxml's HTML parser class; parser_for() will
        instantiate it with default arguments."""
        return etree.HTMLParser

    def feed(self, markup: _RawMarkup) -> None:
        """Run the markup through lxml's HTML parser in one shot.

        :raises ParserRejectedMarkup: If lxml (or encoding lookup)
            rejects the markup.
        """
        # initialize_soup() runs before feed(), so self.soup is set.
        assert self.soup is not None
        try:
            parser = self.parser_for(self.soup.original_encoding)
            self.parser = parser
            parser.feed(markup)
            parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return "<html><body>%s</body></html>" % fragment