from __future__ import annotations

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

from collections import defaultdict
import re
from types import ModuleType
from typing import (
    Any,
    cast,
    Dict,
    Iterable,
    List,
    Optional,
    Pattern,
    Set,
    Tuple,
    Type,
    TYPE_CHECKING,
)
import warnings
import sys
from bs4.element import (
    AttributeDict,
    AttributeValueList,
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    RubyParenthesisString,
    RubyTextString,
    Stylesheet,
    Script,
    TemplateString,
    nonwhitespace_re,
)

# Exceptions were moved to their own module in 4.13. Import here for
# backwards compatibility.
from bs4.exceptions import ParserRejectedMarkup

from bs4._typing import (
    _AttributeValues,
    _RawAttributeValue,
)

from bs4._warnings import XMLParsedAsHTMLWarning

if TYPE_CHECKING:
    from bs4 import BeautifulSoup
    from bs4.element import (
        NavigableString,
        Tag,
    )
    from bs4._typing import (
        _AttributeValue,
        _Encoding,
        _Encodings,
        _RawOrProcessedAttributeValues,
        _RawMarkup,
    )

__all__ = [
    "DetectsXMLParsedAsHTML",
    "HTMLTreeBuilder",
    "SAXTreeBuilder",
    "TreeBuilder",
    "TreeBuilderRegistry",
    "ParserRejectedMarkup",  # backwards compatibility only as of 4.13.0
]

# Some useful features for a TreeBuilder to have.
FAST = "fast"
PERMISSIVE = "permissive"
STRICT = "strict"
XML = "xml"
HTML = "html"
HTML_5 = "html5"


class TreeBuilderRegistry(object):
    """A way of looking up TreeBuilder subclasses by their name or by desired
    features.
    """

    builders_for_feature: Dict[str, List[Type[TreeBuilder]]]
    builders: List[Type[TreeBuilder]]

    def __init__(self) -> None:
        self.builders_for_feature = defaultdict(list)
        self.builders = []

    def register(self, treebuilder_class: type[TreeBuilder]) -> None:
        """Register a treebuilder based on its advertised features.

        :param treebuilder_class: A subclass of `TreeBuilder`. Its
            `TreeBuilder.features` attribute should list its features.
        """
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features: str) -> Optional[Type[TreeBuilder]]:
        """Look up a TreeBuilder subclass with the desired features.

        :param features: A list of features to look for. If none are
            provided, the most recently registered TreeBuilder subclass
            will be used.
        :return: A TreeBuilder subclass, or None if there's no
            registered subclass with all the requested features.
        """
        if len(self.builders) == 0:
            # There are no builders at all.
            return None

        if len(features) == 0:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        feature_list = list(features)
        feature_list.reverse()
        candidates = None
        candidate_set = None
        while len(feature_list) > 0:
            feature = feature_list.pop()
            we_have_the_feature = self.builders_for_feature.get(feature, [])
            if len(we_have_the_feature) > 0:
                if candidates is None:
                    candidates = we_have_the_feature
                    candidate_set = set(candidates)
                else:
                    # Eliminate any candidates that don't have this feature.
                    candidate_set = candidate_set.intersection(set(we_have_the_feature))

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None or candidates is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None


#: The `BeautifulSoup` constructor will take a list of features
#: and use it to look up `TreeBuilder` classes in this registry.
builder_registry: TreeBuilderRegistry = TreeBuilderRegistry()
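
# Illustrative sketch: the BeautifulSoup constructor uses this registry to
# turn a feature list like ["html", "fast"] into a concrete builder class.
# Which class comes back depends on which parsers are installed, so the
# results suggested below are assumptions rather than guarantees.
#
#   >>> from bs4.builder import builder_registry
#   >>> builder_registry.lookup("html")           # most recently registered HTML builder
#   >>> builder_registry.lookup("html", "fast")   # e.g. an lxml-based builder, if available
#   >>> builder_registry.lookup("no-such-feature") is None
#   True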


class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree.

    This is an abstract superclass which smooths out the behavior of
    different parser libraries into a single, unified interface.

    :param multi_valued_attributes: If this is set to None, the
        TreeBuilder will not turn any values for attributes like
        'class' into lists. Setting this to a dictionary will
        customize this behavior; look at :py:attr:`bs4.builder.HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`
        for an example.

        Internally, these are called "CDATA list attributes", but that
        probably doesn't make sense to an end-user, so the argument name
        is ``multi_valued_attributes``.

    :param preserve_whitespace_tags: A set of tags to treat
        the way <pre> tags are treated in HTML. Tags in this set
        are immune from pretty-printing; their contents will always be
        output as-is.

    :param empty_element_tags: A set of tags to treat as empty-element
        tags: tags with no contents that can be rendered as
        self-closing, e.g. "<br/>". If this is None, a tag is treated
        as an empty-element tag whenever it has no contents.

    :param string_containers: A dictionary mapping tag names to
        the classes that should be instantiated to contain the textual
        contents of those tags. The default is to use NavigableString
        for every tag, no matter what the name. You can override the
        default by changing :py:attr:`DEFAULT_STRING_CONTAINERS`.

    :param store_line_numbers: If the parser keeps track of the line
        numbers and positions of the original markup, that information
        will, by default, be stored in each corresponding
        :py:class:`bs4.element.Tag` object. You can turn this off by
        passing store_line_numbers=False; then Tag.sourcepos and
        Tag.sourceline will always be None. If the parser you're using
        doesn't keep track of this information, then store_line_numbers
        is irrelevant.

    :param attribute_dict_class: A tag's attributes will be stored in
        an instance of this class. The default is Beautiful Soup's
        built-in `AttributeDict`, a dict subclass, and you will
        probably never need to change it.

    :param attribute_value_list_class: The value of a multi-valued
        attribute (such as HTML's 'class') will be stored in an
        instance of this class. The default is Beautiful Soup's
        built-in `AttributeValueList`, which is a normal Python list,
        and you will probably never need to change it.
    """
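
    # Illustrative sketch: a TreeBuilder is rarely constructed by hand.
    # The BeautifulSoup constructor forwards extra keyword arguments to
    # the builder it instantiates, so the options above are usually
    # supplied like this (the parser name "html.parser" is just an
    # example):
    #
    #   >>> from bs4 import BeautifulSoup
    #   >>> soup = BeautifulSoup('<a class="foo bar">x</a>', "html.parser",
    #   ...                      multi_valued_attributes=None)
    #   >>> soup.a["class"]     # left as a single string, not split into a list
    #   'foo bar'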

    USE_DEFAULT: Any = object()  #: :meta private:

    def __init__(
        self,
        multi_valued_attributes: Dict[str, Set[str]] = USE_DEFAULT,
        preserve_whitespace_tags: Set[str] = USE_DEFAULT,
        store_line_numbers: bool = USE_DEFAULT,
        string_containers: Dict[str, Type[NavigableString]] = USE_DEFAULT,
        empty_element_tags: Set[str] = USE_DEFAULT,
        attribute_dict_class: Type[AttributeDict] = AttributeDict,
        attribute_value_list_class: Type[AttributeValueList] = AttributeValueList,
    ):
        self.soup = None
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes
        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags
        if empty_element_tags is self.USE_DEFAULT:
            self.empty_element_tags = self.DEFAULT_EMPTY_ELEMENT_TAGS
        else:
            self.empty_element_tags = empty_element_tags
        # TODO: store_line_numbers is probably irrelevant now that
        # the behavior of sourceline and sourcepos has been made consistent
        # everywhere.
        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers
        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers
        self.attribute_dict_class = attribute_dict_class
        self.attribute_value_list_class = attribute_value_list_class

    NAME: str = "[Unknown tree builder]"
    ALTERNATE_NAMES: Iterable[str] = []
    features: Iterable[str] = []

    is_xml: bool = False
    picklable: bool = False

    soup: Optional[BeautifulSoup]  #: :meta private:

    #: A tag will be considered an empty-element
    #: tag when and only when it has no contents.
    empty_element_tags: Optional[Set[str]] = None  #: :meta private:
    cdata_list_attributes: Dict[str, Set[str]]  #: :meta private:
    preserve_whitespace_tags: Set[str]  #: :meta private:
    string_containers: Dict[str, Type[NavigableString]]  #: :meta private:
    store_line_numbers: bool  #: :meta private:

    #: A value for these tag/attribute combinations is a space- or
    #: comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = defaultdict(set)

    #: Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set()

    #: The textual contents of tags with these names should be
    #: instantiated with some class other than `bs4.element.NavigableString`.
    DEFAULT_STRING_CONTAINERS: Dict[str, Type[NavigableString]] = {}

    #: By default, tags are treated as empty-element tags if they have
    #: no contents--that is, using XML rules. HTMLTreeBuilder
    #: defines a different set of DEFAULT_EMPTY_ELEMENT_TAGS based on the
    #: HTML 4 and HTML5 standards.
    DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = None

    #: Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS: bool = False

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self) -> None:
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name: str) -> bool:
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLTreeBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLTreeBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags
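
    # Illustrative sketch: how can_be_empty_element() shows up in output
    # (the parser name "html.parser" is just an example):
    #
    #   >>> from bs4 import BeautifulSoup
    #   >>> str(BeautifulSoup("<p></p><br>", "html.parser"))
    #   '<p></p><br/>'
    #
    # <p> is not an empty-element tag for HTML builders, so the empty tag
    # keeps its closing tag; <br> is, so it is rendered as self-closing.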

    def feed(self, markup: _RawMarkup) -> None:
        """Run incoming markup through some parsing process."""
        raise NotImplementedError()

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: The markup that's about to be parsed.
        :param user_specified_encoding: The user asked to try this encoding
            to convert the markup into a Unicode string.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding. NOTE: This argument is not used by the
            calling code and can probably be removed.
        :param exclude_encodings: The user asked *not* to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

            Each 4-tuple represents a strategy that the parser can try
            to convert the document to Unicode and parse it. Each
            strategy will be tried in turn.

            By default, the only strategy is to parse the markup
            as-is. See `LXMLTreeBuilderForXML` and
            `HTMLParserTreeBuilder` for implementations that take into
            account the quirks of particular parsers.

        :meta private:
        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment: str) -> str:
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of unit tests.

        :param fragment: A fragment of HTML.
        :return: A full HTML document.
        :meta private:
        """
        return fragment

    def set_up_substitutions(self, tag: Tag) -> bool:
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :return: Whether or not a substitution was performed.
        :meta private:
        """
        return False

    def _replace_cdata_list_attribute_values(
        self, tag_name: str, attrs: _RawOrProcessedAttributeValues
    ) -> _AttributeValues:
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
            Any appropriate attribute values will be modified in place.
        :return: The modified dictionary that was originally passed in.
        """

        # First, cast the attrs dict to _AttributeValues. This might
        # not be accurate yet, but it will be by the time this method
        # returns.
        modified_attrs = cast(_AttributeValues, attrs)
        if not modified_attrs or not self.cdata_list_attributes:
            # Nothing to do.
            return modified_attrs

        # There is at least a possibility that we need to modify one of
        # the attribute values.
        universal: Set[str] = self.cdata_list_attributes.get("*", set())
        tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None)
        for attr in list(modified_attrs.keys()):
            modified_value: _AttributeValue
            if attr in universal or (tag_specific and attr in tag_specific):
                # We have a "class"-type attribute whose string
                # value is a whitespace-separated list of
                # values. Split it into a list.
                original_value: _AttributeValue = modified_attrs[attr]
                if isinstance(original_value, _RawAttributeValue):
                    # This is a _RawAttributeValue (a string) that
                    # needs to be split and converted to an
                    # AttributeValueList so it can be an
                    # _AttributeValue.
                    modified_value = self.attribute_value_list_class(
                        nonwhitespace_re.findall(original_value)
                    )
                else:
                    # html5lib calls setAttributes twice for the
                    # same tag when rearranging the parse tree. On
                    # the second call the attribute value here is
                    # already a list. This can also happen when a
                    # Tag object is cloned. If this happens, leave
                    # the value alone rather than trying to split
                    # it again.
                    modified_value = original_value
                modified_attrs[attr] = modified_value
        return modified_attrs
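
    # Illustrative sketch of the effect of the method above, as seen
    # through the public API (the parser name "html.parser" is just an
    # example):
    #
    #   >>> from bs4 import BeautifulSoup
    #   >>> soup = BeautifulSoup('<a class="foo bar" href="/x">link</a>',
    #   ...                      "html.parser")
    #   >>> soup.a["class"]     # 'class' is a multi-valued attribute
    #   ['foo', 'bar']
    #   >>> soup.a["href"]      # 'href' is not, so it stays a string
    #   '/x'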


class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    This is not currently used for anything, and it will be removed
    soon. It was a good idea, but it wasn't properly integrated into the
    rest of Beautiful Soup, so there have been long stretches where it
    hasn't worked properly.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        warnings.warn(
            "The SAXTreeBuilder class was deprecated in 4.13.0 and will be removed soon thereafter. It is completely untested and probably doesn't work; do not use it.",
            DeprecationWarning,
            stacklevel=2,
        )
        super(SAXTreeBuilder, self).__init__(*args, **kwargs)

    def feed(self, markup: _RawMarkup) -> None:
        raise NotImplementedError()

    def close(self) -> None:
        pass

    def startElement(self, name: str, attrs: Dict[str, str]) -> None:
        attrs = AttributeDict((key[1], value) for key, value in list(attrs.items()))
        # print("Start %s, %r" % (name, attrs))
        assert self.soup is not None
        self.soup.handle_starttag(name, None, None, attrs)

    def endElement(self, name: str) -> None:
        # print("End %s" % name)
        assert self.soup is not None
        self.soup.handle_endtag(name)

    def startElementNS(
        self, nsTuple: Tuple[str, str], nodeName: str, attrs: Dict[str, str]
    ) -> None:
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple: Tuple[str, str], nodeName: str) -> None:
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)
        # handler.endElementNS((ns, node.nodeName), node.nodeName)

    def startPrefixMapping(self, prefix: str, nodeValue: str) -> None:
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix: str) -> None:
        # Ignore the prefix for now.
        # handler.endPrefixMapping(prefix)
        pass

    def characters(self, content: str) -> None:
        assert self.soup is not None
        self.soup.handle_data(content)

    def startDocument(self) -> None:
        pass

    def endDocument(self) -> None:
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML, such as which tags are treated
    specially by the HTML standard.
    """

    #: Some HTML tags are defined as having no contents. Beautiful Soup
    #: treats these specially.
    DEFAULT_EMPTY_ELEMENT_TAGS: Set[str] = set(
        [
            # These are from HTML5.
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "keygen",
            "link",
            "menuitem",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
            # These are from earlier versions of HTML and are removed in HTML5.
            "basefont",
            "bgsound",
            "command",
            "frame",
            "image",
            "isindex",
            "nextid",
            "spacer",
        ]
    )

    #: The HTML standard defines these tags as block-level elements. Beautiful
    #: Soup does not treat these elements differently from other elements,
    #: but it may do so eventually, and this information is available if
    #: you need to use it.
    DEFAULT_BLOCK_ELEMENTS: Set[str] = set(
        [
            "address",
            "article",
            "aside",
            "blockquote",
            "canvas",
            "dd",
            "div",
            "dl",
            "dt",
            "fieldset",
            "figcaption",
            "figure",
            "footer",
            "form",
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "header",
            "hr",
            "li",
            "main",
            "nav",
            "noscript",
            "ol",
            "output",
            "p",
            "pre",
            "section",
            "table",
            "tfoot",
            "ul",
            "video",
        ]
    )

    #: These HTML tags need special treatment so they can be
    #: represented by a string class other than `bs4.element.NavigableString`.
    #:
    #: For some of these tags, it's because the HTML standard defines
    #: an unusual content model for them. I made this list by going
    #: through the HTML spec
    #: (https://html.spec.whatwg.org/#metadata-content) and looking for
    #: "metadata content" elements that can contain strings.
    #:
    #: The Ruby tags (<rt> and <rp>) are here despite being normal
    #: "phrasing content" tags, because the content they contain is
    #: qualitatively different from other text in the document, and it
    #: can be useful to be able to distinguish it.
    #:
    #: TODO: Arguably <noscript> could go here but it seems
    #: qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS: Dict[str, Type[NavigableString]] = {
        "rt": RubyTextString,
        "rp": RubyParenthesisString,
        "style": Stylesheet,
        "script": Script,
        "template": TemplateString,
    }
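
    # Illustrative sketch: with these defaults, the text inside a <style>
    # tag is a Stylesheet object rather than a plain NavigableString
    # (the parser name "html.parser" is just an example):
    #
    #   >>> from bs4 import BeautifulSoup
    #   >>> soup = BeautifulSoup("<style>p {color: red}</style>", "html.parser")
    #   >>> type(soup.style.string).__name__
    #   'Stylesheet'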

    #: The HTML standard defines these attributes as containing a
    #: space-separated list of values, not a single value. That is,
    #: class="foo bar" means that the 'class' attribute has two values,
    #: 'foo' and 'bar', not the single value 'foo bar'. When we
    #: encounter one of these attributes, we will parse its value into
    #: a list of values if possible. Upon output, the list will be
    #: converted back into a string.
    DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = {
        "*": {"class", "accesskey", "dropzone"},
        "a": {"rel", "rev"},
        "link": {"rel", "rev"},
        "td": {"headers"},
        "th": {"headers"},
        "form": {"accept-charset"},
        "object": {"archive"},
        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": {"rel"},
        "icon": {"sizes"},
        "iframe": {"sandbox"},
        "output": {"for"},
    }

    #: By default, whitespace inside these HTML tags will be
    #: preserved rather than being collapsed.
    DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set(["pre", "textarea"])

    def set_up_substitutions(self, tag: Tag) -> bool:
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup in one encoding
        but be output in a different one, and the <meta> tag needs to
        be changed to reflect this.

        :return: Whether or not a substitution was performed.

        :meta private:
        """
        # We are only interested in <meta> tags.
        if tag.name != "meta":
            return False

        # TODO: This cast will fail in the (very unlikely) scenario
        # that the programmer who instantiates the TreeBuilder
        # specifies meta['content'] or meta['charset'] as
        # cdata_list_attributes.
        content: Optional[str] = cast(Optional[str], tag.get("content"))
        charset: Optional[str] = cast(Optional[str], tag.get("charset"))

        # But we can accommodate meta['http-equiv'] being made a
        # cdata_list_attribute (again, very unlikely) without much
        # trouble.
        http_equiv: List[str] = tag.get_attribute_list("http-equiv")

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        substituted = False
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag["charset"] = CharsetMetaAttributeValue(charset)
            substituted = True

        elif content is not None and any(
            x.lower() == "content-type" for x in http_equiv
        ):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag["content"] = ContentMetaAttributeValue(content)
            substituted = True

        return substituted
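
    # Illustrative sketch: because of the substitution above, re-encoding
    # a document rewrites the declared charset to match the new output
    # encoding (the parser name "html.parser" is just an example):
    #
    #   >>> from bs4 import BeautifulSoup
    #   >>> soup = BeautifulSoup('<meta charset="ISO-8859-1">', "html.parser")
    #   >>> soup.meta.encode("utf-8")
    #   b'<meta charset="utf-8"/>'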


class DetectsXMLParsedAsHTML(object):
    """A mixin class for any class (a TreeBuilder, or some class used by a
    TreeBuilder) that's in a position to detect whether an XML
    document is being incorrectly parsed as HTML, and issue an
    appropriate warning.

    This requires being able to observe an incoming processing
    instruction that might be an XML declaration, and also able to
    observe tags as they're opened. If you can't do that for a given
    `TreeBuilder`, there's a less reliable implementation based on
    examining the raw markup.
    """

    #: Regular expression for seeing if string markup has an <html> tag.
    LOOKS_LIKE_HTML: Pattern[str] = re.compile("<[^ +]html", re.I)

    #: Regular expression for seeing if byte markup has an <html> tag.
    LOOKS_LIKE_HTML_B: Pattern[bytes] = re.compile(b"<[^ +]html", re.I)

    #: The start of an XML document string.
    XML_PREFIX: str = "<?xml"

    #: The start of an XML document bytestring.
    XML_PREFIX_B: bytes = b"<?xml"

    # This is typed as str, not `ProcessingInstruction`, because this
    # check may be run before any Beautiful Soup objects are created.
    _first_processing_instruction: Optional[str]  #: :meta private:
    _root_tag_name: Optional[str]  #: :meta private:

    @classmethod
    def warn_if_markup_looks_like_xml(
        cls, markup: Optional[_RawMarkup], stacklevel: int = 3
    ) -> bool:
        """Perform a check on some markup to see if it looks like XML
        that's not XHTML. If so, issue a warning.

        This is much less reliable than doing the check while parsing,
        but some of the tree builders can't do that.

        :param stacklevel: The stacklevel of the code calling this
            function.

        :return: True if the markup looks like non-XHTML XML, False
            otherwise.
        """
        if markup is None:
            return False
        markup = markup[:500]
        if isinstance(markup, bytes):
            markup_b: bytes = markup
            looks_like_xml = markup_b.startswith(
                cls.XML_PREFIX_B
            ) and not cls.LOOKS_LIKE_HTML_B.search(markup)
        else:
            markup_s: str = markup
            looks_like_xml = markup_s.startswith(
                cls.XML_PREFIX
            ) and not cls.LOOKS_LIKE_HTML.search(markup)

        if looks_like_xml:
            cls._warn(stacklevel=stacklevel + 2)
            return True
        return False
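
    # Illustrative sketch of the markup-based check (issuing an
    # XMLParsedAsHTMLWarning is a side effect of the True case):
    #
    #   >>> DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
    #   ...     '<?xml version="1.0"?><catalog/>')
    #   True
    #   >>> DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
    #   ...     '<!DOCTYPE html><html></html>')
    #   False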

    @classmethod
    def _warn(cls, stacklevel: int = 5) -> None:
        """Issue a warning about XML being parsed as HTML."""
        warnings.warn(
            XMLParsedAsHTMLWarning.MESSAGE,
            XMLParsedAsHTMLWarning,
            stacklevel=stacklevel,
        )

    def _initialize_xml_detector(self) -> None:
        """Call this method before parsing a document."""
        self._first_processing_instruction = None
        self._root_tag_name = None

    def _document_might_be_xml(self, processing_instruction: str) -> None:
        """Call this method when encountering an XML declaration, or a
        "processing instruction" that might be an XML declaration.

        This helps Beautiful Soup detect potential issues later, if
        the document turns out to be a non-XHTML XML document that's
        being parsed as HTML.
        """
        if (
            self._first_processing_instruction is not None
            or self._root_tag_name is not None
        ):
            # The document has already started. Don't bother checking
            # anymore.
            return

        self._first_processing_instruction = processing_instruction

        # We won't know until we encounter the first tag whether or
        # not this is actually a problem.

    def _root_tag_encountered(self, name: str) -> None:
        """Call this when you encounter the document's root tag.

        This is where we actually check whether an XML document is
        being incorrectly parsed as HTML, and issue the warning.
        """
        if self._root_tag_name is not None:
            # This method was incorrectly called multiple times. Do
            # nothing.
            return

        self._root_tag_name = name

        if (
            name != "html"
            and self._first_processing_instruction is not None
            and self._first_processing_instruction.lower().startswith("xml ")
        ):
            # We encountered an XML declaration and then a tag other
            # than 'html'. This is a reliable indicator that a
            # non-XHTML document is being parsed as HTML.
            self._warn(stacklevel=10)


def register_treebuilders_from(module: ModuleType) -> None:
    """Copy TreeBuilders from the given module into this module."""
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        if issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)


# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser  # noqa: E402

register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib

    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml

    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
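

if __name__ == "__main__":
    # Illustrative sketch (not part of the library): list which builder the
    # registry would pick for each feature constant defined above. Exactly
    # which classes appear depends on whether lxml and html5lib are
    # installed, so no particular output is guaranteed.
    for feature in (HTML, HTML_5, XML, FAST, STRICT, PERMISSIVE):
        builder_class = builder_registry.lookup(feature)
        name = builder_class.NAME if builder_class is not None else None
        print(f"{feature!r}: {name!r}")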