Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/__init__.py: 18%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
3http://www.crummy.com/software/BeautifulSoup/
5Beautiful Soup uses a pluggable XML or HTML parser to parse a
6(possibly invalid) document into a tree representation. Beautiful Soup
7provides methods and Pythonic idioms that make it easy to navigate,
8search, and modify the parse tree.
10Beautiful Soup works with Python 3.7 and up. It works better if lxml
11and/or html5lib is installed, but they are not required.
13For more than you ever wanted to know about Beautiful Soup, see the
14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
15"""
17__author__ = "Leonard Richardson (leonardr@segfault.org)"
18__version__ = "4.13.5"
19__copyright__ = "Copyright (c) 2004-2025 Leonard Richardson"
20# Use of this source code is governed by the MIT license.
21__license__ = "MIT"
# Names exported by ``from bs4 import *``. Each name is listed exactly once;
# the warning classes live in the "Warnings" group at the end.
__all__ = [
    "BeautifulSoup",
    "Comment",
    "Declaration",
    "ProcessingInstruction",
    "ResultSet",
    "CSS",
    "Script",
    "Stylesheet",
    "Tag",
    "TemplateString",
    "ElementFilter",
    "UnicodeDammit",
    "CData",
    "Doctype",

    # Exceptions
    "FeatureNotFound",
    "ParserRejectedMarkup",
    "StopParsing",

    # Warnings
    "AttributeResemblesVariableWarning",
    "GuessedAtParserWarning",
    "MarkupResemblesLocatorWarning",
    "UnusualUsageWarning",
    "XMLParsedAsHTMLWarning",
]
53from collections import Counter
54import io
55import sys
56import warnings
58# The very first thing we do is give a useful error if someone is
59# running this code under Python 2.
# Refuse to be imported by a Python 2 interpreter, and tell the user which
# release was the last one to support it.
if sys.version_info < (3,):
    raise ImportError(
        "You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3."
    )
65from .builder import (
66 builder_registry,
67 TreeBuilder,
68)
69from .builder._htmlparser import HTMLParserTreeBuilder
70from .dammit import UnicodeDammit
71from .css import CSS
72from ._deprecation import (
73 _deprecated,
74)
75from .element import (
76 CData,
77 Comment,
78 DEFAULT_OUTPUT_ENCODING,
79 Declaration,
80 Doctype,
81 NavigableString,
82 PageElement,
83 ProcessingInstruction,
84 PYTHON_SPECIFIC_ENCODINGS,
85 ResultSet,
86 Script,
87 Stylesheet,
88 Tag,
89 TemplateString,
90)
91from .formatter import Formatter
92from .filter import (
93 ElementFilter,
94 SoupStrainer,
95)
96from typing import (
97 Any,
98 cast,
99 Counter as CounterType,
100 Dict,
101 Iterator,
102 List,
103 Sequence,
104 Sized,
105 Optional,
106 Type,
107 Union,
108)
110from bs4._typing import (
111 _Encoding,
112 _Encodings,
113 _IncomingMarkup,
114 _InsertableElement,
115 _RawAttributeValue,
116 _RawAttributeValues,
117 _RawMarkup,
118)
120# Import all warnings and exceptions into the main package.
121from bs4.exceptions import (
122 FeatureNotFound,
123 ParserRejectedMarkup,
124 StopParsing,
125)
126from bs4._warnings import (
127 AttributeResemblesVariableWarning,
128 GuessedAtParserWarning,
129 MarkupResemblesLocatorWarning,
130 UnusualUsageWarning,
131 XMLParsedAsHTMLWarning,
132)
135class BeautifulSoup(Tag):
136 """A data structure representing a parsed HTML or XML document.
138 Most of the methods you'll call on a BeautifulSoup object are inherited from
139 PageElement or Tag.
141 Internally, this class defines the basic interface called by the
142 tree builders when converting an HTML/XML document into a data
143 structure. The interface abstracts away the differences between
144 parsers. To write a new tree builder, you'll need to understand
145 these methods as a whole.
147 These methods will be called by the BeautifulSoup constructor:
148 * reset()
149 * feed(markup)
151 The tree builder may call these methods from its feed() implementation:
152 * handle_starttag(name, attrs) # See note about return value
153 * handle_endtag(name)
154 * handle_data(data) # Appends to the current data node
155 * endData(containerClass) # Ends the current data node
157 No matter how complicated the underlying parser is, you should be
158 able to build a tree using 'start tag' events, 'end tag' events,
159 'data' events, and "done with data" events.
161 If you encounter an empty-element tag (aka a self-closing tag,
162 like HTML's <br> tag), call handle_starttag and then
163 handle_endtag.
164 """
166 #: Since `BeautifulSoup` subclasses `Tag`, it's possible to treat it as
167 #: a `Tag` with a `Tag.name`. However, this name makes it clear the
168 #: `BeautifulSoup` object isn't a real markup tag.
169 ROOT_TAG_NAME: str = "[document]"
171 #: If the end-user gives no indication which tree builder they
172 #: want, look for one with these features.
173 DEFAULT_BUILDER_FEATURES: Sequence[str] = ["html", "fast"]
175 #: A string containing all ASCII whitespace characters, used
176 #: during parsing to detect data chunks that seem 'empty'.
177 ASCII_SPACES: str = "\x20\x0a\x09\x0c\x0d"
179 # FUTURE PYTHON:
180 element_classes: Dict[Type[PageElement], Type[PageElement]] #: :meta private:
181 builder: TreeBuilder #: :meta private:
182 is_xml: bool
183 known_xml: Optional[bool]
184 parse_only: Optional[SoupStrainer] #: :meta private:
186 # These members are only used while parsing markup.
187 markup: Optional[_RawMarkup] #: :meta private:
188 current_data: List[str] #: :meta private:
189 currentTag: Optional[Tag] #: :meta private:
190 tagStack: List[Tag] #: :meta private:
191 open_tag_counter: CounterType[str] #: :meta private:
192 preserve_whitespace_tag_stack: List[Tag] #: :meta private:
193 string_container_stack: List[Tag] #: :meta private:
194 _most_recent_element: Optional[PageElement] #: :meta private:
196 #: Beautiful Soup's best guess as to the character encoding of the
197 #: original document.
198 original_encoding: Optional[_Encoding]
200 #: The character encoding, if any, that was explicitly defined
201 #: in the original document. This may or may not match
202 #: `BeautifulSoup.original_encoding`.
203 declared_html_encoding: Optional[_Encoding]
205 #: This is True if the markup that was parsed contains
206 #: U+FFFD REPLACEMENT_CHARACTER characters which were not present
207 #: in the original markup. These mark character sequences that
208 #: could not be represented in Unicode.
209 contains_replacement_characters: bool
def __init__(
    self,
    markup: _IncomingMarkup = "",
    features: Optional[Union[str, Sequence[str]]] = None,
    builder: Optional[Union[TreeBuilder, Type[TreeBuilder]]] = None,
    parse_only: Optional[SoupStrainer] = None,
    from_encoding: Optional[_Encoding] = None,
    exclude_encodings: Optional[_Encodings] = None,
    element_classes: Optional[Dict[Type[PageElement], Type[PageElement]]] = None,
    **kwargs: Any,
):
    """Pick a tree builder and use it to parse `markup` into this object.

    :param markup: The markup to parse: a string, a bytestring, or
     an object with a ``read()`` method.

    :param features: The name of a specific parser ("lxml",
     "lxml-xml", "html.parser", or "html5lib") or a kind of markup
     ("html", "html5", "xml"). Naming a specific parser is
     recommended, since it gives the same results across
     platforms and virtual environments.

    :param builder: A TreeBuilder subclass to instantiate, or a
     TreeBuilder instance to use as-is, instead of looking one up
     through `features`. Only needed for custom TreeBuilders.

    :param parse_only: A SoupStrainer. Only the parts of the
     document that match it are considered.

    :param from_encoding: The encoding of the document, for when
     Beautiful Soup guesses it wrongly. Ignored (with a warning)
     when `markup` is already a Unicode string.

    :param exclude_encodings: Encodings known to be wrong for this
     document.

    :param element_classes: A mapping from Beautiful Soup classes
     such as Tag and NavigableString to the classes that should be
     instantiated in their place while the tree is built.

    :param kwargs: Certain Beautiful Soup 3 keyword arguments are
     accepted for backwards compatibility; they produce a warning
     and are otherwise ignored. Every other keyword argument is
     handed to the TreeBuilder constructor, unless a TreeBuilder
     instance was passed in as `builder`.

    :raise FeatureNotFound: If no registered tree builder offers
     the requested features.
    :raise TypeError: If `markup` is not a string, a bytestring, a
     sized object or a readable object.
    :raise ParserRejectedMarkup: If the parser rejected every
     version of the markup that the builder offered.
    """
    # Beautiful Soup 3 arguments that no longer do anything: drop
    # each one that was passed and say why.
    obsolete_arguments = (
        (
            "convertEntities",
            "BS4 does not respect the convertEntities argument to the "
            "BeautifulSoup constructor. Entities are always converted "
            "to Unicode characters.",
        ),
        (
            "markupMassage",
            "BS4 does not respect the markupMassage argument to the "
            "BeautifulSoup constructor. The tree builder is responsible "
            "for any necessary markup massage.",
        ),
        (
            "smartQuotesTo",
            "BS4 does not respect the smartQuotesTo argument to the "
            "BeautifulSoup constructor. Smart quotes are always converted "
            "to Unicode characters.",
        ),
        (
            "selfClosingTags",
            "Beautiful Soup 4 does not respect the selfClosingTags argument to the "
            "BeautifulSoup constructor. The tree builder is responsible "
            "for understanding self-closing tags.",
        ),
        (
            "isHTML",
            "Beautiful Soup 4 does not respect the isHTML argument to the "
            "BeautifulSoup constructor. Suggest you use "
            "features='lxml' for HTML and features='lxml-xml' for "
            "XML.",
        ),
    )
    for obsolete_name, explanation in obsolete_arguments:
        if obsolete_name in kwargs:
            del kwargs[obsolete_name]
            warnings.warn(explanation)

    def deprecated_argument(old_name: str, new_name: str) -> Optional[Any]:
        # Pull a renamed Beautiful Soup 3 argument out of kwargs,
        # warning about the rename. The stacklevel is counted from
        # inside this helper, so it must stay a nested function.
        if old_name not in kwargs:
            return None
        warnings.warn(
            'The "%s" argument to the BeautifulSoup constructor '
            'was renamed to "%s" in Beautiful Soup 4.0.0'
            % (old_name, new_name),
            DeprecationWarning,
            stacklevel=3,
        )
        return kwargs.pop(old_name)

    parse_only = parse_only or deprecated_argument("parseOnlyThese", "parse_only")
    # Warn up front when the strainer can be seen to reject the
    # whole tree.
    if parse_only is not None and parse_only.excludes_everything:
        warnings.warn(
            f"The given value for parse_only will exclude everything: {parse_only}",
            UserWarning,
            stacklevel=3,
        )

    from_encoding = from_encoding or deprecated_argument(
        "fromEncoding", "from_encoding"
    )

    if from_encoding and isinstance(markup, str):
        warnings.warn(
            "You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored."
        )
        from_encoding = None

    self.element_classes = element_classes or {}

    # Remember what the caller actually asked for. This decides
    # below whether the 'you need to specify a parser' warning can
    # be omitted.
    original_builder = builder
    original_features = features

    builder_class: Optional[Type[TreeBuilder]] = None
    if isinstance(builder, type):
        # A builder class rather than an instance: instantiate it below.
        builder_class = builder
        builder = None
    elif builder is None:
        if isinstance(features, str):
            features = [features]
        if features is None or len(features) == 0:
            features = self.DEFAULT_BUILDER_FEATURES
        found_class = builder_registry.lookup(*features)
        if found_class is None:
            raise FeatureNotFound(
                "Couldn't find a tree builder with the features you "
                "requested: %s. Do you need to install a parser library?"
                % ",".join(features)
            )
        builder_class = found_class

    # Either `builder` is a TreeBuilder instance by now, or
    # `builder_class` can be instantiated with the remaining kwargs.
    if builder is None:
        assert builder_class is not None
        builder = builder_class(**kwargs)

        # The parser was guessed when no builder was given and
        # `features` did not name this builder.
        parser_was_guessed = not original_builder
        if parser_was_guessed:
            parser_was_guessed = not (
                original_features == builder.NAME
                or (
                    isinstance(original_features, str)
                    and original_features in builder.ALTERNATE_NAMES
                )
            )
        if parser_was_guessed and markup:
            markup_type = "XML" if builder.is_xml else "HTML"

            # Adapted from warnings.py so that we report the same
            # line of code as our warnings.warn() call gets, even
            # if the answer is wrong (as it may be in a
            # multithreading situation).
            try:
                caller = sys._getframe(1)
            except ValueError:
                caller_globals = sys.__dict__
                line_number = 1
            else:
                caller_globals = caller.f_globals
                line_number = caller.f_lineno

            filename = caller_globals.get("__file__")
            if filename:
                # Report the source file rather than a compiled one.
                if filename.lower().endswith((".pyc", ".pyo")):
                    filename = filename[:-1]
                # With no filename at all the user is most likely in
                # a REPL, where the warning is not necessary.
                values = {
                    "filename": filename,
                    "line_number": line_number,
                    "parser": builder.NAME,
                    "markup_type": markup_type,
                }
                warnings.warn(
                    GuessedAtParserWarning.MESSAGE % values,
                    GuessedAtParserWarning,
                    stacklevel=2,
                )
    elif kwargs:
        warnings.warn(
            "Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`."
        )

    self.builder = builder
    self.is_xml = builder.is_xml
    self.known_xml = self.is_xml
    self._namespaces = {}
    self.parse_only = parse_only

    if hasattr(markup, "read"):
        # A file-type object: parse what it contains.
        markup = cast(io.IOBase, markup).read()
    elif not isinstance(markup, (bytes, str)) and not hasattr(markup, "__len__"):
        raise TypeError(
            f"Incoming markup is of an invalid type: {markup!r}. Markup must be a string, a bytestring, or an open filehandle."
        )
    elif isinstance(markup, Sized) and len(markup) <= 256:
        # Short input with no "<" and no newline may be a URL or a
        # filename passed in by mistake. Warn about it, but parse
        # it as markup anyway, since that is sometimes intended.
        if isinstance(markup, bytes):
            lacks_markup = b"<" not in markup and b"\n" not in markup
        elif isinstance(markup, str):
            lacks_markup = "<" not in markup and "\n" not in markup
        else:
            lacks_markup = False
        if lacks_markup and not self._markup_is_url(markup):
            self._markup_resembles_filename(markup)

    # From here on markup is a string or bytestring; a file-type
    # object has already been read.
    markup = cast(_RawMarkup, markup)

    # Try each version of the markup the builder offers and stop at
    # the first one the parser accepts.
    rejections = []
    for (
        self.markup,
        self.original_encoding,
        self.declared_html_encoding,
        self.contains_replacement_characters,
    ) in self.builder.prepare_markup(
        markup, from_encoding, exclude_encodings=exclude_encodings
    ):
        self.reset()
        self.builder.initialize_soup(self)
        try:
            self._feed()
        except ParserRejectedMarkup as rejection:
            rejections.append(rejection)
        else:
            break
    else:
        # Nothing was accepted.
        raise ParserRejectedMarkup(
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n "
            + "\n ".join(str(rejection) for rejection in rejections)
        )

    # Clear out the markup and remove the builder's circular
    # reference to this object.
    self.markup = None
    self.builder.soup = None
def copy_self(self) -> "BeautifulSoup":
    """Return an empty object of the same class that shares this
    object's TreeBuilder.

    No markup is associated with the new object. This is the first
    step of the deepcopy process.
    """
    empty_copy = type(self)("", None, self.builder)

    # Nothing is parsed again, so the encoding detected for the
    # original document has to be carried over by hand.
    empty_copy.original_encoding = self.original_encoding
    return empty_copy
508 def __getstate__(self) -> Dict[str, Any]:
509 # Frequently a tree builder can't be pickled.
510 d = dict(self.__dict__)
511 if "builder" in d and d["builder"] is not None and not self.builder.picklable:
512 d["builder"] = type(self.builder)
513 # Store the contents as a Unicode string.
514 d["contents"] = []
515 d["markup"] = self.decode()
517 # If _most_recent_element is present, it's a Tag object left
518 # over from initial parse. It might not be picklable and we
519 # don't need it.
520 if "_most_recent_element" in d:
521 del d["_most_recent_element"]
522 return d
524 def __setstate__(self, state: Dict[str, Any]) -> None:
525 # If necessary, restore the TreeBuilder by looking it up.
526 self.__dict__ = state
527 if isinstance(self.builder, type):
528 self.builder = self.builder()
529 elif not self.builder:
530 # We don't know which builder was used to build this
531 # parse tree, so use a default we know is always available.
532 self.builder = HTMLParserTreeBuilder()
533 self.builder.soup = self
534 self.reset()
535 self._feed()
@classmethod
@_deprecated(
    replaced_by="nothing (private method, will be removed)", version="4.13.0"
)
def _decode_markup(cls, markup: _RawMarkup) -> str:
    """Return `markup` as a Unicode string.

    This existed to make markup safe to send into warnings.warn,
    which had a problem with bytestrings back in 2010 but
    fortunately not anymore. It has not been used for a long time.

    :param markup: A string or bytestring of markup.
    :return: Bytes decoded as UTF-8, with undecodable sequences
     replaced; a string is returned as it is.
    """
    return markup.decode("utf-8", "replace") if isinstance(markup, bytes) else markup
554 @classmethod
555 def _markup_is_url(cls, markup: _RawMarkup) -> bool:
556 """Error-handling method to raise a warning if incoming markup looks
557 like a URL.
559 :param markup: A string of markup.
560 :return: Whether or not the markup resembled a URL
561 closely enough to justify issuing a warning.
562 """
563 problem: bool = False
564 if isinstance(markup, bytes):
565 problem = (
566 any(markup.startswith(prefix) for prefix in (b"http:", b"https:"))
567 and b" " not in markup
568 )
569 elif isinstance(markup, str):
570 problem = (
571 any(markup.startswith(prefix) for prefix in ("http:", "https:"))
572 and " " not in markup
573 )
574 else:
575 return False
577 if not problem:
578 return False
579 warnings.warn(
580 MarkupResemblesLocatorWarning.URL_MESSAGE % dict(what="URL"),
581 MarkupResemblesLocatorWarning,
582 stacklevel=3,
583 )
584 return True
586 @classmethod
587 def _markup_resembles_filename(cls, markup: _RawMarkup) -> bool:
588 """Error-handling method to issue a warning if incoming markup
589 resembles a filename.
591 :param markup: A string of markup.
592 :return: Whether or not the markup resembled a filename
593 closely enough to justify issuing a warning.
594 """
595 markup_b: bytes
597 # We're only checking ASCII characters, so rather than write
598 # the same tests twice, convert Unicode to a bytestring and
599 # operate on the bytestring.
600 if isinstance(markup, str):
601 markup_b = markup.encode("utf8")
602 else:
603 markup_b = markup
605 # Step 1: does it end with a common textual file extension?
606 filelike = False
607 lower = markup_b.lower()
608 extensions = [b".html", b".htm", b".xml", b".xhtml", b".txt"]
609 if any(lower.endswith(ext) for ext in extensions):
610 filelike = True
611 if not filelike:
612 return False
614 # Step 2: it _might_ be a file, but there are a few things
615 # we can look for that aren't very common in filenames.
617 # Characters that have special meaning to Unix shells. (< was
618 # excluded before this method was called.)
619 #
620 # Many of these are also reserved characters that cannot
621 # appear in Windows filenames.
622 for byte in markup_b:
623 if byte in b"?*#&;>$|":
624 return False
626 # Two consecutive forward slashes (as seen in a URL) or two
627 # consecutive spaces (as seen in fixed-width data).
628 #
629 # (Paths to Windows network shares contain consecutive
630 # backslashes, so checking that doesn't seem as helpful.)
631 if b"//" in markup_b:
632 return False
633 if b" " in markup_b:
634 return False
636 # A colon in any position other than position 1 (e.g. after a
637 # Windows drive letter).
638 if markup_b.startswith(b":"):
639 return False
640 colon_i = markup_b.rfind(b":")
641 if colon_i not in (-1, 1):
642 return False
644 # Step 3: If it survived all of those checks, it's similar
645 # enough to a file to justify issuing a warning.
646 warnings.warn(
647 MarkupResemblesLocatorWarning.FILENAME_MESSAGE % dict(what="filename"),
648 MarkupResemblesLocatorWarning,
649 stacklevel=3,
650 )
651 return True
653 def _feed(self) -> None:
654 """Internal method that parses previously set markup, creating a large
655 number of Tag and NavigableString objects.
656 """
657 # Convert the document to Unicode.
658 self.builder.reset()
660 if self.markup is not None:
661 self.builder.feed(self.markup)
662 # Close out any unfinished strings and close all the open tags.
663 self.endData()
664 while (
665 self.currentTag is not None and self.currentTag.name != self.ROOT_TAG_NAME
666 ):
667 self.popTag()
def reset(self) -> None:
    """Return this object to the state it would be in had it never
    parsed any markup.
    """
    # Re-initialize the Tag part of this object as the hidden root.
    Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
    self.hidden = True
    self.builder.reset()

    # Start again with empty parsing state.
    self.current_data = []
    self.currentTag = None
    self.tagStack = []
    self.open_tag_counter = Counter()
    self.preserve_whitespace_tag_stack = []
    self.string_container_stack = []
    self._most_recent_element = None

    # The root goes onto the tag stack first.
    self.pushTag(self)
def new_tag(
    self,
    name: str,
    namespace: Optional[str] = None,
    nsprefix: Optional[str] = None,
    attrs: Optional[_RawAttributeValues] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    string: Optional[str] = None,
    **kwattrs: _RawAttributeValue,
) -> Tag:
    """Create a new Tag associated with this BeautifulSoup object.

    :param name: The name of the new Tag.
    :param namespace: The URI of the new Tag's XML namespace, if any.
    :param nsprefix: The prefix for the new Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values; can
        be used instead of ``kwattrs`` for attributes like 'class'
        that are reserved words in Python. Applied after ``kwattrs``.
    :param sourceline: The line number where this tag was
        (purportedly) found in its source document.
    :param sourcepos: The character position within ``sourceline`` where this
        tag was (purportedly) found.
    :param string: String content for the new Tag, if any.
    :param kwattrs: Keyword arguments for the new Tag's attribute values.
    """
    attributes = self.builder.attribute_dict_class(**kwattrs)
    if attrs is not None:
        attributes.update(attrs)

    # Use the class the user substituted for Tag, if any. It is
    # assumed to be Tag or a subclass of it; if not, the user
    # brought type-unsafety upon themselves.
    tag_class = cast(Type[Tag], self.element_classes.get(Tag, Tag))

    new_element = tag_class(
        None,
        self.builder,
        name,
        namespace,
        nsprefix,
        attributes,
        sourceline=sourceline,
        sourcepos=sourcepos,
    )
    if string is not None:
        new_element.string = string
    return new_element
def string_container(
    self, base_class: Optional[Type[NavigableString]] = None
) -> Type[NavigableString]:
    """Find the class that should be instantiated to hold a given
    kind of string.

    The result may be a built-in Beautiful Soup class or a custom
    class passed in to the BeautifulSoup constructor.

    :param base_class: The class that would normally be used;
     NavigableString when omitted.
    """
    requested = NavigableString if not base_class else base_class

    # The user may want some other class (hopefully a custom
    # subclass) in place of the one we'd use normally.
    container = cast(
        Type[NavigableString], self.element_classes.get(requested, requested)
    )

    # On top of that, a plain NavigableString inside certain tags
    # gets the special container class the builder names for that tag.
    open_containers = self.string_container_stack
    if container is NavigableString and open_containers:
        return self.builder.string_containers.get(
            open_containers[-1].name, container
        )
    return container
def new_string(
    self, s: str, subclass: Optional[Type[NavigableString]] = None
) -> NavigableString:
    """Create a new `NavigableString` associated with this
    `BeautifulSoup` object.

    :param s: The string content of the `NavigableString`.
    :param subclass: The subclass of `NavigableString`, if any, to
        use. If a document is being processed, an appropriate
        subclass for the current location in the document will
        be determined automatically.
    """
    string_class = self.string_container(subclass)
    return string_class(s)
def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """Always raise: a `BeautifulSoup` object has nothing before it.

    This method is part of the PageElement API, but `BeautifulSoup`
    doesn't implement it because there is nothing before or after
    it in the parse tree.

    :raise NotImplementedError: Always.
    """
    message = "BeautifulSoup objects don't support insert_before()."
    raise NotImplementedError(message)
def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """Always raise: a `BeautifulSoup` object has nothing after it.

    This method is part of the PageElement API, but `BeautifulSoup`
    doesn't implement it because there is nothing before or after
    it in the parse tree.

    :raise NotImplementedError: Always.
    """
    message = "BeautifulSoup objects don't support insert_after()."
    raise NotImplementedError(message)
def popTag(self) -> Optional[Tag]:
    """Internal method called by _popToTag when a tag is closed.

    :return: The tag that is current after the pop, or None if the
     tag stack was already empty.

    :meta private:
    """
    if not self.tagStack:
        # Nothing to pop. This shouldn't happen.
        return None

    closed = self.tagStack.pop()
    open_counts = self.open_tag_counter
    if closed.name in open_counts:
        open_counts[closed.name] -= 1

    # The closed tag also leaves each helper stack it is on top of.
    for helper_stack in (
        self.preserve_whitespace_tag_stack,
        self.string_container_stack,
    ):
        if helper_stack and closed == helper_stack[-1]:
            helper_stack.pop()

    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
def pushTag(self, tag: Tag) -> None:
    """Internal method called by handle_starttag when a tag is opened.

    The tag is appended to the current tag's contents, if there is
    a current tag, and then becomes the current tag itself.

    :meta private:
    """
    parent = self.currentTag
    if parent is not None:
        parent.contents.append(tag)
    self.tagStack.append(tag)
    self.currentTag = tag

    tag_name = tag.name
    # The root object is not counted as an open tag.
    if tag_name != self.ROOT_TAG_NAME:
        self.open_tag_counter[tag_name] += 1

    builder = self.builder
    if tag_name in builder.preserve_whitespace_tags:
        self.preserve_whitespace_tag_stack.append(tag)
    if tag_name in builder.string_containers:
        self.string_container_stack.append(tag)
def endData(self, containerClass: Optional[Type[NavigableString]] = None) -> None:
    """Method called by the TreeBuilder when the end of a data segment
    occurs.

    The collected chunks are joined into one string, which is added
    to the parse tree unless `parse_only` rejects it. Does nothing
    when no data has been collected.

    :param containerClass: The class to use when incorporating the
     data segment into the parse tree.

    :meta private:
    """
    if not self.current_data:
        return

    text = "".join(self.current_data)

    # If whitespace is not preserved, and this string contains
    # nothing but ASCII spaces, replace it with a single space or
    # newline.
    if not self.preserve_whitespace_tag_stack and all(
        character in self.ASCII_SPACES for character in text
    ):
        text = "\n" if "\n" in text else " "

    # Reset the data collector.
    self.current_data = []

    # Should we add this string to the tree at all? With at most
    # the root on the tag stack, the strainer decides.
    strainer = self.parse_only
    if (
        strainer
        and len(self.tagStack) <= 1
        and not strainer.allow_string_creation(text)
    ):
        return

    string_class = self.string_container(containerClass)
    self.object_was_parsed(string_class(text))
def object_was_parsed(
    self,
    o: PageElement,
    parent: Optional[Tag] = None,
    most_recent_element: Optional[PageElement] = None,
) -> None:
    """Method called by the TreeBuilder to integrate an object into the
    parse tree.

    :param o: The object to add.
    :param parent: The tag to append `o` to; the current tag when
     omitted.
    :param most_recent_element: The element to link `o` after; the
     most recently added element when omitted.

    :meta private:
    """
    if parent is None:
        parent = self.currentTag
    assert parent is not None

    previous_element: Optional[PageElement] = (
        most_recent_element
        if most_recent_element is not None
        else self._most_recent_element
    )

    next_element = previous_sibling = next_sibling = None
    if isinstance(o, Tag):
        # A Tag keeps the links it already has.
        next_element = o.next_element
        next_sibling = o.next_sibling
        previous_sibling = o.previous_sibling
        if previous_element is None:
            previous_element = o.previous_element

    # Decided before setup() runs: a parent that already has a next
    # element is a node that was already parsed.
    needs_fixing = parent.next_element is not None

    o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

    self._most_recent_element = o
    parent.contents.append(o)

    # Inserting into an already parsed node leaves links to repair.
    if needs_fixing:
        self._linkage_fixer(parent)
def _linkage_fixer(self, el: Tag) -> None:
    """Make sure linkage of this fragment is sound.

    :param el: A tag whose last child was just appended.
    """
    first_child = el.contents[0]
    last_child = el.contents[-1]
    tail: PageElement = last_child

    if last_child is first_child and el.parent is not None:
        # An only child is the parent's next element.
        el.next_element = last_child
        # Whatever used to precede the child no longer leads to it.
        former_previous = last_child.previous_element
        if former_previous is not None and former_previous is not el:
            former_previous.next_element = None
        # A first child follows its parent and has no previous sibling.
        last_child.previous_element = el
        last_child.previous_sibling = None

    # Appended last, so there is no next sibling.
    last_child.next_sibling = None

    # If the child is a tag with contents, the end of the fragment
    # is its last descendant.
    if isinstance(last_child, Tag) and last_child.contents:
        # _last_descendant is typed as returning Optional[PageElement],
        # but the value can't be None here, because this is a Tag
        # which we know has contents.
        tail = cast(PageElement, last_child._last_descendant(False))

    # As the final step, link the last descendant. It should lead to
    # the parent's next sibling (if found), else walk up the chain
    # and find a parent with a sibling. It should have no next
    # sibling.
    tail.next_element = None
    tail.next_sibling = None

    ancestor: Optional[Tag] = el
    while ancestor is not None:
        following = ancestor.next_sibling
        if following is not None:
            tail.next_element = following
            # NOTE(review): the back-link is set to the last child,
            # not to `tail`; the two differ when the last child has
            # descendants. Kept exactly as it was -- confirm that
            # this is intended.
            following.previous_element = last_child
            break
        ancestor = ancestor.parent
953 def _popToTag(
954 self, name: str, nsprefix: Optional[str] = None, inclusivePop: bool = True
955 ) -> Optional[Tag]:
956 """Pops the tag stack up to and including the most recent
957 instance of the given tag.
959 If there are no open tags with the given name, nothing will be
960 popped.
962 :param name: Pop up to the most recent tag with this name.
963 :param nsprefix: The namespace prefix that goes with `name`.
964 :param inclusivePop: It this is false, pops the tag stack up
965 to but *not* including the most recent instqance of the
966 given tag.
968 :meta private:
969 """
970 # print("Popping to %s" % name)
971 if name == self.ROOT_TAG_NAME:
972 # The BeautifulSoup object itself can never be popped.
973 return None
975 most_recently_popped = None
977 stack_size = len(self.tagStack)
978 for i in range(stack_size - 1, 0, -1):
979 if not self.open_tag_counter.get(name):
980 break
981 t = self.tagStack[i]
982 if name == t.name and nsprefix == t.prefix:
983 if inclusivePop:
984 most_recently_popped = self.popTag()
985 break
986 most_recently_popped = self.popTag()
988 return most_recently_popped
def handle_starttag(
    self,
    name: str,
    namespace: Optional[str],
    nsprefix: Optional[str],
    attrs: _RawAttributeValues,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    namespaces: Optional[Dict[str, str]] = None,
) -> Optional[Tag]:
    """Called by the tree builder when a new tag is encountered.

    :param name: Name of the tag.
    :param namespace: The tag's namespace, if any; handed to the tag
        class unchanged.
    :param nsprefix: Namespace prefix for the tag.
    :param attrs: A dictionary of attribute values. Note that
        attribute values are expected to be simple strings; processing
        of multi-valued attributes such as "class" comes later.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param namespaces: A dictionary of all namespace prefix mappings
        currently in scope in the document.

    If this method returns None, the tag was rejected by an active
    `ElementFilter`. You should proceed as if the tag had not occurred
    in the document. For instance, if this was a self-closing tag,
    don't call handle_endtag.

    :meta private:
    """
    self.endData()

    # The parse_only filter is consulted only while the tag stack
    # holds at most one entry.
    if self.parse_only and len(self.tagStack) <= 1:
        if not self.parse_only.allow_tag_creation(nsprefix, name, attrs):
            return None

    # Assume that the registered class is either Tag or a subclass of
    # Tag. If not, the user brought type-unsafety upon themselves.
    factory = cast(Type[Tag], self.element_classes.get(Tag, Tag))
    new_tag = factory(
        self,
        self.builder,
        name,
        namespace,
        nsprefix,
        attrs,
        self.currentTag,
        self._most_recent_element,
        sourceline=sourceline,
        sourcepos=sourcepos,
        namespaces=namespaces,
    )
    if new_tag is None:
        return new_tag

    # Chain the new tag after whatever was parsed most recently, then
    # make it both the most recent element and the innermost open tag.
    previous = self._most_recent_element
    if previous is not None:
        previous.next_element = new_tag
    self._most_recent_element = new_tag
    self.pushTag(new_tag)
    return new_tag
def handle_endtag(self, name: str, nsprefix: Optional[str] = None) -> None:
    """Called by the tree builder when an ending tag is encountered.

    Calls `endData` and then pops the tag stack up to and including
    the most recent open tag matching `name` and `nsprefix`.

    :param name: Name of the tag.
    :param nsprefix: Namespace prefix for the tag.

    :meta private:
    """
    # endData() runs before the stack is unwound; presumably this
    # flushes any buffered text into the tag being closed (confirm
    # against endData).
    self.endData()
    self._popToTag(name, nsprefix)
def handle_data(self, data: str) -> None:
    """Called by the tree builder when a chunk of textual data is
    encountered.

    The chunk is appended to ``self.current_data``.

    :param data: The chunk of text that was encountered.

    :meta private:
    """
    pending = self.current_data
    pending.append(data)
def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: Union[Formatter, str] = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
    **kwargs: Any,
) -> str:
    """Returns a string representation of the parse tree
    as a full HTML or XML document.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing. For backwards compatibility a bool is
        still accepted: True is treated as 0, False as None, and a
        DeprecationWarning is issued.
    :param eventual_encoding: The encoding of the final document.
        If this is None, the document will be a Unicode string.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :param kwargs: When ``indent_level`` is not a bool, the only
        keyword accepted here is the deprecated ``pretty_print``.
        A value of True or False is translated into ``indent_level``
        (0 or None), and any value other than None triggers a
        DeprecationWarning. Any other keyword fails an assertion.
    """
    if self.is_xml:
        # Print the XML declaration
        encoding_part = ""
        declared_encoding: Optional[str] = eventual_encoding
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            # This is a special Python encoding; it can't actually
            # go into an XML document because it means nothing
            # outside of Python.
            declared_encoding = None
        if declared_encoding is not None:
            encoding_part = ' encoding="%s"' % declared_encoding
        prefix = '<?xml version="1.0"%s?>\n' % encoding_part
    else:
        prefix = ""

    # Prior to 4.13.0, the first argument to this method was a
    # bool called pretty_print, which gave the method a different
    # signature from its superclass implementation, Tag.decode.
    #
    # The signatures of the two methods now match, but just in
    # case someone is still passing a boolean in as the first
    # argument to this method (or a keyword argument with the old
    # name), we can handle it and put out a DeprecationWarning.
    warning: Optional[str] = None
    if isinstance(indent_level, bool):
        # Convert before building the message, so that the message
        # shows the replacement value the caller should pass.
        indent_level = 0 if indent_level else None
        warning = f"As of 4.13.0, the first argument to BeautifulSoup.decode has been changed from bool to int, to match Tag.decode. Pass in a value of {indent_level} instead."
    else:
        pretty_print = kwargs.pop("pretty_print", None)
        # Kept as an assertion so the exception type seen by callers
        # is unchanged; note that it is skipped under ``python -O``.
        assert not kwargs
        if pretty_print is not None:
            if pretty_print is True:
                indent_level = 0
            elif pretty_print is False:
                indent_level = None
            warning = f"As of 4.13.0, the pretty_print argument to BeautifulSoup.decode has been removed, to match Tag.decode. Pass in a value of indent_level={indent_level} instead."

    # Every legacy bool has been normalized above and always sets
    # `warning`, so no further False-handling is needed here.
    if warning:
        warnings.warn(warning, DeprecationWarning, stacklevel=2)
    return prefix + super(BeautifulSoup, self).decode(
        indent_level, eventual_encoding, formatter, iterator
    )
# Short aliases for the BeautifulSoup class, to make it easier to get
# started quickly, e.g. 'from bs4 import _soup'
_s = _soup = BeautifulSoup
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser.

    Behaves like `BeautifulSoup`, except that ``features`` is always
    set to "xml" and a DeprecationWarning is issued on construction.
    """

    def __init__(self, *args: Any, **kwargs: Any):
        # Any 'features' value supplied by the caller is overwritten.
        kwargs["features"] = "xml"
        message = (
            "The BeautifulStoneSoup class was deprecated in version 4.0.0. Instead of using "
            'it, pass features="xml" into the BeautifulSoup constructor.'
        )
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        super().__init__(*args, **kwargs)
# If this file is run as a script, act as an HTML pretty-printer:
# parse standard input and print the prettified document.
if __name__ == "__main__":
    import sys

    print(BeautifulSoup(sys.stdin).prettify())