Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 39%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from __future__ import annotations
3# Use of this source code is governed by the MIT license.
4__license__ = "MIT"
6import re
7import warnings
9from bs4.css import CSS
10from bs4._deprecation import (
11 _deprecated,
12 _deprecated_alias,
13 _deprecated_function_alias,
14)
15from bs4.formatter import (
16 Formatter,
17 HTMLFormatter,
18 XMLFormatter,
19)
20from bs4._warnings import AttributeResemblesVariableWarning
22from typing import (
23 Any,
24 Callable,
25 Dict,
26 Generic,
27 Iterable,
28 Iterator,
29 List,
30 Mapping,
31 Optional,
32 Pattern,
33 Set,
34 TYPE_CHECKING,
35 Tuple,
36 Type,
37 TypeVar,
38 Union,
39 cast,
40)
41from typing_extensions import (
42 Self,
43 TypeAlias,
44)
46if TYPE_CHECKING:
47 from bs4 import BeautifulSoup
48 from bs4.builder import TreeBuilder
49 from bs4.filter import ElementFilter
50 from bs4.formatter import (
51 _EntitySubstitutionFunction,
52 _FormatterOrName,
53 )
54 from bs4._typing import (
55 _AtMostOneElement,
56 _AttributeValue,
57 _AttributeValues,
58 _Encoding,
59 _InsertableElement,
60 _OneElement,
61 _QueryResults,
62 _RawOrProcessedAttributeValues,
63 _StrainableElement,
64 _StrainableAttribute,
65 _StrainableAttributes,
66 _StrainableString,
67 )
69_OneOrMoreStringTypes: TypeAlias = Union[
70 Type["NavigableString"], Iterable[Type["NavigableString"]]
71]
73_FindMethodName: TypeAlias = Optional[Union["_StrainableElement", "ElementFilter"]]
# Module attributes that are deprecated but still reachable. Each key is the
# public attribute name and each value is the warning template shown when
# that attribute is read; the object handed back is stored in a module
# global named ``_deprecated_<name>``.
# See https://peps.python.org/pep-0562/
_deprecated_names = {
    "whitespace_re": "The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy."
}
#: :meta private:
_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+")


def __getattr__(name: str) -> Any:
    """Resolve a deprecated module attribute, warning the caller.

    :param name: The attribute being looked up on this module.
    :return: The module global ``_deprecated_<name>``.
    :raises AttributeError: If ``name`` is not a known deprecated attribute.
    """
    template = _deprecated_names.get(name)
    if template is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # stacklevel=2 attributes the warning to the code that touched the
    # attribute rather than to this hook.
    warnings.warn(template.format(name=name), DeprecationWarning, stacklevel=2)
    return globals()[f"_deprecated_{name}"]
93#: Documents output by Beautiful Soup will be encoded with
94#: this encoding unless you specify otherwise.
95DEFAULT_OUTPUT_ENCODING: str = "utf-8"
#: A regular expression that matches each run of non-whitespace characters,
#: so ``nonwhitespace_re.findall(s)`` splits ``s`` on whitespace.
98nonwhitespace_re: Pattern[str] = re.compile(r"\S+")
100#: These encodings are recognized by Python (so `Tag.encode`
101#: could theoretically support them) but XML and HTML don't recognize
102#: them (so they should not show up in an XML or HTML document as that
103#: document's encoding).
104#:
105#: If an XML document is encoded in one of these encodings, no encoding
106#: will be mentioned in the XML declaration. If an HTML document is
107#: encoded in one of these encodings, and the HTML document has a
108#: <meta> tag that mentions an encoding, the encoding will be given as
109#: the empty string.
110#:
111#: Source:
112#: Python documentation, `Python Specific Encodings <https://docs.python.org/3/library/codecs.html#python-specific-encodings>`_
PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
class NamespacedAttribute(str):
    """An attribute name that may carry a namespace prefix.

    The string value is ``prefix:name`` (e.g. ``xml:lang``), and the
    pieces used to build it stay available as the ``prefix``, ``name``
    and ``namespace`` attributes.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        """Build the attribute name from its parts.

        :param prefix: The namespace prefix, e.g. ``xml``.
        :param name: The local name, e.g. ``lang``. A false value
            (``None`` or the empty string) is stored as ``None``.
        :param namespace: The namespace associated with the prefix.
        """
        # A missing name means this is the default namespace, whose name
        # "has no value" per https://www.w3.org/TR/xml-names/#defaulting
        name = name or None

        if name is None:
            text = prefix
        elif prefix:
            text = prefix + ":" + name
        else:
            # No prefix, so this is not really namespaced.
            text = name

        attribute = str.__new__(cls, text)
        attribute.prefix = prefix
        attribute.name = name
        attribute.namespace = namespace
        return attribute
class AttributeValueWithCharsetSubstitution(str):
    """Abstract stand-in for a character encoding named inside an HTML
    ``<meta>`` tag.

    There is one subclass for each place such an encoding can appear:
    the ``charset`` attribute (`CharsetMetaAttributeValue`) and the
    ``content`` attribute (`ContentMetaAttributeValue`).

    This lets Beautiful Soup swap in a different encoding name for that
    part of the document when outputting a tree as a string.
    """

    # The original, un-encoded value of the attribute.
    #: :meta private:
    original_value: str

    def substitute_encoding(self, eventual_encoding: str) -> str:
        """Return this attribute value rewritten to mention
        ``eventual_encoding``.

        Subclasses must override this; the base implementation always
        raises.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``charset`` attribute.

    Parsing ``<meta charset="utf8">`` turns the value of ``charset``
    into one of these objects, so that when the document is later
    encoded to some other encoding the ``<meta>`` tag can name the new
    encoding instead of ``utf8``.
    """

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from.

        :param original_value: The attribute value as found in the markup.
        """
        # Nothing here needs original_value afterwards, but it is kept
        # on the object in case the user wants to inspect it.
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the text to use as the ``charset`` value when the
        document is encoded as ``eventual_encoding``.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :return: The encoding name itself, or the empty string when it
            is one of `PYTHON_SPECIFIC_ENCODINGS`.
        """
        return "" if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS else eventual_encoding
class AttributeValueList(List[str]):
    """The list type that holds the values of a multi-valued attribute
    (such as HTML's 'class').

    It adds nothing to a regular list. It exists so that you can
    subclass it and pass your subclass to the TreeBuilder constructor
    as attribute_value_list_class, to have that subclass instantiated
    instead.
    """
class AttributeDict(Dict[Any, Any]):
    """Base class for the dictionary that holds a tag's attributes.

    It can be used directly, but it behaves exactly like a regular
    dict and has no special logic of its own.
    """
class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which normalizes
    incoming values for consistency with the XML spec.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Store an attribute value, converting common non-string
        values to strings first.

        XML attributes may have "any literal string as a value", so:

        * ``None`` is stored as the empty string.
        * ``int`` and ``float`` values are stored as ``str(value)``.
        * ``bool`` values are stored untouched. XML defines no rules
          for boolean attributes, so the old Beautiful Soup behavior
          (a bool that is converted to a string on output) is kept
          rather than guessing what the value should be.
        * Anything else is stored as-is.

        :param key: The attribute name.
        :param value: The attribute value.
        """
        if value is None:
            value = ""
        elif isinstance(value, (int, float)) and not isinstance(value, bool):
            # Only plain numbers are stringified. Converting _every_
            # value would be dangerous, since a value may be a more
            # sophisticated string-like object
            # (e.g. CharsetMetaAttributeValue).
            value = str(value)

        super().__setitem__(key, value)
class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply
        with the HTML spec.

        * ``False`` and ``None`` remove the attribute (or leave it
          absent) instead of storing anything.
        * ``True`` stores the attribute's own name as its value.
        * ``int`` and ``float`` values, including zero, are stored as
          ``str(value)``.
        * Anything else is stored as-is.

        :param key: The attribute name.
        :param value: The attribute value.
        """
        # Identity checks are deliberate. A membership test such as
        # `value in (False, None)` compares with ==, and 0 == False in
        # Python, so a numeric zero would be mistaken for a false
        # boolean and the attribute dropped.
        if value is False or value is None:
            # 'The values "true" and "false" are not allowed on
            # boolean attributes. To represent a false value, the
            # attribute has to be omitted altogether.'
            if key in self:
                del self[key]
            return
        if isinstance(value, bool):
            # 'If the [boolean] attribute is present, its value must
            # either be the empty string or a value that is an ASCII
            # case-insensitive match for the attribute's canonical
            # name, with no leading or trailing whitespace.'
            #
            # [fixme] It's not clear to me whether "canonical name"
            # means fully-qualified name, unqualified name, or
            # (probably not) name with namespace prefix. For now I'm
            # going with unqualified name.
            if isinstance(key, NamespacedAttribute):
                value = key.name
            else:
                value = key
        elif isinstance(value, (int, float)):
            # See note in XMLAttributeDict for the reasoning why we
            # only do this to numbers.
            value = str(value)
        super().__setitem__(key, value)
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``content`` attribute.

    Parsing the markup
    ``<meta http-equiv="content-type" content="text/html; charset=utf8">``
    turns the value of ``content`` into one of these objects, so that
    when the document is later encoded to some other encoding the
    ``<meta>`` tag can name the new encoding instead of ``utf8``.
    """

    #: Match the 'charset' argument inside the 'content' attribute
    #: of a <meta> tag.
    #: :meta private:
    CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from.

        :param original_value: The attribute value as found in the markup.
        """
        # NOTE(review): the match result is discarded, so this call has
        # no visible effect here beyond running the regex over the
        # input. It is kept to leave behavior untouched.
        cls.CHARSET_RE.search(original_value)
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the original ``content`` value with its ``charset=``
        part rewritten for ``eventual_encoding``.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :return: The original value with the charset replaced by
            ``eventual_encoding``; or, when that encoding is one of
            `PYTHON_SPECIFIC_ENCODINGS`, with the whole ``charset=...``
            match removed.
        """
        original = self.original_value
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            return self.CHARSET_RE.sub("", original)

        # Keep group 1 (the optional ';', whitespace and 'charset=')
        # and replace only the encoding name that follows it.
        return self.CHARSET_RE.sub(
            lambda match: match.group(1) + eventual_encoding, original
        )
346class PageElement(object):
347 """An abstract class representing a single element in the parse tree.
349 `NavigableString`, `Tag`, etc. are all subclasses of
350 `PageElement`. For this reason you'll see a lot of methods that
351 return `PageElement`, but you'll never see an actual `PageElement`
352 object. For the most part you can think of `PageElement` as
353 meaning "a `Tag` or a `NavigableString`."
354 """
356 #: In general, we can't tell just by looking at an element whether
357 #: it's contained in an XML document or an HTML document. But for
358 #: `Tag` objects (q.v.) we can store this information at parse time.
359 #: :meta private:
360 known_xml: Optional[bool] = None
362 #: Whether or not this element has been decomposed from the tree
363 #: it was created in.
364 _decomposed: bool
366 parent: Optional[Tag]
367 next_element: _AtMostOneElement
368 previous_element: _AtMostOneElement
369 next_sibling: _AtMostOneElement
370 previous_sibling: _AtMostOneElement
372 #: Whether or not this element is hidden from generated output.
373 #: Only the `BeautifulSoup` object itself is hidden.
374 hidden: bool = False
376 def setup(
377 self,
378 parent: Optional[Tag] = None,
379 previous_element: _AtMostOneElement = None,
380 next_element: _AtMostOneElement = None,
381 previous_sibling: _AtMostOneElement = None,
382 next_sibling: _AtMostOneElement = None,
383 ) -> None:
384 """Sets up the initial relations between this element and
385 other elements.
387 :param parent: The parent of this element.
389 :param previous_element: The element parsed immediately before
390 this one.
392 :param next_element: The element parsed immediately after
393 this one.
395 :param previous_sibling: The most recently encountered element
396 on the same level of the parse tree as this one.
398 :param previous_sibling: The next element to be encountered
399 on the same level of the parse tree as this one.
400 """
401 self.parent = parent
403 self.previous_element = previous_element
404 if self.previous_element is not None:
405 self.previous_element.next_element = self
407 self.next_element = next_element
408 if self.next_element is not None:
409 self.next_element.previous_element = self
411 self.next_sibling = next_sibling
412 if self.next_sibling is not None:
413 self.next_sibling.previous_sibling = self
415 if (
416 previous_sibling is None
417 and self.parent is not None
418 and self.parent.contents
419 ):
420 previous_sibling = self.parent.contents[-1]
422 self.previous_sibling = previous_sibling
423 if self.previous_sibling is not None:
424 self.previous_sibling.next_sibling = self
def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str:
    """Run a string through a formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters. ``None`` means no formatting at all.
    :return: ``s`` unchanged when ``formatter`` is ``None``;
        otherwise the result of the formatter's ``substitute``.
    """
    if formatter is None:
        return s
    resolved = (
        formatter
        if isinstance(formatter, Formatter)
        else self.formatter_for_name(formatter)
    )
    return resolved.substitute(s)
def formatter_for_name(
    self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction]
) -> Formatter:
    """Turn a formatter identifier into a `Formatter` object.

    :param formatter_name: Can be a `Formatter` object (returned
        as-is), a function (used as the entity substitution hook for
        a new `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter`), or a string (used to look up
        an existing formatter in the ``REGISTRY`` of the appropriate
        class). XML versus HTML is decided by ``self._is_xml``.
    """
    if isinstance(formatter_name, Formatter):
        return formatter_name

    formatter_class: type[Formatter] = XMLFormatter if self._is_xml else HTMLFormatter
    if callable(formatter_name):
        return formatter_class(entity_substitution=formatter_name)

    registry: Mapping[Optional[str], Formatter] = formatter_class.REGISTRY
    return registry[formatter_name]
@property
def _is_xml(self) -> bool:
    """Is this element part of an XML tree or an HTML tree?

    Consulted by formatter_for_name when choosing between an
    XMLFormatter and an HTMLFormatter. It may have to walk up
    through the element's ancestors, so it can be inefficient, but
    it should be called very rarely.
    """
    known = self.known_xml
    if known is not None:
        # The usual case: this was determined when the document
        # was parsed.
        return known

    # Otherwise the element was probably created by calling the
    # constructor directly from the user's Python code.
    parent = self.parent
    if parent is not None:
        return parent._is_xml

    # This is the top-level object. It should have .known_xml set
    # from tree creation. If not, take a guess--BS is usually
    # used on HTML markup.
    return getattr(self, "is_xml", False)
490 nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0")
491 previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0")
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
    """Deep-copy hook; this base implementation always raises.

    :param memo: The memo dictionary used by the copy protocol.
    :param recursive: Not used by this base implementation.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
def __copy__(self) -> Self:
    """Copy this element by delegating to ``__deepcopy__``.

    A copy of a PageElement can only be a deep copy, because only
    one PageElement can occupy a given place in a parse tree.
    """
    memo: Dict[Any, Any] = {}
    return self.__deepcopy__(memo)
502 default: Iterable[type[NavigableString]] = tuple() #: :meta private:
def _all_strings(
    self, strip: bool = False, types: Iterable[type[NavigableString]] = default
) -> Iterator[str]:
    """Yield all strings of certain classes, possibly stripping them.

    `Tag` and `NavigableString` each provide their own
    implementation; this base version always raises.

    :param strip: Whether the strings should be stripped.
    :param types: The string classes to consider.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
@property
def stripped_strings(self) -> Iterator[str]:
    """Yield every interesting string in this PageElement, with
    stripping requested from `_all_strings`.

    See `Tag` for information on which strings are considered
    interesting in a given context.
    """
    yield from self._all_strings(True)
def get_text(
    self,
    separator: str = "",
    strip: bool = False,
    types: Iterable[Type[NavigableString]] = default,
) -> str:
    """Join all child strings of this PageElement into one string.

    :param separator: The text placed between consecutive strings.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    pieces = list(self._all_strings(strip, types=types))
    return separator.join(pieces)
549 getText = get_text
550 text = property(get_text)
def replace_with(self, *args: PageElement) -> Self:
    """Swap this `PageElement` out for one or more other `PageElement`
    objects, leaving the rest of the tree as it was.

    :param args: The replacement elements, inserted in order at the
        position this element occupied.
    :return: This `PageElement`, no longer part of the tree.
    :raises ValueError: If this element has no parent, or if one of
        the replacements is this element's parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree."
        )
    if len(args) == 1 and args[0] is self:
        # Replacing an element with itself is a no-op.
        return self
    for candidate in args:
        if candidate is parent:
            raise ValueError("Cannot replace a Tag with its parent.")

    # Remember where this element sits before it is pulled out, so
    # the replacements can be placed starting at the same index.
    position = parent.index(self)
    self.extract(_self_index=position)
    for offset, replacement in enumerate(args):
        parent.insert(position + offset, replacement)
    return self
575 replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0")
def wrap(self, wrap_inside: Tag) -> Tag:
    """Put this `PageElement` inside a `Tag`.

    This element is first replaced by ``wrap_inside`` and then
    appended to it.

    :param wrap_inside: The tag that will contain this element.
    :return: ``wrap_inside``, occupying the position in the tree that used
        to be occupied by this object, and with this object now inside it.
    """
    displaced = self.replace_with(wrap_inside)
    wrap_inside.append(displaced)
    return wrap_inside
def extract(self, _self_index: Optional[int] = None) -> Self:
    """Destructively rip this element out of the tree.

    :param _self_index: The location of this element in its parent's
        .contents, if known. Passing this in allows for a performance
        optimization.

    :return: this `PageElement`, no longer part of the tree.
    """
    parent = self.parent
    if parent is not None:
        position = parent.index(self) if _self_index is None else _self_index
        del parent.contents[position]

    # Find the two elements that would be next to each other if
    # this element (and any children) hadn't been parsed, and
    # connect them.
    #
    # _last_descendant is called with its default accept_self=True, so
    # it cannot return None; worst case it returns self. The cast
    # only exists to keep the type checker quiet.
    tail = cast("PageElement", self._last_descendant())
    following = tail.next_element
    preceding = self.previous_element

    if preceding is not following:
        if preceding is not None:
            preceding.next_element = following
        if following is not None:
            following.previous_element = preceding
    self.previous_element = None
    tail.next_element = None

    self.parent = None

    # Close the gap in the sibling chain the same way.
    before = self.previous_sibling
    after = self.next_sibling
    if before is not after:
        if before is not None:
            before.next_sibling = after
        if after is not None:
            after.previous_sibling = before
    self.previous_sibling = self.next_sibling = None
    return self
def decompose(self) -> None:
    """Recursively destroy this `PageElement` and its children.

    The element is first extracted from the tree. Then it and every
    element reachable from it through ``next_element`` have their
    instance state cleared and are flagged as decomposed.

    The behavior of a decomposed `PageElement` is undefined and you
    should never use one for anything, but if you need to *check*
    whether an element has been decomposed, you can use the
    `PageElement.decomposed` property.
    """
    self.extract()
    current: _AtMostOneElement = self
    while current is not None:
        # Grab the successor first: clearing __dict__ below wipes
        # the link that leads to it.
        successor = current.next_element
        current.__dict__.clear()
        if isinstance(current, Tag):
            current.contents = []
        current._decomposed = True
        current = successor
def _last_descendant(
    self, is_initialized: bool = True, accept_self: bool = True
) -> _AtMostOneElement:
    """Find the last element beneath this object to be parsed.

    A note for type checking: this returns _AtMostOneElement rather
    than PageElement, but the only time it returns None is when
    `accept_self` is False and the `PageElement` has no
    children--either it's a NavigableString or an empty Tag.

    :param is_initialized: Has `PageElement.setup` been called on
        this `PageElement` yet?

    :param accept_self: Is ``self`` an acceptable answer to the
        question?
    """
    following_sibling = self.next_sibling if is_initialized else None
    if following_sibling is not None:
        # Whatever was parsed just before the next sibling is the
        # last thing beneath this element.
        candidate = following_sibling.previous_element
    else:
        # No sibling to consult: walk down through the last child
        # of each Tag until something without contents is reached.
        candidate = self
        while isinstance(candidate, Tag) and candidate.contents:
            candidate = candidate.contents[-1]

    if candidate is self and not accept_self:
        return None
    return candidate
685 _lastRecursiveChild = _deprecated_alias(
686 "_lastRecursiveChild", "_last_descendant", "4.0.0"
687 )
def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """Make the given element(s) the immediate predecessor of this one.

    All the elements will have the same `PageElement.parent` as
    this one, and the given elements will occur immediately before
    this one.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.
    :raises ValueError: If this element has no parent, or if it is
        itself one of the elements to insert.
    """
    parent = self.parent
    if parent is None:
        raise ValueError("Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")

    inserted: List[PageElement] = []
    for predecessor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        position = parent.index(self)
        inserted += parent.insert(position, predecessor)
    return inserted
def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """Make the given element(s) the immediate successor of this one.

    The elements will have the same `PageElement.parent` as this
    one, and the given elements will occur immediately after this
    one.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.
    :raises ValueError: If this element has no parent, or if it is
        itself one of the elements to insert.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError("Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    inserted: List[PageElement] = []
    for offset, successor in enumerate(args):
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        anchor = parent.index(self)
        # Each successor goes one slot further right than the last,
        # which keeps the arguments in the order they were given.
        inserted.extend(parent.insert(anchor + 1 + offset, successor))
    return inserted
def find_next(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Return the first PageElement that matches the given criteria
    and appears later in the document than this PageElement.

    This is the single-result counterpart of `find_all_next`. All
    find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: Additional filters on attribute values.
    """
    finder = self.find_all_next
    return self._find_one(finder, name, attrs, string, **kwargs)
767 findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0")
def find_all_next(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Return every `PageElement` that matches the given criteria and
    appears later in the document than this `PageElement`.

    The search runs over ``self.next_elements``. All find_* methods
    take a common set of arguments. See the online documentation for
    detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.next_elements
    # One more stack level, to account for this wrapper's own frame.
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
801 findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0")
def find_next_sibling(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Return the closest sibling of this PageElement that matches the
    given criteria and appears later in the document.

    This is the single-result counterpart of `find_next_siblings`.
    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    finder = self.find_next_siblings
    return self._find_one(finder, name, attrs, string, **kwargs)
823 findNextSibling = _deprecated_function_alias(
824 "findNextSibling", "find_next_sibling", "4.0.0"
825 )
def find_next_siblings(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Return every sibling of this `PageElement` that matches the
    given criteria and appears later in the document.

    The search runs over ``self.next_siblings``. All find_* methods
    take a common set of arguments. See the online documentation for
    detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.next_siblings
    # One more stack level, to account for this wrapper's own frame.
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
859 findNextSiblings = _deprecated_function_alias(
860 "findNextSiblings", "find_next_siblings", "4.0.0"
861 )
862 fetchNextSiblings = _deprecated_function_alias(
863 "fetchNextSiblings", "find_next_siblings", "3.0.0"
864 )
def find_previous(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Look backwards in the document from this `PageElement` and
    return the first `PageElement` that matches the given criteria.

    This is the single-result counterpart of `find_all_previous`.
    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    finder = self.find_all_previous
    return self._find_one(finder, name, attrs, string, **kwargs)
886 findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0")
def find_all_previous(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Look backwards in the document from this `PageElement` and
    return every `PageElement` that matches the given criteria.

    The search runs over ``self.previous_elements``. All find_*
    methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.previous_elements
    # One more stack level, to account for this wrapper's own frame.
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
920 findAllPrevious = _deprecated_function_alias(
921 "findAllPrevious", "find_all_previous", "4.0.0"
922 )
923 fetchAllPrevious = _deprecated_function_alias(
924 "fetchAllPrevious", "find_all_previous", "3.0.0"
925 )
def find_previous_sibling(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Return the closest sibling of this `PageElement` that matches
    the given criteria and appears earlier in the document.

    This is the single-result counterpart of
    `find_previous_siblings`. All find_* methods take a common set of
    arguments. See the online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    finder = self.find_previous_siblings
    return self._find_one(finder, name, attrs, string, **kwargs)
# Older camelCase spelling of find_previous_sibling, registered through
# _deprecated_function_alias(old name, current name, version string).
findPreviousSibling = _deprecated_function_alias(
    "findPreviousSibling", "find_previous_sibling", "4.0.0"
)
def find_previous_siblings(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Find every earlier sibling of this `PageElement` that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    earlier_siblings = self.previous_siblings
    # This method adds one call frame between the caller and _find_all,
    # so the stack level handed down is one higher than the one received.
    return self._find_all(
        name,
        attrs,
        string,
        limit,
        earlier_siblings,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
# Older spellings of find_previous_siblings, registered through
# _deprecated_function_alias(old name, current name, version string).
findPreviousSiblings = _deprecated_function_alias(
    "findPreviousSiblings", "find_previous_siblings", "4.0.0"
)
fetchPreviousSiblings = _deprecated_function_alias(
    "fetchPreviousSiblings", "find_previous_siblings", "3.0.0"
)
def find_parent(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Find the closest parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.
    :return: The first result reported by `find_parents`, or None if
        there were no results.
    """
    # NOTE: _find_one can't be used here: it always forwards a `string`
    # argument, and find_parents does not accept one.
    matches = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None
# Older camelCase spelling of find_parent, registered through
# _deprecated_function_alias(old name, current name, version string).
findParent = _deprecated_function_alias("findParent", "find_parent", "4.0.0")
def find_parents(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Find every parent of this `PageElement` that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # There is no `string` filter for parents, so None is passed in that
    # position. The stack level grows by one for this extra call frame.
    return self._find_all(
        name,
        attrs,
        None,
        limit,
        self.parents,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
# Older spellings of find_parents, registered through
# _deprecated_function_alias(old name, current name, version string).
findParents = _deprecated_function_alias("findParents", "find_parents", "4.0.0")
fetchParents = _deprecated_function_alias("fetchParents", "find_parents", "3.0.0")
@property
def next(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just after this one.

    Returns the current value of ``next_element``.
    """
    return self.next_element
@property
def previous(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just before this one.

    Returns the current value of ``previous_element``.
    """
    return self.previous_element
1059 # These methods do the real heavy lifting.
def _find_one(
    self,
    # TODO-TYPING: "There is no syntax to indicate optional or
    # keyword arguments; such function types are rarely used
    # as callback types." - So, not sure how to get more
    # specific here.
    method: Callable,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Run a plural find_* method with a limit of one and return its
    single result.

    :param method: The plural find method to call. It is invoked as
        ``method(name, attrs, string, 1, _stacklevel=4, **kwargs)``.
    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    :return: The first item of the results, or None if the results
        are empty.
    """
    matches: _QueryResults = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
    return matches[0] if matches else None
def _find_all(
    self,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Iterate over a generator, collecting the things that match.

    :param name: A filter on tag name, or an `ElementFilter` to use
        as the matcher directly.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The elements to be considered.
    :param _stacklevel: The stacklevel used for any warning issued here.
    :kwargs: Additional filters on attribute values. The deprecated
        ``text`` keyword is accepted as a stand-in for ``string``.
    """
    # Backwards compatibility: 'text' is the old name of 'string'. It is
    # only honored when no 'string' was given.
    if string is None and "text" in kwargs:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    # '_class' looks like a misspelling of 'class_'; warn, but carry on.
    if "_class" in kwargs:
        substitutions = {"original": "_class", "autocorrect": "class_"}
        warnings.warn(
            AttributeResemblesVariableWarning.MESSAGE % substitutions,
            AttributeResemblesVariableWarning,
            stacklevel=_stacklevel,
        )

    from bs4.filter import ElementFilter

    matcher = (
        name
        if isinstance(name, ElementFilter)
        else SoupStrainer(name, attrs, string, **kwargs)
    )

    # The shortcuts below apply only when the tag name is the sole criterion.
    name_is_only_filter = string is None and not limit and not attrs and not kwargs

    if name_is_only_filter and (name is True or name is None):
        # Optimization to find all tags. The generator expression is
        # handed to ResultSet without being consumed here.
        every_tag = (element for element in generator if isinstance(element, Tag))
        return ResultSet(matcher, every_tag)

    if name_is_only_filter and isinstance(name, str):
        # Optimization to find all tags with a given name.
        if name.count(":") == 1:
            # This is a name with a prefix. If this is a namespace-aware
            # document, the local name is matched against tag.name. If
            # not, the fully-qualified name is matched against tag.name.
            prefix, local_name = name.split(":", 1)
        else:
            prefix, local_name = None, name
        named_tags = [
            element
            for element in generator
            if isinstance(element, Tag)
            and (
                element.name == name
                or (
                    element.name == local_name
                    and (prefix is None or element.prefix == prefix)
                )
            )
        ]
        return ResultSet(matcher, named_tags)

    # General case: let the matcher walk the generator itself.
    return matcher.find_all(generator, limit)
1145 # These generators can be used to navigate starting from both
1146 # NavigableStrings and Tags.
@property
def next_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed after this one.

    :yield: A sequence of PageElements, following the ``next_element``
        links until one of them is None.
    """
    current = self.next_element
    while current is not None:
        # Look up the following element before yielding, so the walk
        # does not depend on what the consumer does to `current`.
        upcoming = current.next_element
        yield current
        current = upcoming
@property
def self_and_next_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all PageElements that were parsed after it.

    The sequence is produced by ``_self_and``, which leaves this element
    out when its ``hidden`` attribute is true.
    """
    return self._self_and(self.next_elements)
@property
def next_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    later.

    :yield: A sequence of PageElements, following the ``next_sibling``
        links until one of them is None.
    """
    sibling = self.next_sibling
    while sibling is not None:
        # Look up the following sibling before yielding, so the walk
        # does not depend on what the consumer does to `sibling`.
        following = sibling.next_sibling
        yield sibling
        sibling = following
@property
def self_and_next_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its siblings that were parsed later.

    The sequence is produced by ``_self_and``, which leaves this element
    out when its ``hidden`` attribute is true.
    """
    return self._self_and(self.next_siblings)
@property
def previous_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed before this one.

    :yield: A sequence of PageElements, following the
        ``previous_element`` links until one of them is None.
    """
    current = self.previous_element
    while current is not None:
        # Look up the preceding element before yielding, so the walk
        # does not depend on what the consumer does to `current`.
        earlier = current.previous_element
        yield current
        current = earlier
@property
def self_and_previous_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all elements that were parsed
    earlier.

    The sequence is produced by ``_self_and``, which leaves this element
    out when its ``hidden`` attribute is true.
    """
    return self._self_and(self.previous_elements)
@property
def previous_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    earlier.

    :yield: A sequence of PageElements, following the
        ``previous_sibling`` links until one of them is None.
    """
    sibling = self.previous_sibling
    while sibling is not None:
        # Look up the preceding sibling before yielding, so the walk
        # does not depend on what the consumer does to `sibling`.
        preceding = sibling.previous_sibling
        yield sibling
        sibling = preceding
@property
def self_and_previous_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its siblings that were parsed
    earlier.

    The sequence is produced by ``_self_and``, which leaves this element
    out when its ``hidden`` attribute is true.
    """
    return self._self_and(self.previous_siblings)
@property
def parents(self) -> Iterator[Tag]:
    """All elements that are parents of this PageElement.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    ancestor = self.parent
    while ancestor is not None:
        # Look up the next ancestor before yielding, so the walk does
        # not depend on what the consumer does to `ancestor`.
        grandparent = ancestor.parent
        yield ancestor
        ancestor = grandparent
@property
def self_and_parents(self) -> Iterator[PageElement]:
    """This element, then all of its parents.

    The sequence is produced by ``_self_and``, which leaves this element
    out when its ``hidden`` attribute is true.

    :yield: A sequence of PageElements, ending with a BeautifulSoup object.
    """
    return self._self_and(self.parents)
def _self_and(self, other_generator: Iterator[PageElement]) -> Iterator[PageElement]:
    """Wrap a generator so that this element comes first, followed by
    everything the other generator yields.

    An element whose ``hidden`` attribute is true is not yielded; only
    the contents of the other generator are.

    :param other_generator: The iterator to continue with.
    """
    if not self.hidden:
        yield self
    yield from other_generator
@property
def decomposed(self) -> bool:
    """Check whether a PageElement has been decomposed.

    Reads the ``_decomposed`` attribute; a missing or falsy value is
    reported as False.
    """
    marker = getattr(self, "_decomposed", False)
    return marker or False
@_deprecated("next_elements", "4.0.0")
def nextGenerator(self) -> Iterator[PageElement]:
    """Older method-style spelling; returns ``self.next_elements``.

    :meta private:
    """
    return self.next_elements
@_deprecated("next_siblings", "4.0.0")
def nextSiblingGenerator(self) -> Iterator[PageElement]:
    """Older method-style spelling; returns ``self.next_siblings``.

    :meta private:
    """
    return self.next_siblings
@_deprecated("previous_elements", "4.0.0")
def previousGenerator(self) -> Iterator[PageElement]:
    """Older method-style spelling; returns ``self.previous_elements``.

    :meta private:
    """
    return self.previous_elements
@_deprecated("previous_siblings", "4.0.0")
def previousSiblingGenerator(self) -> Iterator[PageElement]:
    """Older method-style spelling; returns ``self.previous_siblings``.

    :meta private:
    """
    return self.previous_siblings
@_deprecated("parents", "4.0.0")
def parentGenerator(self) -> Iterator[PageElement]:
    """Older method-style spelling; returns ``self.parents``.

    :meta private:
    """
    return self.parents
class NavigableString(str, PageElement):
    """A Python string that is part of a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    #: A string prepended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '<!--'
    #: in an HTML comment.
    PREFIX: str = ""

    #: A string appended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '-->'
    #: in an HTML comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.

        :param value: The text, either as a str or as bytes.
        """
        # Anything that is not already a str gets decoded with the
        # default output encoding.
        decode_args = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
        new_string = str.__new__(cls, value, *decode_args)
        new_string.hidden = False
        new_string.setup()
        return new_string

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """Copy this string: same contents and same class as the original,
        but built fresh through the class constructor.

        :param memo: Accepted for the deepcopy protocol; not used.
        :param recursive: This parameter is ignored; it's only defined
           so that NavigableString.__deepcopy__ implements the same
           signature as Tag.__deepcopy__.
        """
        return type(self)(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Provide the constructor argument for pickling: the plain text."""
        return (str(self),)

    # TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
    # is introduced in 3.8.
    def __getitem__(self, key: Union[int, slice]) -> str:
        """Index or slice like a normal string, with a more helpful error
        when a string key is used.

        :raise TypeError: If ``key`` is a string, which suggests the
            caller expected a `Tag` and wanted an attribute lookup.
        """
        if isinstance(key, str):
            raise TypeError("string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(key.__class__.__name__))
        return super().__getitem__(key)

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: This property always returns the `NavigableString` it was
           called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Run the string through the provided formatter, making it
        ready for output as part of an HTML or XML document.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        :return: The formatted text between `PREFIX` and `SUFFIX`.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self) -> None:
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Prevent NavigableString.name from ever being set.

        :raise AttributeError: Always.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This makes it easy for NavigableString to implement methods
        like get_text() as conveniences, creating a consistent
        text-extraction API across all PageElements.

        :param strip: If True, all strings will be stripped before being
            yielded.

        :param types: A single NavigableString subclass, or a collection
            of them. If this NavigableString isn't exactly one of those
            classes, the sequence will be empty. By default, the classes
            considered are the ones in `Tag.MAIN_CONTENT_STRING_TYPES`.
            None disables the type check.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # This is kept in Tag because it's full of subclasses of
            # this class, which aren't defined until later in the file.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        # The check is on the exact class rather than isinstance(),
        # because every string class subclasses NavigableString. Anyone
        # using this feature probably wants generic NavigableStrings
        # but not other stuff.
        if types is not None:
            own_type = type(self)
            if isinstance(types, type):
                # Looking for a single type.
                wanted = own_type is types
            else:
                # Looking for one of a collection of types.
                wanted = own_type in types
            if not wanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        This is defined the way it is for compatibility with
        `Tag.strings`. See `Tag` for information on which strings are
        interesting in a given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()
class PreformattedString(NavigableString):
    """A `NavigableString` not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (`Comment`) and CDATA blocks (`CData`).
    """

    # Subclasses override these with the markup that surrounds the
    # string when it is output.
    PREFIX: str = ""
    SUFFIX: str = ""

    def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
        """Make this string ready for output by adding any subclass-specific
        prefix or suffix.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters. The string will be passed into the
            `Formatter`, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # The result is deliberately discarded; the text itself is
            # output unchanged.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
    """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_.

    Output as ``<![CDATA[`` + the string + ``]]>``.
    """

    PREFIX: str = "<![CDATA["
    SUFFIX: str = "]]>"
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction.

    Output as ``<?`` + the string + ``>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = ">"
class XMLProcessingInstruction(ProcessingInstruction):
    """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_.

    Output as ``<?`` + the string + ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"
class Comment(PreformattedString):
    """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_.

    Output as ``<!--`` + the string + ``-->``.
    """

    PREFIX: str = "<!--"
    SUFFIX: str = "-->"
class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_.

    Output as ``<?`` + the string + ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"
class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_.

    Output as ``<!DOCTYPE `` + the string + ``>`` and a newline.
    """

    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        :return: A new `Doctype` wrapping the generated declaration text.
        """
        # NOTE(review): this always builds a plain Doctype, even when the
        # method is called on a subclass. Kept as-is for compatibility.
        return Doctype(cls._string_for_name_and_ids(name, pub_id, system_id))

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Generate a string to be used as the basis of a Doctype object.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :param name: The name of the document's root element. A falsy
            value is treated as the empty string.
        :param pub_id: The public identifier, or None.
        :param system_id: The system identifier, or None.
        :return: The declaration text, without the surrounding
            `PREFIX` and `SUFFIX`.
        """
        declaration = name or ""
        if pub_id is not None:
            declaration += ' PUBLIC "%s"' % pub_id
            # With a public ID present, the system ID follows it
            # directly, without a keyword of its own.
            if system_id is not None:
                declaration += ' "%s"' % system_id
        elif system_id is not None:
            declaration += ' SYSTEM "%s"' % system_id
        return declaration
class Stylesheet(NavigableString):
    """A `NavigableString` representing the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    Used to distinguish embedded stylesheets from textual content.
    The class adds no behavior of its own; it only serves as a
    distinct type.
    """
class Script(NavigableString):
    """A `NavigableString` representing the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    Used to distinguish executable code from textual content.
    The class adds no behavior of its own; it only serves as a
    distinct type.
    """
class TemplateString(NavigableString):
    """A `NavigableString` representing a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    The class adds no behavior of its own; it only serves as a
    distinct type.
    """
class RubyTextString(NavigableString):
    """A `NavigableString` representing the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    Can be used to distinguish such strings from the strings they're
    annotating. The class adds no behavior of its own; it only serves
    as a distinct type.
    """
class RubyParenthesisString(NavigableString):
    """A `NavigableString` representing the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.

    The class adds no behavior of its own; it only serves as a
    distinct type.
    """
1577class Tag(PageElement):
1578 """An HTML or XML tag that is part of a parse tree, along with its
1579 attributes, contents, and relationships to other parts of the tree.
1581 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
1582 create a `Tag` object representing the ``<b>`` tag. You can
1583 instantiate `Tag` objects directly, but it's not necessary unless
1584 you're adding entirely new markup to a parsed document. Most of
1585 the constructor arguments are intended for use by the `TreeBuilder`
1586 that's parsing a document.
1588 :param parser: A `BeautifulSoup` object representing the parse tree this
1589 `Tag` will be part of.
1590 :param builder: The `TreeBuilder` being used to build the tree.
1591 :param name: The name of the tag.
1592 :param namespace: The URI of this tag's XML namespace, if any.
1593 :param prefix: The prefix for this tag's XML namespace, if any.
1594 :param attrs: A dictionary of attribute values.
1595 :param parent: The `Tag` to use as the parent of this `Tag`. May be
1596 the `BeautifulSoup` object itself.
1597 :param previous: The `PageElement` that was parsed immediately before
1598 parsing this tag.
1599 :param is_xml: If True, this is an XML tag. Otherwise, this is an
1600 HTML tag.
1601 :param sourceline: The line number where this tag was found in its
1602 source document.
1603 :param sourcepos: The character position within ``sourceline`` where this
1604 tag was found.
1605 :param can_be_empty_element: If True, this tag should be
1606 represented as <tag/>. If False, this tag should be represented
1607 as <tag></tag>.
1608 :param cdata_list_attributes: A dictionary of attributes whose values should
1609 be parsed as lists of strings if they ever show up on this tag.
1610 :param preserve_whitespace_tags: Names of tags whose contents
1611 should have their whitespace preserved if they are encountered inside
1612 this tag.
1613 :param interesting_string_types: When iterating over this tag's
1614 string contents in methods like `Tag.strings` or
1615 `PageElement.get_text`, these are the types of strings that are
1616 interesting enough to be considered. By default,
1617 `NavigableString` (normal strings) and `CData` (CDATA
1618 sections) are the only interesting string subtypes.
1619 :param namespaces: A dictionary mapping currently active
1620 namespace prefixes to URIs, as of the point in the parsing process when
1621 this tag was encountered. This can be used later to
1622 construct CSS selectors.
1624 """
def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Initialize a new `Tag`.

    The parameters are described in the class docstring. When a
    ``builder`` is given, it decides the attribute container classes,
    `known_xml`, `can_be_empty_element`, `cdata_list_attributes`,
    `preserve_whitespace_tags` and `interesting_string_types`; the
    corresponding arguments are only used when there is no builder.

    :raise ValueError: If ``name`` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # The position is stored exactly as given. (An earlier conditional
    # on builder.store_line_numbers had identical branches, so it never
    # affected the outcome.)
    self.sourceline = sourceline
    self.sourcepos = sourcepos

    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    elif builder is not None and builder.cdata_list_attributes:
        self.attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs
        )
    else:
        self.attrs = attr_dict_class()
        # Make sure that the values of any multi-valued
        # attributes (e.g. when a Tag is copied) are stored in
        # new lists.
        for k, v in attrs.items():
            if isinstance(v, list):
                v = v.__class__(v)
            self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        # (attribute_value_list_class was already taken from the builder above.)
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES
# Class-level type declarations for the instance attributes that
# __init__ assigns.
parser_class: Optional[type[BeautifulSoup]]
name: str
namespace: Optional[str]
prefix: Optional[str]
attrs: _AttributeValues
sourceline: Optional[int]
sourcepos: Optional[int]
known_xml: Optional[bool]
contents: List[PageElement]
hidden: bool
interesting_string_types: Optional[Set[Type[NavigableString]]]

# These three come from the TreeBuilder when there is one, and from
# the constructor arguments otherwise.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]

# Older camelCase spelling of parser_class.
#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.

    :param memo: The deepcopy memo; passed along to each descendant's
        ``__deepcopy__``.
    :param recursive: If False, only this tag is copied (via
        `copy_self`), without any of its contents.
    """
    clone = self.copy_self()
    if not recursive:
        return clone

    # Clone this tag's descendants recursively, but without
    # making any recursive function calls. The last entry of the
    # stack is the copy that is currently receiving children.
    open_tags: List[Tag] = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # Stop appending incoming elements to the Tag that was
            # just closed.
            open_tags.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        # Add to its parent's .contents
        open_tags[-1].append(copied)
        if event is Tag.START_ELEMENT_EVENT:
            # Add the Tag itself to the stack so that its
            # children will be .appended to it.
            open_tags.append(cast(Tag, copied))
    return clone
def copy_self(self) -> Self:
    """Create a new Tag just like this one, but with no
    contents and unattached to any parse tree.

    This is the first step in the deepcopy process, but you can
    call it on its own to create a copy of a Tag without copying its
    contents.

    :return: A new object of the same class as this one, built with no
        parser and no builder.
    """
    duplicate = type(self)(
        None,  # parser
        None,  # builder
        self.name,
        self.namespace,
        self.prefix,
        self.attrs,
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    # Carry these two over explicitly, whatever the constructor set.
    duplicate.can_be_empty_element = self.can_be_empty_element
    duplicate.hidden = self.hidden
    return duplicate
@property
def is_empty_element(self) -> bool:
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    A tag that has no contents may or may not be an empty-element
    tag. It depends on the `TreeBuilder` used to create the
    tag. If the builder has a designated list of empty-element
    tags, then only a tag whose name shows up in that list is
    considered an empty-element tag. This is usually the case
    for HTML documents.

    If the builder has no designated list of empty-element tags, then
    any tag with no contents is an empty-element tag. This is usually
    the case for XML documents.
    """
    if len(self.contents) != 0:
        return False
    # Only the literal True counts; None means "unknown".
    return self.can_be_empty_element is True
@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    """Older method-style spelling; returns ``self.is_empty_element``.

    :meta private:
    """
    return self.is_empty_element
@property
def string(self) -> Optional[str]:
    """Convenience property to get the single string within this
    `Tag`, assuming there is just one.

    :return: If this `Tag` has a single child that's a
     `NavigableString`, the return value is that string. If this
     element has one child `Tag`, the return value is that child's
     `Tag.string`, recursively. If this `Tag` has no children,
     or has more than one child, the return value is ``None``.

     If this property is unexpectedly returning ``None`` for you,
     it's probably because your `Tag` has more than one thing
     inside it.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    if isinstance(only_child, NavigableString):
        return only_child
    if isinstance(only_child, Tag):
        # Keep descending through single-child tags.
        return only_child.string
    return None

@string.setter
def string(self, string: str) -> None:
    """Replace the `Tag.contents` of this `Tag` with a single string.

    :param string: The new text. A `NavigableString` (or subclass) is
        re-created with its own class; any other value is wrapped in a
        plain `NavigableString`.
    """
    self.clear()
    container = (
        string.__class__ if isinstance(string, NavigableString) else NavigableString
    )
    self.append(container(string))
# String classes that `_all_strings` falls back to when no explicit
# `types` are requested and `interesting_string_types` is None.
#: :meta private:
MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}
def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Generate the strings found beneath this tag, filtered by class.

    :param strip: When True, each string is stripped before being
       yielded, and strings that are empty after stripping are
       skipped.

    :param types: Either one NavigableString subclass or a collection
       of them. Strings whose exact class is not among these are
       ignored. When left at the default, the classes in
       self.interesting_string_types are used; if that is None, only
       NavigableString and CData objects are considered, which
       leaves out comments, processing instructions, etc.
    """
    if types is self.default:
        types = self.interesting_string_types
        if types is None:
            types = self.MAIN_CONTENT_STRING_TYPES

    # `types` is fixed from here on, so decide once how to match it.
    single_class = isinstance(types, type)

    for node in self.descendants:
        if not isinstance(node, NavigableString):
            continue

        node_class = type(node)
        if single_class:
            if node_class is not types:
                # Not the one string class the caller asked for.
                continue
        elif types is not None and node_class not in types:
            # Not one of the string classes the caller asked for.
            continue

        if not strip:
            yield node
            continue

        trimmed = node.strip()
        if len(trimmed) != 0:
            yield trimmed


strings = property(_all_strings)
def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Add one or more PageElements to this `Tag`'s children.

    The call is modelled on :py:meth:`list.insert`, with the
    difference that several elements can be passed at once.

    :param position: The index within `Tag.children` that the first
       new `PageElement` should end up at. Each following element is
       requested one index further along.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    added: List[PageElement] = []
    for offset, child in enumerate(new_children):
        added.extend(self._insert(position + offset, child))
    return added
def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element among this `Tag`'s children.

    Besides placing ``new_child`` in `Tag.contents`, this rewires the
    ``parent``, ``previous_sibling``/``next_sibling`` and
    ``previous_element``/``next_element`` links around it.

    :param position: Requested index in `Tag.contents`; values past
       the end are clamped to the end.
    :param new_child: The element (or plain string) to insert.

    :return: A list of the inserted elements: ``[new_child]``, or the
       inserted children when ``new_child`` is a `BeautifulSoup` object.

    :raise ValueError: If ``new_child`` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    # Plain Python strings are promoted to NavigableString first.
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the BeautifulSoup's children and
        # return them.
        return self.insert(position, *list(new_child.contents))
    # Clamp the requested index to the current number of children.
    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
            elif current_index == position:
                # We're 'inserting' an element into its current location.
                # This is a no-op.
                return [new_child]
        # Detach the element from wherever it currently lives.
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: no previous sibling, and the element that
        # precedes it is this tag itself.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        # Otherwise link to the child just before the insertion point;
        # the preceding element is that child's last descendant.
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(
        is_initialized=False, accept_self=True
    )
    # new_childs_last_element can't be None because we passed
    # accept_self=True into _last_descendant. Worst case,
    # new_childs_last_element will be new_child itself. Making
    # this cast removes several mypy complaints later on as we
    # manipulate new_childs_last_element.
    new_childs_last_element = cast(PageElement, new_childs_last_element)

    if position >= len(self.contents):
        # Appending at the end: there is no next sibling.
        new_child.next_sibling = None

        # Walk up through this tag and its ancestors until one of them
        # has a next sibling; that sibling is what follows the new child.
        parent: Optional[Tag] = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # Inserting before an existing child: that child becomes the
        # next sibling and the element following the new subtree.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    # Close the loop: whatever follows the new subtree must point back at it.
    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = (
            new_childs_last_element
        )
    self.contents.insert(position, new_child)

    return [new_child]
def unwrap(self) -> Self:
    """Remove this element from the tree, leaving its contents in its place.

    :return: This object, no longer part of the tree.

    :raise ValueError: If this element has no parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = container.index(self)
    self.extract(_self_index=slot)
    # Iterate over a copy: inserting a child elsewhere changes
    # self.contents. Going backwards and always inserting at the same
    # slot keeps the children in their original order.
    for child in reversed(list(self.contents)):
        container.insert(slot, child)
    return self


replace_with_children = unwrap
@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    """Deprecated spelling of `unwrap`.

    :meta private:
    """
    unwrapped = self.unwrap()
    return unwrapped
def append(self, tag: _InsertableElement) -> PageElement:
    """Add the given `PageElement` after the last of this `Tag`'s contents.

    :param tag: A PageElement.

    :return: The newly appended PageElement.
    """
    end = len(self.contents)
    appended = self.insert(end, tag)
    return appended[0]
def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
       they will be appended to this tag's contents, one at a time.
       If a single `Tag` is provided, its `Tag.contents` will be
       used to extend this object's `Tag.contents`.

    :return: The list of PageElements that were appended.

    :raise TypeError: If ``tags`` is neither a `Tag`, a single
       `PageElement` or string, nor an iterable.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        # Snapshot the contents: appending moves each child out of `tags`.
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        tag_list = list(tags)
    else:
        # Without this branch `tag_list` would be unbound and the loop
        # below would fail with an opaque UnboundLocalError.
        raise TypeError(
            "Tag.extend expects a Tag or an iterable of elements, not %s."
            % type(tags).__name__
        )

    return [self.append(tag) for tag in tag_list]
def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    :param decompose: When False (the default) each child is removed
       with `PageElement.extract`. When True, the more destructive
       `PageElement.decompose` is called on each child instead.
    """
    removal_method = "decompose" if decompose else "extract"
    # Work on a copy, since removing a child changes self.contents.
    for child in list(self.contents):
        getattr(child, removal_method)()
def smooth(self) -> None:
    """Merge runs of adjacent strings among this `Tag`'s children.

    Child tags are smoothed recursively. Strings that are
    `PreformattedString` objects are never merged.

    After many tree-modifying operations, calling this method can
    make pretty-printed output look more natural.
    """
    # First pass: remember the index of the first member of every
    # adjacent pair that should be merged. This avoids copying
    # self.contents, since usually very few strings are affected.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last child has no right-hand neighbour to merge with.
            continue
        following = self.contents[position + 1]
        both_strings = isinstance(current, NavigableString) and isinstance(
            following, NavigableString
        )
        either_preformatted = isinstance(
            current, PreformattedString
        ) or isinstance(following, PreformattedString)
        if both_strings and not either_preformatted:
            merge_points.append(position)

    # Second pass, back to front, so that removing items from
    # .contents leaves the remaining recorded positions valid.
    for position in reversed(merge_points):
        first = cast(NavigableString, self.contents[position])
        second = cast(NavigableString, self.contents[position + 1])
        second.extract()
        first.replace_with(NavigableString(first + second))
def index(self, element: PageElement) -> int:
    """Return the position of a child of this `Tag`, matched by identity.

    Comparing by identity rather than value avoids trouble when a
    `Tag` holds two children that are equal as strings.

    :param element: Look for this `PageElement` in this object's contents.

    :raise ValueError: If ``element`` is not one of the children.
    """
    found = next(
        (
            position
            for position, candidate in enumerate(self.contents)
            if candidate is element
        ),
        None,
    )
    if found is None:
        raise ValueError("Tag.index: element not in tag")
    return found
def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Look up the 'key' attribute of this tag, with a fallback.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
       on this `Tag`.
    :return: The attribute's value, or ``default`` when the tag has
       no such attribute.
    """
    attributes = self.attrs
    return attributes.get(key, default)
def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Like get(), but the result is always a (possibly empty) list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
       on this `Tag`.
    :return: A list of strings, usually empty or containing only a single
       value.
    """
    value = self.get(key, default)
    if value is None:
        return self.attribute_value_list_class()
    if isinstance(value, list):
        # Already a list: hand it back unchanged.
        return value
    if not isinstance(value, str):
        value = cast(str, value)
    return self.attribute_value_list_class([value])
def has_attr(self, key: str) -> bool:
    """Report whether this `Tag` has an attribute named ``key``."""
    attributes = self.attrs
    return key in attributes
def __hash__(self) -> int:
    """Hash this tag by the hash of its string rendering."""
    rendered = str(self)
    return hash(rendered)
def __getitem__(self, key: str) -> _AttributeValue:
    """Return the value of the 'key' attribute via ``tag[key]``.

    The lookup is done directly on the attribute mapping, so a missing
    attribute raises whatever that mapping raises for a missing key.
    """
    attributes = self.attrs
    return attributes[key]
def __iter__(self) -> Iterator[PageElement]:
    """Iterate over this Tag's direct contents."""
    children = self.contents
    return iter(children)
def __len__(self) -> int:
    """Return how many items are in this Tag's list of contents."""
    children = self.contents
    return len(children)
def __contains__(self, x: Any) -> bool:
    """Report whether ``x`` is among this Tag's direct contents."""
    children = self.contents
    return x in children
def __bool__(self) -> bool:
    """Always truthy: a tag counts as true even when it has no contents.

    Without this, the `__len__` defined on this class would make a tag
    with no contents evaluate as false.
    """
    return True
def __setitem__(self, key: str, value: _AttributeValue) -> None:
    """Set the 'key' attribute of this tag via ``tag[key] = value``."""
    attributes = self.attrs
    attributes[key] = value
def __delitem__(self, key: str) -> None:
    """Remove the 'key' attribute via ``del tag[key]``.

    Deleting an attribute the tag does not have is silently ignored.
    """
    attributes = self.attrs
    attributes.pop(key, None)
def __call__(
    self,
    name: Optional[_StrainableElement] = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Make a Tag callable as a shortcut for its find_all() method.

    For example, tag('a') returns a list of all the A tags found
    within this tag. Every argument is forwarded to find_all()
    unchanged.
    """
    return self.find_all(
        name=name,
        attrs=attrs,
        recursive=recursive,
        string=string,
        limit=limit,
        _stacklevel=_stacklevel,
        **kwargs,
    )
def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Treat ``tag.subtag`` as shorthand for ``tag.find("subtag")``.

    Names longer than three characters that end in "Tag" use the old
    BS3 spelling (``soup.aTag``): a DeprecationWarning is issued and
    the lookup is done without the "Tag" suffix.

    :raise AttributeError: For names that start with a double
       underscore, and for "contents".
    """
    result: _AtMostOneElement
    uses_bs3_suffix = len(subtag) > 3 and subtag.endswith("Tag")
    if uses_bs3_suffix:
        # BS3: soup.aTag -> "soup.find("a")
        tag_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % {"name": tag_name},
            DeprecationWarning,
            stacklevel=2,
        )
        result = self.find(tag_name)
    elif subtag.startswith("__") or subtag == "contents":
        # We special case contents to avoid recursion.
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, subtag)
        )
    else:
        result = self.find(subtag)
    return cast(Optional[Tag], result)
def __eq__(self, other: Any) -> bool:
    """Compare two Tags structurally.

    :return: True iff `other` is this same object, or is a Tag with
       the same name, the same attributes, and the same contents
       (compared recursively).
    """
    if self is other:
        return True
    if not isinstance(other, Tag):
        return False

    has_tag_shape = (
        hasattr(other, "name")
        and hasattr(other, "attrs")
        and hasattr(other, "contents")
    )
    if not has_tag_shape:
        return False
    if self.name != other.name:
        return False
    if self.attrs != other.attrs:
        return False
    if len(self) != len(other):
        return False

    # Both sides have the same number of children; compare them pairwise.
    return not any(
        mine != theirs for mine, theirs in zip(self.contents, other.contents)
    )
def __ne__(self, other: Any) -> bool:
    """Return the negation of ``self == other``."""
    is_equal = self == other
    return not is_equal
def __repr__(self) -> str:
    """Return this `Tag` rendered as markup by `decode` with its defaults."""
    rendered = self.decode()
    return rendered


__str__ = __unicode__ = __repr__
def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    The tag is first rendered to a Unicode string with `decode`, and
    that string is then encoded.

    :param encoding: The encoding to use when converting to
       a bytestring. This may also affect the text of the document,
       specifically any encoding declarations within the document.
    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        :py:meth:`str.encode` and its value should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    return self.decode(indent_level, encoding, formatter).encode(encoding, errors)
def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing. ``None`` disables pretty-printing.
    :param eventual_encoding: The encoding you intend to use when
       converting the string to a bytestring. decode() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
       naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
       parse tree. This is only used by `Tag.decode_contents` and
       you probably won't need to use it.
    :return: The rendered markup.
    """
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # A literal True is treated as indentation level 0.
    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        # Render this event's markup: an opening tag, a closing tag,
        # or a string.
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            # A closing tag is rendered one level shallower than the
            # tag's contents.
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # We are about to enter string literal mode. Add
            # whitespace before this tag, but not after. We
            # will stay in string literal mode until this tag
            # is closed.
            indent_before = True
            indent_after = False
            string_literal_tag = element
        elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # We are about to exit string literal mode by closing
            # the tag that sent us into that mode. Add whitespace
            # after this tag, but not before.
            indent_before = False
            indent_after = True
            string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if indent_before or indent_after:
                # Strings are stripped before indentation is applied;
                # a piece that is empty gets no whitespace at all.
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            # Whatever follows an opening tag is one level deeper.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)
class _TreeTraversalEvent:
    """Marker type for the kinds of event produced while walking
    a parse tree.

    Instances carry no data; they are only distinguished from one
    another by identity.

    :meta private:
    """


# Stand-ins for the different events yielded by _event_stream
START_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Generate (event, element) pairs from which the DOM of this
    element can be rebuilt.

    Walking a flat stream of events lets callers (e.g. the code that
    formats an element as a string) recover the nested structure
    without making recursive method calls.

    The idea resembles the SAX API, but this is a simpler interface
    meant for internal use: the events are different from SAX, and
    the objects attached to them are Tags and other Beautiful Soup
    objects.

    :param iterator: An alternate iterator to use when traversing
       the tree.
    """
    open_tags: List[Tag] = []

    nodes = iterator or self.self_and_descendants

    for node in nodes:
        # Any tag on the stack that is not this node's parent was
        # closed before this node appeared.
        while open_tags and node.parent != open_tags[-1]:
            yield Tag.END_ELEMENT_EVENT, open_tags.pop()

        if not isinstance(node, Tag):
            yield Tag.STRING_ELEMENT_EVENT, node
        elif node.is_empty_element:
            yield Tag.EMPTY_ELEMENT_EVENT, node
        else:
            yield Tag.START_ELEMENT_EVENT, node
            open_tags.append(node)

    # Close whatever is still open once the traversal is over.
    while open_tags:
        yield Tag.END_ELEMENT_EVENT, open_tags.pop()
def _indent_string(
    self,
    s: str,
    indent_level: int,
    formatter: Formatter,
    indent_before: bool,
    indent_after: bool,
) -> str:
    """Surround a string with indentation whitespace as requested.

    :param s: The string to amend with whitespace.
    :param indent_level: The indentation level; the formatter's
        indent string is repeated this many times before the string.
    :param formatter: Supplies the indent string for one level.
    :param indent_before: Whether or not to add whitespace
        before the string.
    :param indent_after: Whether or not to add whitespace
        (a newline) after the string.
    """
    leading = formatter.indent * indent_level if indent_before and indent_level else ""
    trailing = "\n" if indent_after else ""
    return leading + s + trailing
def _format_tag(
    self, eventual_encoding: str, formatter: Formatter, opening: bool
) -> str:
    """Render the opening or closing markup of this tag (not its contents).

    :param eventual_encoding: Encoding substituted into attribute
        values that are `AttributeValueWithCharsetSubstitution`
        objects; no substitution happens when this is None.
    :param formatter: Supplies the attributes to output, their
        escaping and quoting, and the void-element closing prefix.
    :param opening: True to render the opening tag (with attributes),
        False to render the closing tag.
    :return: The markup, or the empty string for a hidden tag.
    """
    if self.hidden:
        # A hidden tag is invisible, although its contents
        # are visible.
        return ""

    # The / that marks a closing tag.
    slash = "" if opening else "/"

    # An optional namespace prefix.
    namespace_prefix = self.prefix + ":" if self.prefix else ""

    # Attributes only appear on an opening tag.
    rendered_attributes = ""
    if opening:
        rendered = []
        for key, val in formatter.attributes(self):
            if val is None:
                # Valueless attribute: just the name.
                rendered.append(key)
                continue
            if isinstance(val, (list, tuple)):
                val = " ".join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.substitute_encoding(eventual_encoding)

            text = formatter.attribute_value(val)
            rendered.append(str(key) + "=" + formatter.quoted_attribute_value(text))
        if rendered:
            rendered_attributes = " " + " ".join(rendered)

    # An optional closing slash (for a void element in an XML document).
    self_closing = ""
    if self.is_empty_element:
        self_closing = formatter.void_element_close_prefix or ""

    return (
        "<"
        + slash
        + namespace_prefix
        + self.name
        + rendered_attributes
        + self_closing
        + ">"
    )
def _should_pretty_print(self, indent_level: int = 1) -> bool:
    """Decide whether this tag may be pretty-printed.

    Most tags may, but a tag whose name is listed in
    ``preserve_whitespace_tags`` (such as <pre> in HTML documents)
    may not. Nothing is pretty-printed when ``indent_level`` is None.
    """
    if indent_level is None:
        return False
    protected_names = self.preserve_whitespace_tags
    if not protected_names:
        return True
    return self.name not in protected_names
def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Render this `Tag` pretty-printed, as a string or a bytestring.

    :param encoding: The encoding of the bytestring, or None if you want Unicode.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string (if no ``encoding`` is provided) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)
def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render what is inside this tag as a Unicode string.

    This calls `decode` with this tag's descendants as the iterator,
    so the tag itself is not part of the output.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The formatter decides what a
       'level' means in terms of spaces or other characters
       output.) Used internally in recursive calls while
       pretty-printing.

    :param eventual_encoding: The tag is destined to be
       encoded into this encoding. decode_contents() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    """
    inner_nodes = self.descendants
    return self.decode(indent_level, eventual_encoding, formatter, iterator=inner_nodes)
def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render what is inside this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    """
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)
@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated method for BS3 compatibility.

    Forwards to `encode_contents`; ``indentLevel`` is only passed
    along when ``prettyPrint`` is true.

    :meta private:
    """
    effective_level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_level, encoding=encoding)
2690 # Soup methods
def find(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Search beneath this PageElement and return the first
    PageElement matching the given criteria, or None.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    """
    # Ask find_all() for at most one result.
    matches = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None
# Old name kept for backwards compatibility; presumably a deprecated
# alias that forwards to `find` (per the helper's name -- confirm
# against `_deprecated_function_alias`).
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")
def find_all(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Search beneath this `PageElement` and return every
    `PageElement` object that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.descendants if recursive else self.children
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
# Old names kept for backwards compatibility; presumably deprecated
# aliases that forward to `find_all` (per the helper's name -- confirm
# against `_deprecated_function_alias`).
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")
2758 # Generator methods
@property
def children(self) -> Iterator[PageElement]:
    """Yield each direct child of this `PageElement`, in order."""
    return (child for child in self.contents)
@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` followed by everything beneath it.

    The descendants come in the order produced by `Tag.descendants`,
    which follows the ``next_element`` links.
    """
    beneath = self.descendants
    return self._self_and(beneath)
@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`.

    Iteration starts at the first child and follows the
    ``next_element`` links, stopping once the element after this
    tag's last descendant is reached. Nothing is yielded for a tag
    without contents.
    """
    if not self.contents:
        return
    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, the result is self.
    final = cast(PageElement, self._last_descendant(accept_self=True))
    boundary = final.next_element
    node: _AtMostOneElement = self.contents[0]
    while not (node is boundary or node is None):
        # Read the successor before yielding, in case the consumer
        # modifies the node that was just handed out.
        upcoming = node.next_element
        yield node
        node = upcoming
2789 # CSS selector code
def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Run a CSS selector against the current element via `self.css`.

    This is a thin wrapper: the arguments are forwarded unchanged to
    the `select_one()` method of the object returned by `self.css`.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)
def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Run a CSS selector against the current element via `self.css`.

    This uses the SoupSieve library. The arguments are forwarded
    unchanged to the `select()` method of the object returned by
    `self.css`.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.
       The default of 0 presumably means "no limit" -- confirm
       against the SoupSieve documentation.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)
@property
def css(self) -> CSS:
    """Return an interface to the CSS selector API for this element.

    A new `CSS` object wrapping this element is constructed on every
    access.
    """
    selector_api = CSS(self)
    return selector_api
2837 # Old names for backwards compatibility
@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns whatever the `children`
    property returns.

    :meta private:
    """
    direct_children = self.children
    return direct_children
@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns whatever the `descendants`
    property returns.

    :meta private:
    """
    everything_below = self.descendants
    return everything_below
@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated method; forwards to `has_attr`.

    The old name was misleading: has_key() asked about attributes,
    while the `in` operator on the same object asks about contents.
    Python 3 removed has_key() from dictionaries anyway.

    :param key: The attribute name handed to `has_attr`.

    :meta private:
    """
    attribute_present = self.has_attr(key)
    return attribute_present
# Type variable for the kind of PageElement stored in a ResultSet.
_PageElementT = TypeVar("_PageElementT", bound=PageElement)


class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects collected as search results,
    i.e. by matching an :py:class:`ElementFilter` against a parse tree.

    Apart from remembering its `source` and giving a friendlier error
    for unknown attributes, it behaves like an ordinary list.
    """

    # The filter the results were matched against, or None.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Fill the list from `result` and remember `source`.

        :param source: The filter that produced the results, if any.
        :param result: The elements to store; empty by default.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        Python calls this only when normal attribute lookup fails, so
        every call ends in an `AttributeError` naming the missing
        attribute.

        :param key: The attribute name that was not found.
        :raises AttributeError: Always.
        """
        message = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(message)
2890# Now that all the classes used by SoupStrainer have been defined,
2891# import SoupStrainer itself into this module to preserve the
2892# backwards compatibility of anyone who imports
2893# bs4.element.SoupStrainer.
2894from bs4.filter import SoupStrainer # noqa: E402