Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 44%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from __future__ import annotations
3# Use of this source code is governed by the MIT license.
4__license__ = "MIT"
6import re
7import warnings
9from bs4.css import CSS
10from bs4._deprecation import (
11 _deprecated,
12 _deprecated_alias,
13 _deprecated_function_alias,
14)
15from bs4.formatter import (
16 Formatter,
17 HTMLFormatter,
18 XMLFormatter,
19)
20from bs4._warnings import AttributeResemblesVariableWarning
22from typing import (
23 Any,
24 Callable,
25 Dict,
26 Generic,
27 Iterable,
28 Iterator,
29 List,
30 Mapping,
31 Optional,
32 Pattern,
33 Set,
34 TYPE_CHECKING,
35 Tuple,
36 Type,
37 TypeVar,
38 Union,
39 cast,
40 overload,
41)
42from typing_extensions import (
43 Self,
44 TypeAlias,
45)
47if TYPE_CHECKING:
48 from bs4 import BeautifulSoup
49 from bs4.builder import TreeBuilder
50 from bs4.filter import ElementFilter
51 from bs4.formatter import (
52 _EntitySubstitutionFunction,
53 _FormatterOrName,
54 )
55 from bs4._typing import (
56 _AtMostOneElement,
57 _AttributeValue,
58 _AttributeValues,
59 _Encoding,
60 _InsertableElement,
61 _OneElement,
62 _QueryResults,
63 _RawOrProcessedAttributeValues,
64 _StrainableElement,
65 _StrainableAttribute,
66 _StrainableAttributes,
67 _StrainableString,
68 )
70_OneOrMoreStringTypes: TypeAlias = Union[
71 Type["NavigableString"], Iterable[Type["NavigableString"]]
72]
74_FindMethodName: TypeAlias = Optional[Union["_StrainableElement", "ElementFilter"]]
# Deprecated module-level attributes. Each key is a public name that is
# no longer defined directly on the module; each value is the template
# for the warning issued when that name is looked up through the
# module-level ``__getattr__`` hook below.
# See https://peps.python.org/pep-0562/
_deprecated_names = {
    "whitespace_re": "The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy.",
}
#: Value handed back for the deprecated ``whitespace_re`` name.
#: :meta private:
_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+")


def __getattr__(name: str) -> Any:
    """Resolve a deprecated module attribute, warning the caller.

    :param name: The attribute name that was not found on the module.
    :return: The module global called ``_deprecated_<name>``.
    :raises AttributeError: If ``name`` is not a known deprecated name.
    """
    template = _deprecated_names.get(name)
    if template is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    # stacklevel=2 attributes the warning to the code that accessed the
    # name rather than to this hook.
    warnings.warn(template.format(name=name), DeprecationWarning, stacklevel=2)
    return globals()[f"_deprecated_{name}"]
#: The encoding used for documents output by Beautiful Soup when the
#: caller does not ask for a different one.
DEFAULT_OUTPUT_ENCODING: str = "utf-8"

#: Matches a run of non-whitespace characters, so ``findall`` with this
#: pattern yields the whitespace-separated pieces of a string.
nonwhitespace_re: Pattern[str] = re.compile(r"\S+")

#: Encodings that Python recognizes (so `Tag.encode` could
#: theoretically support them) but that XML and HTML do not (so they
#: should never be named inside an XML or HTML document as that
#: document's encoding).
#:
#: If an XML document is encoded in one of these encodings, no encoding
#: will be mentioned in the XML declaration. If an HTML document is
#: encoded in one of these encodings, and the HTML document has a
#: <meta> tag that mentions an encoding, the encoding will be given as
#: the empty string.
#:
#: Source:
#: Python documentation, `Python Specific Encodings <https://docs.python.org/3/library/codecs.html#python-specific-encodings>`_
PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
class NamespacedAttribute(str):
    """A namespaced attribute (e.g. the 'xml:lang' in 'xml:lang="en"')
    which remembers the namespace prefix ('xml') and the name ('lang')
    that were used to create it.

    The string value is ``prefix:name`` when both parts are present,
    just the name when there is no prefix, and just the prefix when
    there is no name.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        """Build the attribute string and record its components.

        :param prefix: The namespace prefix, if any.
        :param name: The local name; any falsy value is stored as None.
        :param namespace: The namespace the prefix refers to, if any.
        :return: A new string carrying ``prefix``, ``name`` and
            ``namespace`` attributes.
        """
        if not name:
            # This is the default namespace. Its name "has no value"
            # per https://www.w3.org/TR/xml-names/#defaulting
            name = None

        if name is None:
            # Only the prefix is available. A missing prefix must become
            # the empty string: str.__new__(cls, None) would otherwise
            # yield the literal text "None".
            obj = str.__new__(cls, "" if prefix is None else prefix)
        elif not prefix:
            # Not really namespaced.
            obj = str.__new__(cls, name)
        else:
            obj = str.__new__(cls, prefix + ":" + name)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj
class AttributeValueWithCharsetSubstitution(str):
    """Abstract base for a string that stands in for a character
    encoding named inside an HTML ``<meta>`` tag.

    There is one subclass for each place such an encoding can appear:
    the ``charset`` attribute (`CharsetMetaAttributeValue`) and the
    ``content`` attribute (`ContentMetaAttributeValue`).

    This allows Beautiful Soup to replace that part of the HTML file
    with a different encoding when outputting a tree as a string.
    """

    # The original, un-encoded value of the attribute.
    #: :meta private:
    original_value: str

    def substitute_encoding(self, eventual_encoding: str) -> str:
        """Return this value rewritten to mention ``eventual_encoding``.

        How the rewrite is done depends on which part of the document
        the value came from, so every subclass must provide its own
        implementation.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a ``<meta>`` tag's ``charset``
    attribute.

    When Beautiful Soup parses the markup ``<meta charset="utf8">``, the
    value of the ``charset`` attribute will become one of these objects.

    If the document is later encoded to an encoding other than UTF-8, its
    ``<meta>`` tag will mention the new encoding instead of ``utf8``.
    """

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from.

        :param original_value: The attribute value found in the markup.
        """
        instance = str.__new__(cls, original_value)
        # Nothing in this class reads the original value back, but it
        # is kept in case it is useful to the user.
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the text the ``charset`` attribute should carry when
        the document is encoded as ``eventual_encoding``.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :return: ``eventual_encoding`` itself, or the empty string when
            it is one of `PYTHON_SPECIFIC_ENCODINGS`.
        """
        return "" if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS else eventual_encoding
class AttributeValueList(List[str]):
    """The list type used to hold the values of an attribute that can
    have several values at once (such as HTML's 'class').

    It adds nothing to a regular list. It exists so that you can
    subclass it and pass your subclass in to the TreeBuilder
    constructor as attribute_value_list_class, to have your subclass
    instantiated instead.
    """
class AttributeDict(Dict[Any, Any]):
    """Base class for the dictionaries that hold a tag's attributes.

    It can be used directly, but it behaves exactly like a regular
    dict: it defines no special logic of its own.
    """
class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the XML spec.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply with
        the XML spec.

        This just means converting common non-string values to
        strings: XML attributes may have "any literal string as a
        value."

        :param key: The attribute name.
        :param value: The new value. None is stored as the empty
            string, and int/float values are stored as their ``str()``
            form. Everything else, booleans included, is stored
            unchanged.
        """
        if value is None:
            value = ""
        elif isinstance(value, (int, float)) and not isinstance(value, bool):
            # bool is excluded on purpose: XML defines no rules for
            # boolean attributes, so the old Beautiful Soup behavior (a
            # bool that gets converted to a string on output) is
            # preserved rather than guessing what the value should be.
            #
            # Only numbers are converted here. Converting _every_ value
            # to a plain string would be dangerous, because a value may
            # be a more sophisticated string-like object
            # (e.g. CharsetMetaAttributeValue).
            value = str(value)

        super().__setitem__(key, value)
class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply
        with the HTML spec.

        :param key: The attribute name.
        :param value: The new value. ``False`` and ``None`` remove the
            attribute; ``True`` stores the attribute's own name;
            int/float values (zero included) are stored as their
            ``str()`` form; anything else is stored unchanged.
        """
        # Identity checks are required here. ``0 == False`` in Python,
        # so an equality or membership test would treat the numbers 0
        # and 0.0 as "false" and delete the attribute instead of storing
        # "0" / "0.0".
        if value is False or value is None:
            # 'The values "true" and "false" are not allowed on
            # boolean attributes. To represent a false value, the
            # attribute has to be omitted altogether.'
            if key in self:
                del self[key]
            return
        if isinstance(value, bool):
            # Only True can reach this point.
            #
            # 'If the [boolean] attribute is present, its value must
            # either be the empty string or a value that is an ASCII
            # case-insensitive match for the attribute's canonical
            # name, with no leading or trailing whitespace.'
            #
            # [fixme] It's not clear to me whether "canonical name"
            # means fully-qualified name, unqualified name, or
            # (probably not) name with namespace prefix. For now I'm
            # going with unqualified name.
            if isinstance(key, NamespacedAttribute):
                value = key.name
            else:
                value = key
        elif isinstance(value, (int, float)):
            # See note in XMLAttributeDict for the reasoning why we
            # only do this to numbers.
            value = str(value)
        super().__setitem__(key, value)
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a ``<meta>`` tag's ``content``
    attribute.

    When Beautiful Soup parses the markup:
     ``<meta http-equiv="content-type" content="text/html; charset=utf8">``

    The value of the ``content`` attribute will become one of these objects.

    If the document is later encoded to an encoding other than UTF-8, its
    ``<meta>`` tag will mention the new encoding instead of ``utf8``.
    """

    #: Match the 'charset' argument inside the 'content' attribute
    #: of a <meta> tag. Group 1 is everything up to and including
    #: ``charset=``; group 3 is the encoding name that follows it.
    #: :meta private:
    CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from.

        :param original_value: The attribute value found in the markup.
        """
        # The match result is discarded. The call is kept so that the
        # constructor still reacts the same way to input the pattern
        # cannot be applied to.
        cls.CHARSET_RE.search(original_value)
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the ``content`` value rewritten for a document that is
        being encoded as ``eventual_encoding``.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :return: The original value with the text after ``charset=``
            replaced by ``eventual_encoding``. If the encoding is one of
            `PYTHON_SPECIFIC_ENCODINGS`, the whole ``charset=...``
            portion is removed instead.
        """
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            return self.CHARSET_RE.sub("", self.original_value)

        # A callable replacement is used so that the encoding name is
        # inserted literally, never interpreted as a regex template.
        return self.CHARSET_RE.sub(
            lambda match: match.group(1) + eventual_encoding,
            self.original_value,
        )
347class PageElement(object):
348 """An abstract class representing a single element in the parse tree.
350 `NavigableString`, `Tag`, etc. are all subclasses of
351 `PageElement`. For this reason you'll see a lot of methods that
352 return `PageElement`, but you'll never see an actual `PageElement`
353 object. For the most part you can think of `PageElement` as
354 meaning "a `Tag` or a `NavigableString`."
355 """
357 #: In general, we can't tell just by looking at an element whether
358 #: it's contained in an XML document or an HTML document. But for
359 #: `Tag` objects (q.v.) we can store this information at parse time.
360 #: :meta private:
361 known_xml: Optional[bool] = None
363 #: Whether or not this element has been decomposed from the tree
364 #: it was created in.
365 _decomposed: bool
367 parent: Optional[Tag]
368 next_element: _AtMostOneElement
369 previous_element: _AtMostOneElement
370 next_sibling: _AtMostOneElement
371 previous_sibling: _AtMostOneElement
373 #: Whether or not this element is hidden from generated output.
374 #: Only the `BeautifulSoup` object itself is hidden.
375 hidden: bool = False
377 def setup(
378 self,
379 parent: Optional[Tag] = None,
380 previous_element: _AtMostOneElement = None,
381 next_element: _AtMostOneElement = None,
382 previous_sibling: _AtMostOneElement = None,
383 next_sibling: _AtMostOneElement = None,
384 ) -> None:
385 """Sets up the initial relations between this element and
386 other elements.
388 :param parent: The parent of this element.
390 :param previous_element: The element parsed immediately before
391 this one.
393 :param next_element: The element parsed immediately after
394 this one.
396 :param previous_sibling: The most recently encountered element
397 on the same level of the parse tree as this one.
399 :param previous_sibling: The next element to be encountered
400 on the same level of the parse tree as this one.
401 """
402 self.parent = parent
404 self.previous_element = previous_element
405 if self.previous_element is not None:
406 self.previous_element.next_element = self
408 self.next_element = next_element
409 if self.next_element is not None:
410 self.next_element.previous_element = self
412 self.next_sibling = next_sibling
413 if self.next_sibling is not None:
414 self.next_sibling.previous_sibling = self
416 if (
417 previous_sibling is None
418 and self.parent is not None
419 and self.parent.contents
420 ):
421 previous_sibling = self.parent.contents[-1]
423 self.previous_sibling = previous_sibling
424 if self.previous_sibling is not None:
425 self.previous_sibling.next_sibling = self
427 def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str:
428 """Format the given string using the given formatter.
430 :param s: A string.
431 :param formatter: A Formatter object, or a string naming one of the standard formatters.
432 """
433 if formatter is None:
434 return s
435 if not isinstance(formatter, Formatter):
436 formatter = self.formatter_for_name(formatter)
437 output = formatter.substitute(s)
438 return output
440 def formatter_for_name(
441 self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction]
442 ) -> Formatter:
443 """Look up or create a Formatter for the given identifier,
444 if necessary.
446 :param formatter: Can be a `Formatter` object (used as-is), a
447 function (used as the entity substitution hook for an
448 `bs4.formatter.XMLFormatter` or
449 `bs4.formatter.HTMLFormatter`), or a string (used to look
450 up an `bs4.formatter.XMLFormatter` or
451 `bs4.formatter.HTMLFormatter` in the appropriate registry.
453 """
454 if isinstance(formatter_name, Formatter):
455 return formatter_name
456 c: type[Formatter]
457 registry: Mapping[Optional[str], Formatter]
458 if self._is_xml:
459 c = XMLFormatter
460 registry = XMLFormatter.REGISTRY
461 else:
462 c = HTMLFormatter
463 registry = HTMLFormatter.REGISTRY
464 if callable(formatter_name):
465 return c(entity_substitution=formatter_name)
466 return registry[formatter_name]
468 @property
469 def _is_xml(self) -> bool:
470 """Is this element part of an XML tree or an HTML tree?
472 This is used in formatter_for_name, when deciding whether an
473 XMLFormatter or HTMLFormatter is more appropriate. It can be
474 inefficient, but it should be called very rarely.
475 """
476 if self.known_xml is not None:
477 # Most of the time we will have determined this when the
478 # document is parsed.
479 return self.known_xml
481 # Otherwise, it's likely that this element was created by
482 # direct invocation of the constructor from within the user's
483 # Python code.
484 if self.parent is None:
485 # This is the top-level object. It should have .known_xml set
486 # from tree creation. If not, take a guess--BS is usually
487 # used on HTML markup.
488 return getattr(self, "is_xml", False)
489 return self.parent._is_xml
491 nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0")
492 previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0")
494 def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
495 raise NotImplementedError()
497 def __copy__(self) -> Self:
498 """A copy of a PageElement can only be a deep copy, because
499 only one PageElement can occupy a given place in a parse tree.
500 """
501 return self.__deepcopy__({})
503 default: Iterable[type[NavigableString]] = tuple() #: :meta private:
505 def _all_strings(
506 self, strip: bool = False, types: Iterable[type[NavigableString]] = default
507 ) -> Iterator[str]:
508 """Yield all strings of certain classes, possibly stripping them.
510 This is implemented differently in `Tag` and `NavigableString`.
511 """
512 raise NotImplementedError()
514 @property
515 def stripped_strings(self) -> Iterator[str]:
516 """Yield all interesting strings in this PageElement, stripping them
517 first.
519 See `Tag` for information on which strings are considered
520 interesting in a given context.
521 """
522 for string in self._all_strings(True):
523 yield string
525 def get_text(
526 self,
527 separator: str = "",
528 strip: bool = False,
529 types: Iterable[Type[NavigableString]] = default,
530 ) -> str:
531 """Get all child strings of this PageElement, concatenated using the
532 given separator.
534 :param separator: Strings will be concatenated using this separator.
536 :param strip: If True, strings will be stripped before being
537 concatenated.
539 :param types: A tuple of NavigableString subclasses. Any
540 strings of a subclass not found in this list will be
541 ignored. Although there are exceptions, the default
542 behavior in most cases is to consider only NavigableString
543 and CData objects. That means no comments, processing
544 instructions, etc.
546 :return: A string.
547 """
548 return separator.join([s for s in self._all_strings(strip, types=types)])
550 getText = get_text
551 text = property(get_text)
553 def replace_with(self, *args: _InsertableElement) -> Self:
554 """Replace this `PageElement` with one or more other elements,
555 objects, keeping the rest of the tree the same.
557 :return: This `PageElement`, no longer part of the tree.
558 """
559 if self.parent is None:
560 raise ValueError(
561 "Cannot replace one element with another when the "
562 "element to be replaced is not part of a tree."
563 )
564 if len(args) == 1 and args[0] is self:
565 # Replacing an element with itself is a no-op.
566 return self
567 if any(x is self.parent for x in args):
568 raise ValueError("Cannot replace a Tag with its parent.")
569 old_parent = self.parent
570 my_index = self.parent.index(self)
571 self.extract(_self_index=my_index)
572 for idx, replace_with in enumerate(args, start=my_index):
573 old_parent.insert(idx, replace_with)
574 return self
576 replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0")
578 def wrap(self, wrap_inside: Tag) -> Tag:
579 """Wrap this `PageElement` inside a `Tag`.
581 :return: ``wrap_inside``, occupying the position in the tree that used
582 to be occupied by this object, and with this object now inside it.
583 """
584 me = self.replace_with(wrap_inside)
585 wrap_inside.append(me)
586 return wrap_inside
588 def extract(self, _self_index: Optional[int] = None) -> Self:
589 """Destructively rips this element out of the tree.
591 :param _self_index: The location of this element in its parent's
592 .contents, if known. Passing this in allows for a performance
593 optimization.
595 :return: this `PageElement`, no longer part of the tree.
596 """
597 if self.parent is not None:
598 if _self_index is None:
599 _self_index = self.parent.index(self)
600 del self.parent.contents[_self_index]
602 # Find the two elements that would be next to each other if
603 # this element (and any children) hadn't been parsed. Connect
604 # the two.
605 last_child = self._last_descendant()
607 # last_child can't be None because we passed accept_self=True
608 # into _last_descendant. Worst case, last_child will be
609 # self. Making this cast removes several mypy complaints later
610 # on as we manipulate last_child.
611 last_child = cast(PageElement, last_child)
612 next_element = last_child.next_element
614 if self.previous_element is not None:
615 if self.previous_element is not next_element:
616 self.previous_element.next_element = next_element
617 if next_element is not None and next_element is not self.previous_element:
618 next_element.previous_element = self.previous_element
619 self.previous_element = None
620 last_child.next_element = None
622 self.parent = None
623 if (
624 self.previous_sibling is not None
625 and self.previous_sibling is not self.next_sibling
626 ):
627 self.previous_sibling.next_sibling = self.next_sibling
628 if (
629 self.next_sibling is not None
630 and self.next_sibling is not self.previous_sibling
631 ):
632 self.next_sibling.previous_sibling = self.previous_sibling
633 self.previous_sibling = self.next_sibling = None
634 return self
636 def decompose(self) -> None:
637 """Recursively destroys this `PageElement` and its children.
639 The element will be removed from the tree and wiped out; so
640 will everything beneath it.
642 The behavior of a decomposed `PageElement` is undefined and you
643 should never use one for anything, but if you need to *check*
644 whether an element has been decomposed, you can use the
645 `PageElement.decomposed` property.
646 """
647 self.extract()
648 e: _AtMostOneElement = self
649 next_up: _AtMostOneElement = None
650 while e is not None:
651 next_up = e.next_element
652 e.__dict__.clear()
653 if isinstance(e, Tag):
654 e.contents = []
655 e._decomposed = True
656 e = next_up
658 def _last_descendant(
659 self, is_initialized: bool = True, accept_self: bool = True
660 ) -> _AtMostOneElement:
661 """Finds the last element beneath this object to be parsed.
663 Special note to help you figure things out if your type
664 checking is tripped up by the fact that this method returns
665 _AtMostOneElement instead of PageElement: the only time
666 this method returns None is if `accept_self` is False and the
667 `PageElement` has no children--either it's a NavigableString
668 or an empty Tag.
670 :param is_initialized: Has `PageElement.setup` been called on
671 this `PageElement` yet?
673 :param accept_self: Is ``self`` an acceptable answer to the
674 question?
675 """
676 if is_initialized and self.next_sibling is not None:
677 last_child = self.next_sibling.previous_element
678 else:
679 last_child = self
680 while isinstance(last_child, Tag) and last_child.contents:
681 last_child = last_child.contents[-1]
682 if not accept_self and last_child is self:
683 last_child = None
684 return last_child
686 _lastRecursiveChild = _deprecated_alias(
687 "_lastRecursiveChild", "_last_descendant", "4.0.0"
688 )
690 def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
691 """Makes the given element(s) the immediate predecessor of this one.
693 All the elements will have the same `PageElement.parent` as
694 this one, and the given elements will occur immediately before
695 this one.
697 :param args: One or more PageElements.
699 :return The list of PageElements that were inserted.
700 """
701 parent = self.parent
702 if parent is None:
703 raise ValueError("Element has no parent, so 'before' has no meaning.")
704 if any(x is self for x in args):
705 raise ValueError("Can't insert an element before itself.")
706 results: List[PageElement] = []
707 for predecessor in args:
708 # Extract first so that the index won't be screwed up if they
709 # are siblings.
710 if isinstance(predecessor, PageElement):
711 predecessor.extract()
712 index = parent.index(self)
713 results.extend(parent.insert(index, predecessor))
715 return results
717 def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
718 """Makes the given element(s) the immediate successor of this one.
720 The elements will have the same `PageElement.parent` as this
721 one, and the given elements will occur immediately after this
722 one.
724 :param args: One or more PageElements.
726 :return The list of PageElements that were inserted.
727 """
728 # Do all error checking before modifying the tree.
729 parent = self.parent
730 if parent is None:
731 raise ValueError("Element has no parent, so 'after' has no meaning.")
732 if any(x is self for x in args):
733 raise ValueError("Can't insert an element after itself.")
735 offset = 0
736 results: List[PageElement] = []
737 for successor in args:
738 # Extract first so that the index won't be screwed up if they
739 # are siblings.
740 if isinstance(successor, PageElement):
741 successor.extract()
742 index = parent.index(self)
743 results.extend(parent.insert(index + 1 + offset, successor))
744 offset += 1
746 return results
748 def find_next(
749 self,
750 name: _FindMethodName = None,
751 attrs: _StrainableAttributes = {},
752 string: Optional[_StrainableString] = None,
753 **kwargs: _StrainableAttribute,
754 ) -> _AtMostOneElement:
755 """Find the first PageElement that matches the given criteria and
756 appears later in the document than this PageElement.
758 All find_* methods take a common set of arguments. See the online
759 documentation for detailed explanations.
761 :param name: A filter on tag name.
762 :param attrs: Additional filters on attribute values.
763 :param string: A filter for a NavigableString with specific text.
764 :kwargs: Additional filters on attribute values.
765 """
766 return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
768 findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0")
770 def find_all_next(
771 self,
772 name: _FindMethodName = None,
773 attrs: _StrainableAttributes = {},
774 string: Optional[_StrainableString] = None,
775 limit: Optional[int] = None,
776 _stacklevel: int = 2,
777 **kwargs: _StrainableAttribute,
778 ) -> _QueryResults:
779 """Find all `PageElement` objects that match the given criteria and
780 appear later in the document than this `PageElement`.
782 All find_* methods take a common set of arguments. See the online
783 documentation for detailed explanations.
785 :param name: A filter on tag name.
786 :param attrs: Additional filters on attribute values.
787 :param string: A filter for a NavigableString with specific text.
788 :param limit: Stop looking after finding this many results.
789 :param _stacklevel: Used internally to improve warning messages.
790 :kwargs: Additional filters on attribute values.
791 """
792 return self._find_all(
793 name,
794 attrs,
795 string,
796 limit,
797 self.next_elements,
798 _stacklevel=_stacklevel + 1,
799 **kwargs,
800 )
802 findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0")
804 def find_next_sibling(
805 self,
806 name: _FindMethodName = None,
807 attrs: _StrainableAttributes = {},
808 string: Optional[_StrainableString] = None,
809 **kwargs: _StrainableAttribute,
810 ) -> _AtMostOneElement:
811 """Find the closest sibling to this PageElement that matches the
812 given criteria and appears later in the document.
814 All find_* methods take a common set of arguments. See the
815 online documentation for detailed explanations.
817 :param name: A filter on tag name.
818 :param attrs: Additional filters on attribute values.
819 :param string: A filter for a `NavigableString` with specific text.
820 :kwargs: Additional filters on attribute values.
821 """
822 return self._find_one(self.find_next_siblings, name, attrs, string, **kwargs)
824 findNextSibling = _deprecated_function_alias(
825 "findNextSibling", "find_next_sibling", "4.0.0"
826 )
828 def find_next_siblings(
829 self,
830 name: _FindMethodName = None,
831 attrs: _StrainableAttributes = {},
832 string: Optional[_StrainableString] = None,
833 limit: Optional[int] = None,
834 _stacklevel: int = 2,
835 **kwargs: _StrainableAttribute,
836 ) -> _QueryResults:
837 """Find all siblings of this `PageElement` that match the given criteria
838 and appear later in the document.
840 All find_* methods take a common set of arguments. See the online
841 documentation for detailed explanations.
843 :param name: A filter on tag name.
844 :param attrs: Additional filters on attribute values.
845 :param string: A filter for a `NavigableString` with specific text.
846 :param limit: Stop looking after finding this many results.
847 :param _stacklevel: Used internally to improve warning messages.
848 :kwargs: Additional filters on attribute values.
849 """
850 return self._find_all(
851 name,
852 attrs,
853 string,
854 limit,
855 self.next_siblings,
856 _stacklevel=_stacklevel + 1,
857 **kwargs,
858 )
860 findNextSiblings = _deprecated_function_alias(
861 "findNextSiblings", "find_next_siblings", "4.0.0"
862 )
863 fetchNextSiblings = _deprecated_function_alias(
864 "fetchNextSiblings", "find_next_siblings", "3.0.0"
865 )
867 def find_previous(
868 self,
869 name: _FindMethodName = None,
870 attrs: _StrainableAttributes = {},
871 string: Optional[_StrainableString] = None,
872 **kwargs: _StrainableAttribute,
873 ) -> _AtMostOneElement:
874 """Look backwards in the document from this `PageElement` and find the
875 first `PageElement` that matches the given criteria.
877 All find_* methods take a common set of arguments. See the online
878 documentation for detailed explanations.
880 :param name: A filter on tag name.
881 :param attrs: Additional filters on attribute values.
882 :param string: A filter for a `NavigableString` with specific text.
883 :kwargs: Additional filters on attribute values.
884 """
885 return self._find_one(self.find_all_previous, name, attrs, string, **kwargs)
887 findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0")
889 def find_all_previous(
890 self,
891 name: _FindMethodName = None,
892 attrs: _StrainableAttributes = {},
893 string: Optional[_StrainableString] = None,
894 limit: Optional[int] = None,
895 _stacklevel: int = 2,
896 **kwargs: _StrainableAttribute,
897 ) -> _QueryResults:
898 """Look backwards in the document from this `PageElement` and find all
899 `PageElement` that match the given criteria.
901 All find_* methods take a common set of arguments. See the online
902 documentation for detailed explanations.
904 :param name: A filter on tag name.
905 :param attrs: Additional filters on attribute values.
906 :param string: A filter for a `NavigableString` with specific text.
907 :param limit: Stop looking after finding this many results.
908 :param _stacklevel: Used internally to improve warning messages.
909 :kwargs: Additional filters on attribute values.
910 """
911 return self._find_all(
912 name,
913 attrs,
914 string,
915 limit,
916 self.previous_elements,
917 _stacklevel=_stacklevel + 1,
918 **kwargs,
919 )
921 findAllPrevious = _deprecated_function_alias(
922 "findAllPrevious", "find_all_previous", "4.0.0"
923 )
924 fetchAllPrevious = _deprecated_function_alias(
925 "fetchAllPrevious", "find_all_previous", "3.0.0"
926 )
928 def find_previous_sibling(
929 self,
930 name: _FindMethodName = None,
931 attrs: _StrainableAttributes = {},
932 string: Optional[_StrainableString] = None,
933 **kwargs: _StrainableAttribute,
934 ) -> _AtMostOneElement:
935 """Returns the closest sibling to this `PageElement` that matches the
936 given criteria and appears earlier in the document.
938 All find_* methods take a common set of arguments. See the online
939 documentation for detailed explanations.
941 :param name: A filter on tag name.
942 :param attrs: Additional filters on attribute values.
943 :param string: A filter for a `NavigableString` with specific text.
944 :kwargs: Additional filters on attribute values.
945 """
946 return self._find_one(
947 self.find_previous_siblings, name, attrs, string, **kwargs
948 )
# Legacy camelCase spelling, registered through _deprecated_function_alias
# with "find_previous_sibling" as its target and "4.0.0" as the version.
findPreviousSibling = _deprecated_function_alias(
    "findPreviousSibling", "find_previous_sibling", "4.0.0"
)
def find_previous_siblings(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Return every sibling of this `PageElement` that matches the
    given criteria and appears earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    earlier_siblings = self.previous_siblings
    # One is added to _stacklevel to account for this method's own frame.
    return self._find_all(
        name,
        attrs,
        string,
        limit,
        earlier_siblings,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
# Legacy spellings registered through _deprecated_function_alias, each
# naming "find_previous_siblings" as its target. The last argument is a
# version string, presumably the release in which the spelling was deprecated.
findPreviousSiblings = _deprecated_function_alias(
    "findPreviousSiblings", "find_previous_siblings", "4.0.0"
)
fetchPreviousSiblings = _deprecated_function_alias(
    "fetchPreviousSiblings", "find_previous_siblings", "3.0.0"
)
def find_parent(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Find the closest parent of this `PageElement` that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.
    :return: The first result of `find_parents`, or None if there
       were no results.
    """
    # NOTE: _find_one can't be used here because find_parents takes a
    # different set of arguments (it has no `string` parameter).
    matches = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None
# Legacy camelCase spelling, registered through _deprecated_function_alias
# with "find_parent" as its target and "4.0.0" as the version.
findParent = _deprecated_function_alias("findParent", "find_parent", "4.0.0")
def find_parents(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Find every parent of this `PageElement` that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # There is no `string` filter here, so None is passed in its place.
    # One is added to _stacklevel to account for this method's own frame.
    return self._find_all(
        name,
        attrs,
        None,
        limit,
        self.parents,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
# Legacy spellings registered through _deprecated_function_alias, each
# naming "find_parents" as its target. The last argument is a version
# string, presumably the release in which the spelling was deprecated.
findParents = _deprecated_function_alias("findParents", "find_parents", "4.0.0")
fetchParents = _deprecated_function_alias("fetchParents", "find_parents", "3.0.0")
@property
def next(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just after this one.

    This is a read-only view of `next_element`.
    """
    following = self.next_element
    return following
@property
def previous(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just before this one.

    This is a read-only view of `previous_element`.
    """
    preceding = self.previous_element
    return preceding
1060 # These methods do the real heavy lifting.
1062 def _find_one(
1063 self,
1064 # TODO-TYPING: "There is no syntax to indicate optional or
1065 # keyword arguments; such function types are rarely used
1066 # as callback types." - So, not sure how to get more
1067 # specific here.
1068 method: Callable,
1069 name: _FindMethodName,
1070 attrs: _StrainableAttributes,
1071 string: Optional[_StrainableString],
1072 **kwargs: _StrainableAttribute,
1073 ) -> _AtMostOneElement:
1074 r: _AtMostOneElement = None
1075 results: _QueryResults = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
1076 if results:
1077 r = results[0]
1078 return r
def _find_all(
    self,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Iterate over a generator, collecting the things that match.

    :param name: A filter on tag name, or an `ElementFilter` to use
       as the matcher directly.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The elements to be searched.
    :param _stacklevel: The ``stacklevel`` passed to `warnings.warn` for
       any warning issued here.
    :kwargs: Additional filters on attribute values.
    """
    # The old 'text' keyword is honored only when 'string' was not given.
    if string is None and "text" in kwargs:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    # '_class' looks like a mistyped 'class_'; warn about it.
    if "_class" in kwargs:
        warnings.warn(
            AttributeResemblesVariableWarning.MESSAGE
            % {"original": "_class", "autocorrect": "class_"},
            AttributeResemblesVariableWarning,
            stacklevel=_stacklevel,
        )

    from bs4.filter import ElementFilter

    if isinstance(name, ElementFilter):
        matcher = name
    else:
        matcher = SoupStrainer(name, attrs, string, **kwargs)

    # When the tag name is the only criterion, two shortcuts avoid
    # consulting the matcher for every element.
    name_is_only_criterion = (
        string is None and not limit and not attrs and not kwargs
    )
    if name_is_only_criterion:
        if name is True or name is None:
            # Optimization to find all tags.
            every_tag = (
                element for element in generator if isinstance(element, Tag)
            )
            return ResultSet(matcher, every_tag)
        if isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(":") == 1:
                # This is a name with a prefix. If this is a namespace-aware
                # document, we need to match the local name against tag.name.
                # If not, we need to match the fully-qualified name against
                # tag.name.
                prefix, local_name = name.split(":", 1)
            else:
                prefix, local_name = None, name
            named_tags = [
                element
                for element in generator
                if isinstance(element, Tag)
                and (
                    element.name == name
                    or (
                        element.name == local_name
                        and (prefix is None or element.prefix == prefix)
                    )
                )
            ]
            return ResultSet(matcher, named_tags)

    # General case: let the matcher walk the generator.
    return matcher.find_all(generator, limit)
1146 # These generators can be used to navigate starting from both
1147 # NavigableStrings and Tags.
@property
def next_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed after this one.

    :yield: A sequence of PageElements, following `next_element` links
       until one is None.
    """
    current = self.next_element
    while current is not None:
        # Read the link before yielding, so that nothing done to
        # `current` while this generator is suspended redirects the walk.
        upcoming = current.next_element
        yield current
        current = upcoming
@property
def self_and_next_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all PageElements that were parsed after it."""
    later_elements = self.next_elements
    return self._self_and(later_elements)
@property
def next_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    later.

    :yield: A sequence of PageElements, following `next_sibling` links
       until one is None.
    """
    current = self.next_sibling
    while current is not None:
        # Read the link before yielding, so that nothing done to
        # `current` while this generator is suspended redirects the walk.
        upcoming = current.next_sibling
        yield current
        current = upcoming
@property
def self_and_next_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its siblings that come after it
    (the contents of `next_siblings`)."""
    later_siblings = self.next_siblings
    return self._self_and(later_siblings)
@property
def previous_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed before this one.

    :yield: A sequence of PageElements, following `previous_element`
       links until one is None.
    """
    current = self.previous_element
    while current is not None:
        # Read the link before yielding, so that nothing done to
        # `current` while this generator is suspended redirects the walk.
        upcoming = current.previous_element
        yield current
        current = upcoming
@property
def self_and_previous_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all elements that were parsed
    earlier."""
    earlier_elements = self.previous_elements
    return self._self_and(earlier_elements)
@property
def previous_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    earlier.

    :yield: A sequence of PageElements, following `previous_sibling`
       links until one is None.
    """
    current = self.previous_sibling
    while current is not None:
        # Read the link before yielding, so that nothing done to
        # `current` while this generator is suspended redirects the walk.
        upcoming = current.previous_sibling
        yield current
        current = upcoming
@property
def self_and_previous_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its siblings that were parsed
    earlier."""
    earlier_siblings = self.previous_siblings
    return self._self_and(earlier_siblings)
@property
def parents(self) -> Iterator[Tag]:
    """All elements that are parents of this PageElement.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    ancestor = self.parent
    while ancestor is not None:
        # Read the link before yielding, so that nothing done to
        # `ancestor` while this generator is suspended redirects the walk.
        next_up = ancestor.parent
        yield ancestor
        ancestor = next_up
@property
def self_and_parents(self) -> Iterator[PageElement]:
    """This element, then all of its parents.

    :yield: A sequence of PageElements, ending with a BeautifulSoup object.
    """
    ancestors = self.parents
    return self._self_and(ancestors)
1235 def _self_and(self, other_generator:Iterator[PageElement]) -> Iterator[PageElement]:
1236 """Modify a generator by yielding this element, then everything
1237 yielded by the other generator.
1238 """
1239 if not self.hidden:
1240 yield self
1241 for i in other_generator:
1242 yield i
@property
def decomposed(self) -> bool:
    """Check whether a PageElement has been decomposed.

    Reads the ``_decomposed`` attribute; a missing or falsy value is
    reported as False.
    """
    flag = getattr(self, "_decomposed", False)
    return flag or False
@_deprecated("next_elements", "4.0.0")
def nextGenerator(self) -> Iterator[PageElement]:
    """Old name for `next_elements`.

    :meta private:
    """
    return self.next_elements
@_deprecated("next_siblings", "4.0.0")
def nextSiblingGenerator(self) -> Iterator[PageElement]:
    """Old name for `next_siblings`.

    :meta private:
    """
    return self.next_siblings
@_deprecated("previous_elements", "4.0.0")
def previousGenerator(self) -> Iterator[PageElement]:
    """Old name for `previous_elements`.

    :meta private:
    """
    return self.previous_elements
@_deprecated("previous_siblings", "4.0.0")
def previousSiblingGenerator(self) -> Iterator[PageElement]:
    """Old name for `previous_siblings`.

    :meta private:
    """
    return self.previous_siblings
@_deprecated("parents", "4.0.0")
def parentGenerator(self) -> Iterator[PageElement]:
    """Old name for `parents`.

    :meta private:
    """
    return self.parents
class NavigableString(str, PageElement):
    """A Python string that is part of a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    #: A string prepended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '<!--'
    #: in an HTML comment.
    PREFIX: str = ""

    #: A string appended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '-->'
    #: in an HTML comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.

        :param value: The text. Anything that is not a `str` is handed
           to ``str.__new__`` along with DEFAULT_OUTPUT_ENCODING.
        """
        created = (
            str.__new__(cls, value)
            if isinstance(value, str)
            else str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        )
        created.hidden = False
        created.setup()
        return created

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.

        :param memo: Unused.
        :param recursive: This parameter is ignored; it's only defined
           so that NavigableString.__deepcopy__ implements the same
           signature as Tag.__deepcopy__.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Return the constructor arguments: this string as a plain `str`."""
        plain = str(self)
        return (plain,)

    # TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
    # is introduced in 3.8. This can be changed once 3.7 support is dropped.
    def __getitem__(self, key: Union[int|slice]) -> str:  # type:ignore
        """Index into the string, with a clearer error for string keys.

        :raise TypeError: If ``key`` is a string, which suggests that the
           caller is treating this object like a `Tag`.
        """
        if not isinstance(key, str):
            return super().__getitem__(key)
        raise TypeError("string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(key.__class__.__name__))

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: This property always returns the `NavigableString` it was
           called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Run the string through the provided formatter, making it
        ready for output as part of an HTML or XML document.

        :param formatter: A `Formatter` object, or a string naming one
           of the standard formatters.
        :return: The formatted string, between `PREFIX` and `SUFFIX`.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self) -> None:
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Prevent NavigableString.name from ever being set.

        :raise AttributeError: Always.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This makes it easy for NavigableString to implement methods
        like get_text() as conveniences, creating a consistent
        text-extraction API across all PageElements.

        :param strip: If True, all strings will be stripped before being
           yielded.

        :param types: A tuple of NavigableString subclasses. If this
           NavigableString isn't one of those subclasses, the
           sequence will be empty. By default, the subclasses
           considered are NavigableString and CData objects. That
           means no comments, processing instructions, etc.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # This is kept in Tag because it's full of subclasses of
            # this class, which aren't defined until later in the file.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        # Do nothing if the caller is looking for specific types of
        # string, and we're of a different type.
        #
        # The exact type is compared, rather than using isinstance(),
        # because all of these classes subclass NavigableString. Anyone
        # who's using this feature probably wants generic
        # NavigableStrings but not other stuff.
        own_type = type(self)
        if types is None:
            wanted = True
        elif isinstance(types, type):
            # Looking for a single type.
            wanted = own_type is types
        else:
            # Looking for one of a list of types.
            wanted = own_type in types
        if not wanted:
            return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        This is defined the way it is for compatibility with
        `Tag.strings`. See `Tag` for information on which strings are
        interesting in a given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()
class PreformattedString(NavigableString):
    """A `NavigableString` not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (`Comment`) and CDATA blocks (`CData`).
    """

    PREFIX: str = ""
    SUFFIX: str = ""

    def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
        """Make this string ready for output by adding any subclass-specific
        prefix or suffix.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters. The string will be passed into the
            `Formatter`, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # Called only for its side effects; the result is discarded.
            self.format_string(self, formatter)
        wrapped = self.PREFIX + self + self.SUFFIX
        return wrapped
class CData(PreformattedString):
    """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_."""

    #: Delimiter that opens a CDATA section.
    PREFIX: str = "<![CDATA["
    #: Delimiter that closes a CDATA section.
    SUFFIX: str = "]]>"
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction."""

    #: Delimiter that opens the processing instruction.
    PREFIX: str = "<?"
    #: Delimiter that closes the processing instruction.
    SUFFIX: str = ">"
class XMLProcessingInstruction(ProcessingInstruction):
    """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_."""

    #: Delimiter that opens the processing instruction.
    PREFIX: str = "<?"
    #: Delimiter that closes the processing instruction ("?>" here, where
    #: `ProcessingInstruction` uses ">").
    SUFFIX: str = "?>"
class Comment(PreformattedString):
    """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_."""

    #: Delimiter that opens a comment.
    PREFIX: str = "<!--"
    #: Delimiter that closes a comment.
    SUFFIX: str = "-->"
class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_."""

    #: Delimiter that opens the declaration.
    PREFIX: str = "<?"
    #: Delimiter that closes the declaration.
    SUFFIX: str = "?>"
class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_."""

    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        """
        declaration = cls._string_for_name_and_ids(name, pub_id, system_id)
        return Doctype(declaration)

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Generate a string to be used as the basis of a Doctype object.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :return: The name (or an empty string if the name is falsy),
            followed by a PUBLIC clause when ``pub_id`` is given, or a
            SYSTEM clause when only ``system_id`` is given.
        """
        declaration = name or ""
        if pub_id is None:
            # Without a public ID, the system ID needs the SYSTEM keyword.
            if system_id is not None:
                declaration += ' SYSTEM "%s"' % system_id
            return declaration

        declaration += ' PUBLIC "%s"' % pub_id
        if system_id is not None:
            declaration += ' "%s"' % system_id
        return declaration
class Stylesheet(NavigableString):
    """A `NavigableString` representing the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    Used to distinguish embedded stylesheets from textual content.
    The class adds no behavior of its own to `NavigableString`.
    """
class Script(NavigableString):
    """A `NavigableString` representing the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    Used to distinguish executable code from textual content.
    The class adds no behavior of its own to `NavigableString`.
    """
class TemplateString(NavigableString):
    """A `NavigableString` representing a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    The class adds no behavior of its own to `NavigableString`.
    """
class RubyTextString(NavigableString):
    """A `NavigableString` representing the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    Can be used to distinguish such strings from the strings they're
    annotating. The class adds no behavior of its own to `NavigableString`.
    """
class RubyParenthesisString(NavigableString):
    """A `NavigableString` representing the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.

    The class adds no behavior of its own to `NavigableString`.
    """
1578class Tag(PageElement):
1579 """An HTML or XML tag that is part of a parse tree, along with its
1580 attributes, contents, and relationships to other parts of the tree.
1582 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
1583 create a `Tag` object representing the ``<b>`` tag. You can
1584 instantiate `Tag` objects directly, but it's not necessary unless
1585 you're adding entirely new markup to a parsed document. Most of
1586 the constructor arguments are intended for use by the `TreeBuilder`
1587 that's parsing a document.
1589 :param parser: A `BeautifulSoup` object representing the parse tree this
1590 `Tag` will be part of.
1591 :param builder: The `TreeBuilder` being used to build the tree.
1592 :param name: The name of the tag.
1593 :param namespace: The URI of this tag's XML namespace, if any.
1594 :param prefix: The prefix for this tag's XML namespace, if any.
1595 :param attrs: A dictionary of attribute values.
1596 :param parent: The `Tag` to use as the parent of this `Tag`. May be
1597 the `BeautifulSoup` object itself.
1598 :param previous: The `PageElement` that was parsed immediately before
1599 parsing this tag.
1600 :param is_xml: If True, this is an XML tag. Otherwise, this is an
1601 HTML tag.
1602 :param sourceline: The line number where this tag was found in its
1603 source document.
1604 :param sourcepos: The character position within ``sourceline`` where this
1605 tag was found.
1606 :param can_be_empty_element: If True, this tag should be
1607 represented as <tag/>. If False, this tag should be represented
1608 as <tag></tag>.
1609 :param cdata_list_attributes: A dictionary of attributes whose values should
1610 be parsed as lists of strings if they ever show up on this tag.
1611 :param preserve_whitespace_tags: Names of tags whose contents
1612 should have their whitespace preserved if they are encountered inside
1613 this tag.
1614 :param interesting_string_types: When iterating over this tag's
1615 string contents in methods like `Tag.strings` or
1616 `PageElement.get_text`, these are the types of strings that are
1617 interesting enough to be considered. By default,
1618 `NavigableString` (normal strings) and `CData` (CDATA
1619 sections) are the only interesting string subtypes.
1620 :param namespaces: A dictionary mapping currently active
1621 namespace prefixes to URIs, as of the point in the parsing process when
1622 this tag was encountered. This can be used later to
1623 construct CSS selectors.
1625 """
def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Set up a new `Tag`.

    The arguments are described in the class docstring. When a
    ``builder`` is given, it (rather than the corresponding arguments)
    decides the values of ``can_be_empty_element``,
    ``cdata_list_attributes``, ``preserve_whitespace_tags`` and
    ``interesting_string_types``.

    :raise ValueError: If ``name`` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # The position is always recorded as given. (This used to be an
    # if/else on builder.store_line_numbers whose two branches made
    # exactly these same assignments.)
    self.sourceline = sourceline
    self.sourcepos = sourcepos

    # Choose the container classes for attributes: defaults based on
    # is_xml when there is no builder, the builder's choices otherwise.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    else:
        if builder is not None and builder.cdata_list_attributes:
            self.attrs = builder._replace_cdata_list_attribute_values(
                self.name, attrs
            )
        else:
            self.attrs = attr_dict_class()
            # Make sure that the values of any multi-valued
            # attributes (e.g. when a Tag is copied) are stored in
            # new lists.
            for k, v in attrs.items():
                if isinstance(v, list):
                    v = v.__class__(v)
                self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        # (attribute_value_list_class was already assigned above from
        # this same builder attribute.)
        self.attribute_value_list_class = builder.attribute_value_list_class
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES
# Type declarations for the instance attributes that __init__ assigns.
# These are annotations only; no class-level values are created here.
parser_class: Optional[type[BeautifulSoup]]
name: str
namespace: Optional[str]
prefix: Optional[str]
attrs: _AttributeValues
sourceline: Optional[int]
sourcepos: Optional[int]
known_xml: Optional[bool]
contents: List[PageElement]
hidden: bool
interesting_string_types: Optional[Set[Type[NavigableString]]]

# These come from the TreeBuilder when __init__ is given one, and from
# the constructor arguments otherwise.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]

# Legacy camelCase spelling, registered through _deprecated_alias with
# "parser_class" as its target and "4.0.0" as the version.
#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.

    :param memo: Passed through to each descendant's ``__deepcopy__``.
    :param recursive: If False, only this tag is copied (via
       `copy_self`) and the copy has no contents.
    """
    clone = self.copy_self()
    if not recursive:
        return clone

    # Clone this tag's descendants recursively, but without
    # making any recursive function calls. The last entry of
    # open_tags is the clone currently receiving children.
    open_tags: List[Tag] = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # Stop appending incoming Tags to the Tag that was
            # just closed.
            open_tags.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        # Add to its parent's .contents
        open_tags[-1].append(copied)

        if event is Tag.START_ELEMENT_EVENT:
            # Add the Tag itself to the stack so that its
            # children will be .appended to it.
            open_tags.append(cast(Tag, copied))
    return clone
def copy_self(self) -> Self:
    """Create a new Tag just like this one, but with no
    contents and unattached to any parse tree.

    This is the first step in the deepcopy process, but you can
    call it on its own to create a copy of a Tag without copying its
    contents.

    :return: A new object of the same class as this one, constructed
       without a parser or builder.
    """
    same_class = type(self)
    clone = same_class(
        None,
        None,
        self.name,
        self.namespace,
        self.prefix,
        self.attrs,
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    # Carry these two over explicitly, whatever the constructor set.
    clone.can_be_empty_element = self.can_be_empty_element
    clone.hidden = self.hidden
    return clone
@property
def is_empty_element(self) -> bool:
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    A tag that has no contents may or may not be an empty-element
    tag. It depends on the `TreeBuilder` used to create the
    tag. If the builder has a designated list of empty-element
    tags, then only a tag whose name shows up in that list is
    considered an empty-element tag. This is usually the case
    for HTML documents.

    If the builder has no designated list of empty-element tags, then
    any tag with no contents is an empty-element tag. This is usually
    the case for XML documents.
    """
    if len(self.contents) != 0:
        return False
    # Only a literal True counts; None (unknown) does not.
    return self.can_be_empty_element is True
@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    """Old name for `is_empty_element`.

    :meta private:
    """
    return self.is_empty_element
@property
def string(self) -> Optional[str]:
    """Convenience property to get the single string within this
    `Tag`, assuming there is just one.

    :return: If this `Tag` has a single child that's a
     `NavigableString`, the return value is that string. If this
     element has one child `Tag`, the return value is that child's
     `Tag.string`, recursively. If this `Tag` has no children,
     or has more than one child, the return value is ``None``.

     If this property is unexpectedly returning ``None`` for you,
     it's probably because your `Tag` has more than one thing
     inside it.
    """
    if len(self.contents) != 1:
        return None
    only_child = self.contents[0]
    if isinstance(only_child, NavigableString):
        return only_child
    if isinstance(only_child, Tag):
        # Defer to the single child tag.
        return only_child.string
    return None

@string.setter
def string(self, string: str) -> None:
    """Replace the `Tag.contents` of this `Tag` with a single string.

    :param string: The new text. A `NavigableString` is re-created
       with its own class; anything else is wrapped in a plain
       `NavigableString`.
    """
    self.clear()
    wrapper = string.__class__ if isinstance(string, NavigableString) else NavigableString
    self.append(wrapper(string))
#: The string classes `_all_strings` falls back to when the caller
#: does not pass ``types`` and ``interesting_string_types`` is None.
#:
#: :meta private:
MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}
def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Yield the strings beneath this tag, filtered by class.

    :param strip: If True, each string is stripped before being
        yielded, and strings that are empty after stripping are
        skipped entirely.

    :param types: Either a single `NavigableString` subclass or a
        collection of them. A string is yielded only when its exact
        class is the given class, or is a member of the given
        collection. When this argument is left at its default, the
        classes in ``self.interesting_string_types`` are used; if
        that is None, ``MAIN_CONTENT_STRING_TYPES`` is used
        instead (no comments, processing instructions, etc.).
        Passing None explicitly disables the class filter.
    """
    if types is self.default:
        types = self.interesting_string_types
        if types is None:
            types = self.MAIN_CONTENT_STRING_TYPES

    # A bare class is matched by identity; anything else is treated
    # as a container of acceptable classes.
    single_class = isinstance(types, type)

    for node in self.descendants:
        if not isinstance(node, NavigableString):
            continue
        node_class = type(node)
        if single_class:
            wanted = node_class is types
        else:
            wanted = types is None or node_class in types
        if not wanted:
            # We're not interested in strings of this type.
            continue
        if not strip:
            yield node
            continue
        text = node.strip()
        if text:
            yield text
# Read-only property built on _all_strings: accessing it calls
# _all_strings with its default arguments (no stripping, default
# string types).
strings = property(_all_strings)
def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once.

    :param position: The numeric position that should be occupied
       in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert. They end up in
       this tag's contents in the order given.

    :return: The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    for new_child in new_children:
        just_inserted = self._insert(position, new_child)
        inserted.extend(just_inserted)
        # A single argument may expand into several elements (or
        # none) -- e.g. a BeautifulSoup object is replaced by its
        # children. Advance by the number actually inserted so the
        # next argument lands after them, not in between them.
        position += len(just_inserted)
    return inserted
def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element into this tag's contents and rewire
    the tree links around it.

    Besides adding ``new_child`` to ``self.contents``, this updates
    ``parent``, ``previous_sibling``/``next_sibling`` and
    ``previous_element``/``next_element`` on the new child and on
    its new neighbors.

    :param position: The index the new child should occupy. Values
        past the end of ``self.contents`` are clamped to the end.
    :param new_child: The element to insert. A plain ``str`` is
        wrapped in a `NavigableString`. A `BeautifulSoup` object is
        not inserted itself; its children are inserted instead.
    :return: A list of the elements that were inserted. This is
        ``[new_child]`` except when a `BeautifulSoup` object was
        passed in, in which case it is whatever `Tag.insert` returns
        for that object's children.
    :raises ValueError: If ``new_child`` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    # Imported here rather than at module level.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the BeautifulSoup's children and
        # return them.
        return self.insert(position, *list(new_child.contents))
    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
            elif current_index == position:
                # We're 'inserting' an element into its current location.
                # This is a no-op.
                return [new_child]
        # Detach the element from wherever it currently lives before
        # linking it in here.
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: no previous sibling, and the element that
        # precedes it is this tag itself.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(
        is_initialized=False, accept_self=True
    )
    # new_childs_last_element can't be None because we passed
    # accept_self=True into _last_descendant. Worst case,
    # new_childs_last_element will be new_child itself. Making
    # this cast removes several mypy complaints later on as we
    # manipulate new_childs_last_element.
    new_childs_last_element = cast(PageElement, new_childs_last_element)

    if position >= len(self.contents):
        # The new child becomes the last child of this tag.
        new_child.next_sibling = None

        # Walk up through this tag and its ancestors until one of
        # them has a next sibling; that sibling is the element that
        # follows the new child's subtree.
        parent: Optional[Tag] = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # The new child goes in front of an existing child.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    # Fix the back-link of whatever now follows the new subtree.
    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = (
            new_childs_last_element
        )
    self.contents.insert(position, new_child)

    return [new_child]
def unwrap(self) -> Self:
    """Replace this `PageElement` with its contents.

    This element is extracted from its parent, and each of its
    children is then inserted into the parent at the index this
    element used to occupy, preserving their order.

    :return: This object, no longer part of the tree.
    :raises ValueError: If this element has no parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = container.index(self)
    self.extract(_self_index=slot)
    # Insert last-to-first at the same index, so the children end up
    # in their original order. Work from a copy because inserting a
    # child elsewhere may change self.contents.
    for node in self.contents[::-1]:
        container.insert(slot, node)
    return self
# Alternate name bound to the same function object as unwrap.
replace_with_children = unwrap
@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    """Deprecated spelling of `Tag.unwrap`.

    :meta private:
    """
    unwrapped = self.unwrap()
    return unwrapped
def append(self, tag: _InsertableElement) -> PageElement:
    """Add the given `PageElement` to the end of this `Tag`'s contents.

    :param tag: A PageElement.

    :return: The first element of the list returned by `Tag.insert`
        for this call.
    """
    end = len(self.contents)
    appended = self.insert(end, tag)
    return appended[0]
def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list (or other iterable) of `PageElement`
        objects is provided, they will be appended to this tag's
        contents, one at a time. If a single `Tag` is provided, its
        `Tag.contents` will be used to extend this object's
        `Tag.contents`. A single non-Tag `PageElement` or string is
        tolerated (with a warning) and appended on its own.

    :return: The list of PageElements that were appended.

    :raises TypeError: If ``tags`` is not a `PageElement`, a string,
        or something that can be iterated over.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    else:
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        #
        # Letting list() do the work (instead of first testing for
        # Iterable) means a non-iterable argument fails with a clear
        # TypeError rather than leaving tag_list unbound.
        tag_list = list(tags)

    return [self.append(tag) for tag in tag_list]
def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    Each child is removed by calling `PageElement.extract` on it.

    :param decompose: If this is True, `PageElement.decompose` (a
       more destructive method) will be called instead of
       `PageElement.extract`.
    """
    # Iterate over a snapshot: removing a child changes self.contents.
    snapshot = list(self.contents)
    for child in snapshot:
        remove = child.decompose if decompose else child.extract
        remove()
def smooth(self) -> None:
    """Consolidate runs of adjacent strings among this `Tag`'s
    children, recursing into child tags.

    Two neighboring children are merged when both are
    `NavigableString` objects and neither is a `PreformattedString`.

    If you perform a lot of operations that modify the tree,
    calling this method afterwards can make pretty-printed output
    look more natural.
    """

    def _mergeable(node: Any) -> bool:
        # Ordinary strings can be merged; preformatted ones cannot.
        return isinstance(node, NavigableString) and not isinstance(
            node, PreformattedString
        )

    # Record the index of the first member of every adjacent pair
    # that needs merging, instead of copying self.contents: in most
    # cases very few strings are affected.
    merge_points = []
    for index, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if index == len(self.contents) - 1:
            # The last child has no right-hand neighbor to merge with.
            continue
        if _mergeable(current) and _mergeable(self.contents[index + 1]):
            merge_points.append(index)

    # Merge from the end towards the start, so that removing items
    # from .contents doesn't shift the positions still to be handled.
    for index in merge_points[::-1]:
        first = cast(NavigableString, self.contents[index])
        second = cast(NavigableString, self.contents[index + 1])
        second.extract()
        first.replace_with(NavigableString(first + second))
def index(self, element: PageElement) -> int:
    """Return the position of a child of this `Tag`, comparing by
    identity rather than by value.

    Comparing by identity avoids picking the wrong child when a `Tag`
    contains two children that are equal to each other.

    :param element: Look for this `PageElement` in this object's contents.
    :raises ValueError: If ``element`` is not one of this tag's children.
    """
    positions = (
        position
        for position, child in enumerate(self.contents)
        if child is element
    )
    found = next(positions, None)
    if found is None:
        raise ValueError("Tag.index: element not in tag")
    return found
def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Look up the value of the attribute named ``key`` on this tag.

    :param key: The attribute to look for.
    :param default: Returned instead when the attribute is not present
        on this `Tag`.
    """
    attributes = self.attrs
    return attributes.get(key, default)
def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Like get(), but the result is always a (possibly empty) list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: A list of strings, usually empty or containing only a single
        value. A value that is already a list is returned as-is; any
        other value is wrapped in a new one-element list; a missing
        value produces a new empty list.
    """
    found = self.get(key, default)
    if found is None:
        return self.attribute_value_list_class()
    if isinstance(found, list):
        return found
    return self.attribute_value_list_class([found])
def has_attr(self, key: str) -> bool:
    """Report whether this `Tag` has an attribute named ``key``."""
    attributes = self.attrs
    return key in attributes
2209 def __hash__(self) -> int:
2210 return str(self).__hash__()
2212 def __getitem__(self, key: str) -> _AttributeValue:
2213 """tag[key] returns the value of the 'key' attribute for the Tag,
2214 and throws an exception if it's not there."""
2215 return self.attrs[key]
2217 def __iter__(self) -> Iterator[PageElement]:
2218 "Iterating over a Tag iterates over its contents."
2219 return iter(self.contents)
2221 def __len__(self) -> int:
2222 "The length of a Tag is the length of its list of contents."
2223 return len(self.contents)
2225 def __contains__(self, x: Any) -> bool:
2226 return x in self.contents
def __bool__(self) -> bool:
    """A tag is always truthy, even if it has no contents.

    This method always returns True, so the truth value of a tag
    never depends on the length of its contents.
    """
    return True
2232 def __setitem__(self, key: str, value: _AttributeValue) -> None:
2233 """Setting tag[key] sets the value of the 'key' attribute for the
2234 tag."""
2235 self.attrs[key] = value
2237 def __delitem__(self, key: str) -> None:
2238 "Deleting tag[key] deletes all 'key' attributes for the tag."
2239 self.attrs.pop(key, None)
2241 def __call__(
2242 self,
2243 name: Optional[_StrainableElement] = None,
2244 attrs: _StrainableAttributes = {},
2245 recursive: bool = True,
2246 string: Optional[_StrainableString] = None,
2247 limit: Optional[int] = None,
2248 _stacklevel: int = 2,
2249 **kwargs: _StrainableAttribute,
2250 ) -> _QueryResults:
2251 """Calling a Tag like a function is the same as calling its
2252 find_all() method. Eg. tag('a') returns a list of all the A tags
2253 found within this tag."""
2254 return self.find_all(
2255 name, attrs, recursive, string, limit, _stacklevel, **kwargs
2256 )
def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Treat an unknown attribute as a search for a tag of that name.

    ``tag.subtag`` behaves like ``tag.find("subtag")``. Names that
    start with a double underscore, and the name ``contents``, are not
    searched for; they raise `AttributeError` instead.

    A name longer than three characters that ends in ``Tag`` is
    handled in the old (BS3) way: the ``Tag`` suffix is dropped
    before searching and a `DeprecationWarning` is issued.
    """
    if len(subtag) > 3 and subtag.endswith("Tag"):
        # BS3: soup.aTag -> "soup.find("a")
        legacy_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % {"name": legacy_name},
            DeprecationWarning,
            stacklevel=2,
        )
        return cast(Optional[Tag], self.find(legacy_name))

    # We special case contents to avoid recursion.
    if subtag.startswith("__") or subtag == "contents":
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, subtag)
        )

    return cast(Optional[Tag], self.find(subtag))
2281 def __eq__(self, other: Any) -> bool:
2282 """Returns true iff this Tag has the same name, the same attributes,
2283 and the same contents (recursively) as `other`."""
2284 if self is other:
2285 return True
2286 if not isinstance(other, Tag):
2287 return False
2288 if (
2289 not hasattr(other, "name")
2290 or not hasattr(other, "attrs")
2291 or not hasattr(other, "contents")
2292 or self.name != other.name
2293 or self.attrs != other.attrs
2294 or len(self) != len(other)
2295 ):
2296 return False
2297 for i, my_child in enumerate(self.contents):
2298 if my_child != other.contents[i]:
2299 return False
2300 return True
2302 def __ne__(self, other: Any) -> bool:
2303 """Returns true iff this Tag is not identical to `other`,
2304 as defined in __eq__."""
2305 return not self == other
2307 def __repr__(self) -> str:
2308 """Renders this `Tag` as a string."""
2309 return self.decode()
2311 __str__ = __unicode__ = __repr__
def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    The tag is first rendered to a Unicode string with `Tag.decode`,
    and that string is then encoded.

    :param encoding: The encoding to use when converting to
       a bytestring. This may also affect the text of the document,
       specifically any encoding declarations within the document.
    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        :py:meth:`str.encode` and its value should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    text = self.decode(indent_level, encoding, formatter)
    return text.encode(encoding, errors)
def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    The rendering is built by walking the events produced by
    `Tag._event_stream` and joining the piece generated for each
    event, so no recursive method calls are needed.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing. None (the default) turns pretty-printing
       off.
    :param eventual_encoding: The encoding you intend to use when
       converting the string to a bytestring. decode() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
        naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :return: The rendered markup.
    """
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # A literal True is treated as indentation level zero.
    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            # A closing tag is indented one level less than the
            # contents it closes.
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # We are about to enter string literal mode. Add
            # whitespace before this tag, but not after. We
            # will stay in string literal mode until this tag
            # is closed.
            indent_before = True
            indent_after = False
            string_literal_tag = element
        elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # We are about to exit string literal mode by closing
            # the tag that sent us into that mode. Add whitespace
            # after this tag, but not before.
            indent_before = False
            indent_after = True
            string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if indent_before or indent_after:
                # Strings lose their own surrounding whitespace
                # before the pretty-printing whitespace is added.
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                # Empty pieces get no indentation or newline.
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            # Everything after an opening tag is one level deeper.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)
class _TreeTraversalEvent(object):
    """An internal class representing an event in the process
    of traversing a parse tree.

    The class defines no attributes or methods of its own; its
    instances are used purely as distinct marker objects.

    :meta private:
    """

# Stand-ins for the different events yielded by _event_stream.
# Each is a separate _TreeTraversalEvent instance, so events can be
# told apart by identity.
START_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    Each item is an ``(event, element)`` pair, where ``event`` is one
    of the ``*_ELEMENT_EVENT`` markers defined on `Tag`. A tag that
    is not an empty-element tag produces a start event and, once an
    element outside it shows up (or the traversal ends), an end
    event; an empty-element tag produces a single event; anything
    that is not a `Tag` produces a string event.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
      the tree. Defaults to ``self.self_and_descendants``.
    """
    open_tags: List[Tag] = []

    iterator = iterator or self.self_and_descendants

    for node in iterator:
        # Any tag on the stack that is not this node's parent must
        # have been closed before this node appeared.
        while open_tags and node.parent != open_tags[-1]:
            yield Tag.END_ELEMENT_EVENT, open_tags.pop()

        if not isinstance(node, Tag):
            yield Tag.STRING_ELEMENT_EVENT, node
        elif node.is_empty_element:
            yield Tag.EMPTY_ELEMENT_EVENT, node
        else:
            yield Tag.START_ELEMENT_EVENT, node
            open_tags.append(node)

    # Close whatever is still open, innermost first.
    while open_tags:
        yield Tag.END_ELEMENT_EVENT, open_tags.pop()
2509 def _indent_string(
2510 self,
2511 s: str,
2512 indent_level: int,
2513 formatter: Formatter,
2514 indent_before: bool,
2515 indent_after: bool,
2516 ) -> str:
2517 """Add indentation whitespace before and/or after a string.
2519 :param s: The string to amend with whitespace.
2520 :param indent_level: The indentation level; affects how much
2521 whitespace goes before the string.
2522 :param indent_before: Whether or not to add whitespace
2523 before the string.
2524 :param indent_after: Whether or not to add whitespace
2525 (a newline) after the string.
2526 """
2527 space_before = ""
2528 if indent_before and indent_level:
2529 space_before = formatter.indent * indent_level
2531 space_after = ""
2532 if indent_after:
2533 space_after = "\n"
2535 return space_before + s + space_after
2537 def _format_tag(
2538 self, eventual_encoding: str, formatter: Formatter, opening: bool
2539 ) -> str:
2540 if self.hidden:
2541 # A hidden tag is invisible, although its contents
2542 # are visible.
2543 return ""
2545 # A tag starts with the < character (see below).
2547 # Then the / character, if this is a closing tag.
2548 closing_slash = ""
2549 if not opening:
2550 closing_slash = "/"
2552 # Then an optional namespace prefix.
2553 prefix = ""
2554 if self.prefix:
2555 prefix = self.prefix + ":"
2557 # Then a list of attribute values, if this is an opening tag.
2558 attribute_string = ""
2559 if opening:
2560 attributes = formatter.attributes(self)
2561 attrs = []
2562 for key, val in attributes:
2563 if val is None:
2564 decoded = key
2565 else:
2566 if isinstance(val, list) or isinstance(val, tuple):
2567 val = " ".join(val)
2568 elif not isinstance(val, str):
2569 val = str(val)
2570 elif (
2571 isinstance(val, AttributeValueWithCharsetSubstitution)
2572 and eventual_encoding is not None
2573 ):
2574 val = val.substitute_encoding(eventual_encoding)
2576 text = formatter.attribute_value(val)
2577 decoded = str(key) + "=" + formatter.quoted_attribute_value(text)
2578 attrs.append(decoded)
2579 if attrs:
2580 attribute_string = " " + " ".join(attrs)
2582 # Then an optional closing slash (for a void element in an
2583 # XML document).
2584 void_element_closing_slash = ""
2585 if self.is_empty_element:
2586 void_element_closing_slash = formatter.void_element_close_prefix or ""
2588 # Put it all together.
2589 return (
2590 "<"
2591 + closing_slash
2592 + prefix
2593 + self.name
2594 + attribute_string
2595 + void_element_closing_slash
2596 + ">"
2597 )
2599 def _should_pretty_print(self, indent_level: int = 1) -> bool:
2600 """Should this tag be pretty-printed?
2602 Most of them should, but some (such as <pre> in HTML
2603 documents) should not.
2604 """
2605 return indent_level is not None and (
2606 not self.preserve_whitespace_tags
2607 or self.name not in self.preserve_whitespace_tags
2608 )
@overload
def prettify(
    self,
    encoding: None = None,
    formatter: _FormatterOrName = "minimal",
) -> str:
    ...

@overload
def prettify(
    self,
    encoding: _Encoding,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    ...

def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Pretty-print this `Tag` as a string or bytestring.

    The rendering starts at indentation level zero.

    :param encoding: The encoding of the bytestring, or None if you want Unicode.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string (if no ``encoding`` is provided) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)
def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render what is inside this tag as a Unicode string.

    This is `Tag.decode` run over ``self.descendants`` only, so the
    markup for this tag itself is not part of the result.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The formatter decides what a
       'level' means in terms of spaces or other characters
       output.) Used internally in recursive calls while
       pretty-printing.

    :param eventual_encoding: The tag is destined to be
       encoded into this encoding. decode_contents() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    """
    inner_nodes = self.descendants
    return self.decode(indent_level, eventual_encoding, formatter, iterator=inner_nodes)
def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render what is inside this PageElement as a bytestring.

    The contents are rendered with `Tag.decode_contents` and the
    resulting string is then encoded. No ``errors`` argument is
    given to :py:meth:`str.encode` here, so its default error
    handling applies.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    """
    text = self.decode_contents(indent_level, encoding, formatter)
    return text.encode(encoding)
@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated method for BS3 compatibility.

    Forwards to `Tag.encode_contents`. ``indentLevel`` is only
    honored when ``prettyPrint`` is true; otherwise no indentation
    level is passed along.

    :meta private:
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)
2707 # Soup methods
def find(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    This is `Tag.find_all` with a limit of one result.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    :return: The first match, or None if nothing matched.
    """
    matches = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None
# Deprecated alias: "findChild" is the old name for "find".
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")
def find_all(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Look in the children of this `PageElement` and find all
    `PageElement` objects that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.descendants if recursive else self.children
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
# Deprecated aliases: both old names forward to "find_all".
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")
2775 # Generator methods
@property
def children(self) -> Iterator[PageElement]:
    """Iterate over the direct children of this `PageElement`, in the
    order they appear in ``self.contents``."""
    return (child for child in self.contents)
@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` and everything beneath it.

    The result is whatever ``self._self_and`` produces when given
    ``self.descendants``.
    """
    below = self.descendants
    return self._self_and(below)
@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything nested inside this `Tag`, at any depth.

    Elements are produced by starting at the first child and following
    `next_element` links, i.e. in document order (depth-first,
    pre-order) -- not breadth-first. Iteration stops once the walk
    would step past this element's last descendant. Nothing is yielded
    when `self.contents` is empty.
    """
    if not self.contents:
        return
    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, last_descendant will end up
    # as self.
    last_descendant = cast("PageElement", self._last_descendant(accept_self=True))
    # Whatever follows the last descendant is the first element that
    # is no longer part of this subtree (or None if there is nothing).
    stop_node = last_descendant.next_element
    current: _AtMostOneElement = self.contents[0]
    while current is not stop_node and current is not None:
        # Read the successor before handing `current` to the consumer,
        # so the walk continues from the link as it was at this point.
        successor = current.next_element
        yield current
        current = successor
2806 # CSS selector code
def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Run a CSS selector against the current element and return a
    single result.

    The work is delegated to the `select_one()` method of the object
    returned by `self.css`.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)
def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Run a CSS selector against the current element and return
    all of the results.

    This uses the SoupSieve library; the call is delegated to the
    `select()` method of the object returned by `self.css`.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)
@property
def css(self) -> CSS:
    """Build and return a `CSS` object wrapping this element; it is
    the entry point to the CSS selector API.

    A new `CSS` instance is constructed on every access.
    """
    selector_api = CSS(self)
    return selector_api
2854 # Old names for backwards compatibility
@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; simply returns `self.children`.

    :meta private:
    """
    direct_children = self.children
    return direct_children
@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; simply returns `self.descendants`.

    :meta private:
    """
    all_descendants = self.descendants
    return all_descendants
@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated method; forwards to `has_attr()`.

    The old name was kind of misleading: has_key() looked at
    attributes, whereas the `in` operator looks at contents.

    has_key() is gone in Python 3, anyway.

    :param key: The attribute name handed to `has_attr()`.

    :meta private:
    """
    attribute_present = self.has_attr(key)
    return attribute_present
# Type variable for the kind of element a ResultSet holds. It is bound to
# PageElement, so only PageElement or its subclasses are accepted.
2883_PageElementT = TypeVar("_PageElementT", bound=PageElement)
class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects holding the results of a search,
    i.e. of matching an :py:class:`ElementFilter` against a parse tree.

    Apart from the extra `source` attribute it behaves like a plain list.
    """

    # The filter handed to the constructor (None when there is none).
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Fill the list from `result` and remember `source`.

        :param source: The `ElementFilter` to record on this object, or None.
        :param result: The elements to place in the list; empty by default.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        Python only calls this hook when normal attribute lookup has
        failed, so every call ends in an `AttributeError`.

        :param key: The name of the attribute that was not found.
        :raises AttributeError: Always.
        """
        message = (
            f'ResultSet object has no attribute "{key}". '
            "You're probably treating a list of elements like a single element. "
            "Did you call find_all() when you meant to call find()?"
        )
        raise AttributeError(message)
2907# Now that all the classes used by SoupStrainer have been defined,
2908# import SoupStrainer itself into this module to preserve the
2909# backwards compatibility of anyone who imports
2910# bs4.element.SoupStrainer.
2911from bs4.filter import SoupStrainer # noqa: E402