Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 55%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from __future__ import annotations
3# Use of this source code is governed by the MIT license.
4__license__ = "MIT"
6import re
7import warnings
9from bs4.css import CSS
10from bs4._deprecation import (
11 _deprecated,
12 _deprecated_alias,
13 _deprecated_function_alias,
14)
15from bs4.formatter import (
16 Formatter,
17 HTMLFormatter,
18 XMLFormatter,
19)
20from bs4._warnings import AttributeResemblesVariableWarning
22from typing import (
23 Any,
24 Callable,
25 Dict,
26 Generic,
27 Iterable,
28 Iterator,
29 List,
30 Mapping,
31 MutableSequence,
32 Optional,
33 Pattern,
34 Set,
35 TYPE_CHECKING,
36 Tuple,
37 Type,
38 TypeVar,
39 Union,
40 cast,
41 overload,
42)
43from typing_extensions import (
44 Self,
45 TypeAlias,
46)
48if TYPE_CHECKING:
49 from bs4 import BeautifulSoup
50 from bs4.builder import TreeBuilder
51 from bs4.filter import ElementFilter
52 from bs4.formatter import (
53 _EntitySubstitutionFunction,
54 _FormatterOrName,
55 )
56 from bs4._typing import (
57 _AtMostOneElement,
58 _AtMostOneTag,
59 _AtMostOneNavigableString,
60 _AttributeValue,
61 _AttributeValues,
62 _Encoding,
63 _InsertableElement,
64 _OneElement,
65 _QueryResults,
66 _RawOrProcessedAttributeValues,
67 _StrainableElement,
68 _StrainableAttribute,
69 _StrainableAttributes,
70 _StrainableString,
71 _SomeNavigableStrings,
72 _SomeTags,
73 )
# Either a single NavigableString subclass or an iterable of such
# subclasses. The class names are quoted because they are forward
# references.
_OneOrMoreStringTypes: TypeAlias = Union[
    Type["NavigableString"], Iterable[Type["NavigableString"]]
]

# Annotation used for the ``name`` argument of the find_* methods: either
# nothing at all, a strainable element, or an ElementFilter.
_FindMethodName: TypeAlias = Optional[Union["_StrainableElement", "ElementFilter"]]
# Deprecated module-level attributes, keyed by public name. Each value is
# the warning template shown when the attribute is accessed.
# See https://peps.python.org/pep-0562/
_deprecated_names = {
    "whitespace_re": "The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy.",
}
#: :meta private:
_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+")


def __getattr__(name: str) -> Any:
    """Resolve deprecated module attributes on demand (PEP 562).

    :param name: The attribute that was not found through normal
        module lookup.
    :return: The module global called ``_deprecated_<name>``.
    :raises AttributeError: If ``name`` is not a known deprecated
        attribute.
    """
    if name not in _deprecated_names:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # stacklevel=2 attributes the warning to the code that accessed the
    # attribute rather than to this hook.
    template = _deprecated_names[name]
    warnings.warn(template.format(name=name), DeprecationWarning, stacklevel=2)
    return globals()[f"_deprecated_{name}"]
#: Documents output by Beautiful Soup will be encoded with
#: this encoding unless you specify otherwise.
DEFAULT_OUTPUT_ENCODING: str = "utf-8"

#: A regular expression matching runs of non-whitespace characters; it
#: can be used to split a string on whitespace.
nonwhitespace_re: Pattern[str] = re.compile(r"\S+")

#: These encodings are recognized by Python (so `Tag.encode`
#: could theoretically support them) but XML and HTML don't recognize
#: them (so they should not show up in an XML or HTML document as that
#: document's encoding).
#:
#: If an XML document is encoded in one of these encodings, no encoding
#: will be mentioned in the XML declaration. If an HTML document is
#: encoded in one of these encodings, and the HTML document has a
#: <meta> tag that mentions an encoding, the encoding will be given as
#: the empty string.
#:
#: Source:
#: Python documentation, `Python Specific Encodings <https://docs.python.org/3/library/codecs.html#python-specific-encodings>`_
PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
class NamespacedAttribute(str):
    """An attribute name that remembers how it was namespaced.

    For the ``xml:lang`` in ``xml:lang="en"``, the string value is
    ``"xml:lang"`` while `prefix` is ``"xml"`` and `name` is ``"lang"``.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        """Build the string value from ``prefix`` and ``name``.

        :param prefix: The namespace prefix; used alone as the string
            value when ``name`` is empty.
        :param name: The local name. Any falsy value is stored as None.
        :param namespace: Stored on the object as-is.
        """
        if not name:
            # This is the default namespace. Its name "has no value"
            # per https://www.w3.org/TR/xml-names/#defaulting
            name = None
            text = prefix
        elif not prefix:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name

        attribute = str.__new__(cls, text)
        attribute.prefix = prefix
        attribute.name = name
        attribute.namespace = namespace
        return attribute
class AttributeValueWithCharsetSubstitution(str):
    """Abstract stand-in for a character encoding named inside an HTML
    ``<meta>`` tag.

    There is one subclass for each place such an encoding can appear:
    the ``charset`` attribute (`CharsetMetaAttributeValue`) and the
    ``content`` attribute (`ContentMetaAttributeValue`).

    This lets Beautiful Soup swap in a different encoding name for that
    part of the HTML file when outputting a tree as a string.
    """

    # The original, un-encoded value of the attribute.
    #: :meta private:
    original_value: str

    def substitute_encoding(self, eventual_encoding: str) -> str:
        """Return this attribute value rewritten to mention
        ``eventual_encoding``.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``charset`` attribute.

    Parsing ``<meta charset="utf8">`` turns the value of ``charset``
    into one of these objects, so that when the document is later
    encoded to something other than UTF-8 the ``<meta>`` tag can name
    the new encoding instead of ``utf8``.
    """

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from."""
        # The original value isn't needed for the substitution itself,
        # but it might be useful for the user to know.
        value = str.__new__(cls, original_value)
        value.original_value = original_value
        return value

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the text to use as ``charset`` when the document is
        encoded as ``eventual_encoding``.

        :param eventual_encoding: The encoding being converted to.
        :return: The encoding's name, or the empty string if it is one
            of the `PYTHON_SPECIFIC_ENCODINGS`.
        """
        return "" if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS else eventual_encoding
class AttributeValueList(List[str]):
    """The list type used for attributes that hold several values at
    once (such as HTML's 'class').

    It adds nothing to a regular list. It exists so that you can
    subclass it and hand your subclass to the TreeBuilder constructor
    as attribute_value_list_class, to have it instantiated instead.
    """
class AttributeDict(Dict[Any, Any]):
    """Base class for the dictionary that holds a tag's attributes.

    It can be used directly, but it is a plain dict with no special
    logic of its own.
    """
class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the XML spec.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Store an attribute value, normalising it where needed.

        XML attributes may have "any literal string as a value", so
        common non-string values are converted to strings: None becomes
        the empty string, and ints and floats become their ``str()``
        form. Booleans and everything else are stored unchanged.
        """
        if value is None:
            value = ""

        # bool is a subclass of int, so it has to be excluded explicitly.
        # XML does not define any rules for boolean attributes, so the
        # old Beautiful Soup behavior is preserved (a bool that gets
        # converted to a string on output) rather than guessing what the
        # value should be.
        is_plain_number = isinstance(value, (int, float)) and not isinstance(value, bool)
        if is_plain_number:
            # It's dangerous to convert _every_ attribute value into a
            # plain string, since an attribute value may be a more
            # sophisticated string-like object
            # (e.g. CharsetMetaAttributeValue). Numbers are the most
            # common case and are always safe to convert.
            value = str(value)

        super().__setitem__(key, value)
class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply
        with the HTML spec.

        :param key: The attribute name.
        :param value: ``False`` or ``None`` removes the attribute;
            ``True`` stores the attribute's own (unqualified) name;
            ints and floats, including zero, are stored as their
            ``str()`` form; anything else is stored unchanged.
        """
        # Identity checks are deliberate. A membership test such as
        # ``value in (False, None)`` compares with ``==``, and
        # ``0 == False``, so numeric zero would delete the attribute
        # instead of being stored as "0".
        if value is False or value is None:
            # 'The values "true" and "false" are not allowed on
            # boolean attributes. To represent a false value, the
            # attribute has to be omitted altogether.'
            if key in self:
                del self[key]
            return
        if isinstance(value, bool):
            # Only True can reach this branch.
            #
            # 'If the [boolean] attribute is present, its value must
            # either be the empty string or a value that is an ASCII
            # case-insensitive match for the attribute's canonical
            # name, with no leading or trailing whitespace.'
            #
            # [fixme] It's not clear to me whether "canonical name"
            # means fully-qualified name, unqualified name, or
            # (probably not) name with namespace prefix. For now I'm
            # going with unqualified name.
            if isinstance(key, NamespacedAttribute):
                value = key.name
            else:
                value = key
        elif isinstance(value, (int, float)):
            # Only numbers are converted. Other values may be
            # sophisticated string-like objects that must be kept as-is.
            value = str(value)
        super().__setitem__(key, value)
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``content`` attribute.

    Parsing the markup
    ``<meta http-equiv="content-type" content="text/html; charset=utf8">``
    turns the value of ``content`` into one of these objects, so that
    when the document is later encoded to something other than UTF-8
    the ``<meta>`` tag can name the new encoding instead of ``utf8``.
    """

    #: Match the 'charset' argument inside the 'content' attribute
    #: of a <meta> tag.
    #: :meta private:
    CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from."""
        # NOTE(review): the match result is discarded; the call is kept
        # so that behavior is unchanged.
        cls.CHARSET_RE.search(original_value)
        value = str.__new__(cls, original_value)
        value.original_value = original_value
        return value

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the ``content`` text to use when the document is
        encoded as ``eventual_encoding``.

        :param eventual_encoding: The encoding being converted to.
        :return: The original value with the text after ``charset=``
            replaced by the encoding name. If the encoding is one of the
            `PYTHON_SPECIFIC_ENCODINGS`, the whole ``charset=...``
            clause is removed instead.
        """
        pattern = self.CHARSET_RE
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            return pattern.sub("", self.original_value)

        # Group 1 is everything up to and including "charset=", so only
        # the encoding name itself is replaced.
        return pattern.sub(
            lambda match: match.group(1) + eventual_encoding,
            self.original_value,
        )
352class PageElement(object):
353 """An abstract class representing a single element in the parse tree.
355 `NavigableString`, `Tag`, etc. are all subclasses of
356 `PageElement`. For this reason you'll see a lot of methods that
357 return `PageElement`, but you'll never see an actual `PageElement`
358 object. For the most part you can think of `PageElement` as
359 meaning "a `Tag` or a `NavigableString`."
360 """
362 #: In general, we can't tell just by looking at an element whether
363 #: it's contained in an XML document or an HTML document. But for
364 #: `Tag` objects (q.v.) we can store this information at parse time.
365 #: :meta private:
366 known_xml: Optional[bool] = None
368 #: Whether or not this element has been decomposed from the tree
369 #: it was created in.
370 _decomposed: bool
372 parent: Optional[Tag]
373 next_element: _AtMostOneElement
374 previous_element: _AtMostOneElement
375 next_sibling: _AtMostOneElement
376 previous_sibling: _AtMostOneElement
378 #: Whether or not this element is hidden from generated output.
379 #: Only the `BeautifulSoup` object itself is hidden.
380 hidden: bool = False
382 def setup(
383 self,
384 parent: Optional[Tag] = None,
385 previous_element: _AtMostOneElement = None,
386 next_element: _AtMostOneElement = None,
387 previous_sibling: _AtMostOneElement = None,
388 next_sibling: _AtMostOneElement = None,
389 ) -> None:
390 """Sets up the initial relations between this element and
391 other elements.
393 :param parent: The parent of this element.
395 :param previous_element: The element parsed immediately before
396 this one.
398 :param next_element: The element parsed immediately after
399 this one.
401 :param previous_sibling: The most recently encountered element
402 on the same level of the parse tree as this one.
404 :param previous_sibling: The next element to be encountered
405 on the same level of the parse tree as this one.
406 """
407 self.parent = parent
409 self.previous_element = previous_element
410 if self.previous_element is not None:
411 self.previous_element.next_element = self
413 self.next_element = next_element
414 if self.next_element is not None:
415 self.next_element.previous_element = self
417 self.next_sibling = next_sibling
418 if self.next_sibling is not None:
419 self.next_sibling.previous_sibling = self
421 if (
422 previous_sibling is None
423 and self.parent is not None
424 and self.parent.contents
425 ):
426 previous_sibling = self.parent.contents[-1]
428 self.previous_sibling = previous_sibling
429 if self.previous_sibling is not None:
430 self.previous_sibling.next_sibling = self
def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str:
    """Run a string through a formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of the
        standard formatters. None means the string is returned untouched.
    :return: The result of the formatter's ``substitute`` call.
    """
    if formatter is None:
        return s
    if isinstance(formatter, Formatter):
        resolved = formatter
    else:
        resolved = self.formatter_for_name(formatter)
    return resolved.substitute(s)
def formatter_for_name(
    self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction]
) -> Formatter:
    """Look up or create a Formatter for the given identifier,
    if necessary.

    :param formatter_name: Can be a `Formatter` object (used as-is), a
        function (used as the entity substitution hook for an
        `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter`), or a string (used to look
        up an `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter` in the appropriate registry).
    """
    if isinstance(formatter_name, Formatter):
        return formatter_name

    # XML and HTML each have their own Formatter class and registry.
    formatter_class: type[Formatter] = XMLFormatter if self._is_xml else HTMLFormatter
    if callable(formatter_name):
        return formatter_class(entity_substitution=formatter_name)

    registry: Mapping[Optional[str], Formatter] = formatter_class.REGISTRY
    return registry[formatter_name]
@property
def _is_xml(self) -> bool:
    """Is this element part of an XML tree or an HTML tree?

    `formatter_for_name` uses this to choose between XMLFormatter and
    HTMLFormatter. It can be inefficient, because it may walk up the
    chain of parents, but it should be called very rarely.
    """
    known = self.known_xml
    if known is not None:
        # Most of the time this was determined when the document was
        # parsed.
        return known

    # Otherwise, it's likely that this element was created by direct
    # invocation of the constructor from within the user's Python code.
    parent = self.parent
    if parent is not None:
        return parent._is_xml

    # This is the top-level object. It should have .known_xml set from
    # tree creation. If not, take a guess--BS is usually used on HTML
    # markup.
    return getattr(self, "is_xml", False)
# camelCase spellings of next_sibling / previous_sibling, marked as
# deprecated since 4.0.0.
# NOTE(review): _deprecated_alias is defined in bs4._deprecation;
# presumably it forwards to the new attribute and warns -- confirm there.
nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0")
previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0")
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
    """Deep-copy hook; this base implementation always raises.

    :param memo: The memo dictionary supplied by the copy protocol.
    :param recursive: Unused in this implementation.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
def __copy__(self) -> Self:
    """Return a copy of this element by delegating to `__deepcopy__`.

    A copy of a PageElement can only be a deep copy, because only one
    PageElement can occupy a given place in a parse tree.
    """
    memo: Dict[Any, Any] = {}
    return self.__deepcopy__(memo)
# Placeholder default for the ``types`` argument of `_all_strings` and
# `get_text`.
default: Iterable[type[NavigableString]] = ()  #: :meta private:

def _all_strings(
    self, strip: bool = False, types: Iterable[type[NavigableString]] = default
) -> Iterator[str]:
    """Yield all strings of certain classes, possibly stripping them.

    `Tag` and `NavigableString` each provide their own implementation;
    this base version only raises.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError
@property
def stripped_strings(self) -> Iterator[str]:
    """Yield every interesting string in this PageElement, with
    whitespace stripped first.

    See `Tag` for information on which strings are considered
    interesting in a given context.
    """
    yield from self._all_strings(True)
def get_text(
    self,
    separator: str = "",
    strip: bool = False,
    types: Iterable[Type[NavigableString]] = default,
) -> str:
    """Join all child strings of this PageElement with a separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)

# Alternate spellings: ``getText`` is the same function, and ``text``
# exposes it as a read-only property that uses the default arguments.
getText = get_text
text = property(get_text)
def replace_with(self, *args: _InsertableElement) -> Self:
    """Replace this `PageElement` with one or more other elements,
    keeping the rest of the tree the same.

    :param args: The replacements, inserted in order at the position
        this element occupied.
    :return: This `PageElement`, no longer part of the tree.
    :raises ValueError: If this element has no parent, or if one of the
        replacements is this element's parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree."
        )
    if len(args) == 1 and args[0] is self:
        # Replacing an element with itself is a no-op.
        return self
    for candidate in args:
        if candidate is parent:
            raise ValueError("Cannot replace a Tag with its parent.")

    # Record the position before extracting, then fill it with the
    # replacements one after another.
    position = parent.index(self)
    self.extract(_self_index=position)
    for offset, replacement in enumerate(args):
        parent.insert(position + offset, replacement)
    return self

replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0")
def wrap(self, wrap_inside: Tag) -> Tag:
    """Wrap this `PageElement` inside a `Tag`.

    :param wrap_inside: The tag that takes this element's place and
        then receives this element as a child.
    :return: ``wrap_inside``, occupying the position in the tree that used
        to be occupied by this object, and with this object now inside it.
    """
    detached = self.replace_with(wrap_inside)
    wrap_inside.append(detached)
    return wrap_inside
def extract(self, _self_index: Optional[int] = None) -> Self:
    """Destructively rips this element out of the tree.

    The element is removed from its parent's ``contents``, and both the
    parse-order chain (``previous_element`` / ``next_element``) and the
    sibling chain are closed around the gap it leaves. Links inside the
    extracted subtree are left alone.

    :param _self_index: The location of this element in its parent's
        .contents, if known. Passing this in allows for a performance
        optimization.

    :return: this `PageElement`, no longer part of the tree.
    """
    if self.parent is not None:
        if _self_index is None:
            _self_index = self.parent.index(self)
        del self.parent.contents[_self_index]

    # Find the two elements that would be next to each other if
    # this element (and any children) hadn't been parsed. Connect
    # the two.
    last_child = self._last_descendant()

    # last_child can't be None because _last_descendant is called with
    # its default accept_self=True. Worst case, last_child will be
    # self. Making this cast removes several mypy complaints later
    # on as we manipulate last_child.
    last_child = cast(PageElement, last_child)
    next_element = last_child.next_element

    # Splice the parse-order chain. The identity checks skip the case
    # where both ends are the same object, so that an element is never
    # linked to itself.
    if self.previous_element is not None:
        if self.previous_element is not next_element:
            self.previous_element.next_element = next_element
    if next_element is not None and next_element is not self.previous_element:
        next_element.previous_element = self.previous_element
    self.previous_element = None
    last_child.next_element = None

    # Detach from the parent and splice the sibling chain the same way.
    self.parent = None
    if (
        self.previous_sibling is not None
        and self.previous_sibling is not self.next_sibling
    ):
        self.previous_sibling.next_sibling = self.next_sibling
    if (
        self.next_sibling is not None
        and self.next_sibling is not self.previous_sibling
    ):
        self.next_sibling.previous_sibling = self.previous_sibling
    self.previous_sibling = self.next_sibling = None
    return self
def decompose(self) -> None:
    """Recursively destroy this `PageElement` and its children.

    The element is extracted from the tree, and then it and everything
    after it in its ``next_element`` chain is wiped out.

    The behavior of a decomposed `PageElement` is undefined and you
    should never use one for anything, but if you need to *check*
    whether an element has been decomposed, you can use the
    `PageElement.decomposed` property.
    """
    self.extract()
    node: _AtMostOneElement = self
    while node is not None:
        # Read the link before clearing; wiping __dict__ destroys it.
        following: _AtMostOneElement = node.next_element
        node.__dict__.clear()
        if isinstance(node, Tag):
            node.name = ""
            node.contents = []
        node._decomposed = True
        node = following
def _last_descendant(
    self, is_initialized: bool = True, accept_self: bool = True
) -> _AtMostOneElement:
    """Finds the last element beneath this object to be parsed.

    Special note to help you figure things out if your type
    checking is tripped up by the fact that this method returns
    _AtMostOneElement instead of PageElement: the only time
    this method returns None is if `accept_self` is False and the
    `PageElement` has no children--either it's a NavigableString
    or an empty Tag.

    :param is_initialized: Has `PageElement.setup` been called on
        this `PageElement` yet?

    :param accept_self: Is ``self`` an acceptable answer to the
        question?

    :return: The last descendant; ``self`` if there is none and
        ``accept_self`` is true; otherwise None.
    """
    if is_initialized and self.next_sibling is not None:
        # Shortcut: whatever was parsed immediately before our next
        # sibling is taken as the answer.
        last_child = self.next_sibling.previous_element
    else:
        # Otherwise descend through the last child at each level until
        # reaching something that is not a Tag with contents.
        last_child = self
        while isinstance(last_child, Tag) and last_child.contents:
            last_child = last_child.contents[-1]
    if not accept_self and last_child is self:
        last_child = None
    return last_child

# Old camelCase spelling, marked as deprecated since 4.0.0.
_lastRecursiveChild = _deprecated_alias(
    "_lastRecursiveChild", "_last_descendant", "4.0.0"
)
def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """Make the given element(s) the immediate predecessor of this one.

    All the elements will have the same `PageElement.parent` as
    this one, and the given elements will occur immediately before
    this one, in the order given.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.
    :raises ValueError: If this element has no parent, or if it is
        itself among ``args``.
    """
    parent = self.parent
    if parent is None:
        raise ValueError("Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")

    inserted: List[PageElement] = []
    for predecessor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        # Our own position is looked up again on every pass.
        position = parent.index(self)
        inserted.extend(parent.insert(position, predecessor))
    return inserted
def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """Make the given element(s) the immediate successor of this one.

    The elements will have the same `PageElement.parent` as this
    one, and the given elements will occur immediately after this
    one, in the order given.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.
    :raises ValueError: If this element has no parent, or if it is
        itself among ``args``.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError("Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    inserted: List[PageElement] = []
    # ``already_placed`` counts the successors inserted so far, so each
    # new one lands after them rather than directly after ``self``.
    for already_placed, successor in enumerate(args):
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        position = parent.index(self)
        inserted.extend(parent.insert(position + 1 + already_placed, successor))
    return inserted
# For the suppression of this pyright warning, see discussion here:
# https://github.com/microsoft/pyright/issues/10929
# Typing overload: without a ``string`` filter the result is annotated
# as at most one Tag.
@overload
def find_next( # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None=None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

# Typing overload: with only a ``string`` filter the result is annotated
# as at most one NavigableString.
@overload
def find_next(
    self,
    name: None=None,
    attrs: None=None,
    string: _StrainableString="",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
    ...

def find_next(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
    """Find the first PageElement that matches the given criteria and
    appears later in the document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param kwargs: Additional filters on attribute values.
    :return: Whatever ``_find_one`` produces from `find_all_next`;
        presumably the first match, or None when nothing matches.
    """
    return self._find_one(self.find_all_next, name, attrs, string, **kwargs)

# Old camelCase spelling, marked as deprecated since 4.0.0.
findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0")
# Typing overload: without a ``string`` filter the results are annotated
# as Tags.
@overload
def find_all_next( # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

# Typing overload: with only a ``string`` filter the results are
# annotated as NavigableStrings.
@overload
def find_all_next(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def find_all_next(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Find all `PageElement` objects that match the given criteria and
    appear later in the document than this `PageElement`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :param kwargs: Additional filters on attribute values.
    :return: The result of ``_find_all`` applied to ``self.next_elements``.
    """
    # One is added to _stacklevel to account for this extra call frame.
    return self._find_all(
        name,
        attrs,
        string,
        limit,
        self.next_elements,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )

# Old camelCase spelling, marked as deprecated since 4.0.0.
findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0")
# Typing overload: without a ``string`` filter the result is annotated
# as at most one Tag.
@overload
def find_next_sibling( # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None=None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

# Typing overload: with only a ``string`` filter the result is annotated
# as at most one NavigableString.
@overload
def find_next_sibling(
    self,
    name: None=None,
    attrs: None=None,
    string: _StrainableString="",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
    ...

def find_next_sibling(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
    """Find the closest sibling to this PageElement that matches the
    given criteria and appears later in the document.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param kwargs: Additional filters on attribute values.
    :return: Whatever ``_find_one`` produces from `find_next_siblings`;
        presumably the first match, or None when nothing matches.
    """
    return self._find_one(self.find_next_siblings, name, attrs, string, **kwargs)

# Old camelCase spelling, marked as deprecated since 4.0.0.
findNextSibling = _deprecated_function_alias(
    "findNextSibling", "find_next_sibling", "4.0.0"
)
# Typing overload: without a ``string`` filter the results are annotated
# as Tags.
@overload
def find_next_siblings( # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

# Typing overload: with only a ``string`` filter the results are
# annotated as NavigableStrings.
@overload
def find_next_siblings(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def find_next_siblings(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Find all siblings of this `PageElement` that match the given criteria
    and appear later in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :param kwargs: Additional filters on attribute values.
    :return: The result of ``_find_all`` applied to ``self.next_siblings``.
    """
    # One is added to _stacklevel to account for this extra call frame.
    return self._find_all(
        name,
        attrs,
        string,
        limit,
        self.next_siblings,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )

# Older spellings of find_next_siblings: the camelCase name is marked
# as deprecated since 4.0.0 and the ``fetch`` name since 3.0.0.
findNextSiblings = _deprecated_function_alias(
    "findNextSiblings", "find_next_siblings", "4.0.0"
)
fetchNextSiblings = _deprecated_function_alias(
    "fetchNextSiblings", "find_next_siblings", "3.0.0"
)
# Typing overload: without a ``string`` filter the result is annotated
# as at most one Tag.
@overload
def find_previous( # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None=None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

# Typing overload: with only a ``string`` filter the result is annotated
# as at most one NavigableString.
@overload
def find_previous(
    self,
    name: None=None,
    attrs: None=None,
    string: _StrainableString="",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
    ...

def find_previous(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
    """Look backwards in the document from this `PageElement` and find the
    first `PageElement` that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param kwargs: Additional filters on attribute values.
    :return: Whatever ``_find_one`` produces from `find_all_previous`;
        presumably the first match, or None when nothing matches.
    """
    return self._find_one(self.find_all_previous, name, attrs, string, **kwargs)

# Old camelCase spelling.
# NOTE(review): the version given here is "3.0.0", while the other
# camelCase find* aliases in this class say "4.0.0" -- confirm which is
# intended.
findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0")
@overload
def find_all_previous(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags: ...

@overload
def find_all_previous(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings: ...

def find_all_previous(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Collect every matching `PageElement` found when walking
    backwards through the document from this one.

    The candidates are the elements produced by `previous_elements`;
    the matching itself is delegated to `_find_all`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    earlier_elements = self.previous_elements
    # One extra frame sits between the caller and _find_all.
    return self._find_all(
        name,
        attrs,
        string,
        limit,
        earlier_elements,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
1061 findAllPrevious = _deprecated_function_alias(
1062 "findAllPrevious", "find_all_previous", "4.0.0"
1063 )
1064 fetchAllPrevious = _deprecated_function_alias(
1065 "fetchAllPrevious", "find_all_previous", "3.0.0"
1066 )
@overload
def find_previous_sibling(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag: ...

@overload
def find_previous_sibling(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString: ...

def find_previous_sibling(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Return the nearest earlier sibling of this `PageElement` that
    matches the given criteria, or None if there is no such sibling.

    This is the single-result counterpart of `find_previous_siblings`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    multi_finder = self.find_previous_siblings
    return self._find_one(multi_finder, name, attrs, string, **kwargs)
1110 findPreviousSibling = _deprecated_function_alias(
1111 "findPreviousSibling", "find_previous_sibling", "4.0.0"
1112 )
@overload
def find_previous_siblings(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags: ...

@overload
def find_previous_siblings(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings: ...

def find_previous_siblings(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Collect every earlier sibling of this `PageElement` that matches.

    The candidates are the elements produced by `previous_siblings`;
    the matching itself is delegated to `_find_all`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    earlier_siblings = self.previous_siblings
    # One extra frame sits between the caller and _find_all.
    return self._find_all(
        name,
        attrs,
        string,
        limit,
        earlier_siblings,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
1170 findPreviousSiblings = _deprecated_function_alias(
1171 "findPreviousSiblings", "find_previous_siblings", "4.0.0"
1172 )
1173 fetchPreviousSiblings = _deprecated_function_alias(
1174 "fetchPreviousSiblings", "find_previous_siblings", "3.0.0"
1175 )
def find_parent(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    """Find the closest parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.
    :return: The first result reported by `find_parents`, or None if
        there are no results.
    """
    # NOTE: We can't use _find_one because find_parents takes a different
    # set of arguments (it has no `string` parameter). The limit of 1
    # is passed positionally, exactly as _find_one would do.
    matches = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None
1205 findParent = _deprecated_function_alias("findParent", "find_parent", "4.0.0")
def find_parents(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    """Collect every parent of this `PageElement` that matches.

    The candidates are the elements produced by `parents`; no string
    filter is applied (None is passed to `_find_all` in that position).

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    ancestors = self.parents
    matches = self._find_all(
        name,
        attrs,
        None,
        limit,
        ancestors,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
    # Only Tags can have children, so this ResultSet will contain
    # nothing but Tags; the cast tells the type checker so.
    return cast(ResultSet[Tag], matches)
1233 findParents = _deprecated_function_alias("findParents", "find_parents", "4.0.0")
1234 fetchParents = _deprecated_function_alias("fetchParents", "find_parents", "3.0.0")
@property
def next(self) -> _AtMostOneElement:
    """Alias for `next_element`: the `PageElement`, if any, that was
    parsed just after this one.
    """
    return self.next_element
@property
def previous(self) -> _AtMostOneElement:
    """Alias for `previous_element`: the `PageElement`, if any, that
    was parsed just before this one.
    """
    return self.previous_element
1246 # These methods do the real heavy lifting.
def _find_one(
    self,
    # TODO-TYPING: "There is no syntax to indicate optional or
    # keyword arguments; such function types are rarely used
    # as callback types." - So, not sure how to get more
    # specific here.
    method: Callable,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes],
    string: Optional[_StrainableString],
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Run a multi-result find method with a limit of 1 and unwrap
    the answer.

    :param method: The find-all style callable to invoke. It is called
        as ``method(name, attrs, string, 1, _stacklevel=4, **kwargs)``.
    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    :return: The first element of the result, or None if the result
        is empty.
    """
    matches: _QueryResults = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
    return matches[0] if matches else None
def _find_all(
    self,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes],
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Walk `generator` and collect the elements that match.

    :param name: A filter on tag name, or an `ElementFilter` to use
        as the matcher directly.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The candidate elements, in search order.
    :param _stacklevel: Stack level passed to `warnings.warn` for the
        warnings issued here.
    :kwargs: Additional filters on attribute values. A ``text`` entry
        is treated as a deprecated spelling of ``string``.
    """
    # 'text' is the deprecated name for 'string'; honor it only when
    # 'string' itself was not supplied.
    if string is None and "text" in kwargs:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    if "_class" in kwargs:
        warnings.warn(
            AttributeResemblesVariableWarning.MESSAGE
            % {"original": "_class", "autocorrect": "class_"},
            AttributeResemblesVariableWarning,
            stacklevel=_stacklevel,
        )

    from bs4.filter import ElementFilter

    matcher = (
        name
        if isinstance(name, ElementFilter)
        else SoupStrainer(name, attrs, string, **kwargs)
    )

    # The fast paths below apply only when the tag name is the sole
    # criterion and there is no limit.
    name_is_only_criterion = (
        string is None and not limit and not attrs and not kwargs
    )

    if name_is_only_criterion and (name is True or name is None):
        # Optimization to find all tags.
        every_tag: MutableSequence[_OneElement] = [
            candidate for candidate in generator if isinstance(candidate, Tag)
        ]
        return ResultSet(matcher, every_tag)

    if name_is_only_criterion and isinstance(name, str):
        # Optimization to find all tags with a given name.
        prefix: Optional[str]
        if name.count(":") == 1:
            # This is a name with a prefix. If this is a namespace-aware document,
            # we need to match the local name against tag.name. If not,
            # we need to match the fully-qualified name against tag.name.
            prefix, local_name = name.split(":", 1)
        else:
            prefix, local_name = None, name
        named_tags: MutableSequence[_OneElement] = [
            candidate
            for candidate in generator
            if isinstance(candidate, Tag)
            and (
                candidate.name == name
                or (
                    candidate.name == local_name
                    and (prefix is None or candidate.prefix == prefix)
                )
            )
        ]
        return ResultSet(matcher, named_tags)

    # General case: let the matcher do the filtering.
    return matcher.find_all(generator, limit)
1332 # These generators can be used to navigate starting from both
1333 # NavigableStrings and Tags.
@property
def next_elements(self) -> Iterator[PageElement]:
    """Yield every PageElement that was parsed after this one, by
    following the chain of `next_element` links until it ends.
    """
    node = self.next_element
    while node is not None:
        # Read the following link before yielding; presumably this
        # keeps the walk intact if the consumer alters `node`.
        upcoming = node.next_element
        yield node
        node = upcoming
@property
def self_and_next_elements(self) -> Iterator[PageElement]:
    """Yield this PageElement (see `_self_and`), followed by everything
    from `next_elements`.
    """
    following = self.next_elements
    return self._self_and(following)
@property
def next_siblings(self) -> Iterator[PageElement]:
    """Yield every sibling of this PageElement that was parsed later,
    by following the chain of `next_sibling` links until it ends.
    """
    node = self.next_sibling
    while node is not None:
        # Read the following link before yielding; presumably this
        # keeps the walk intact if the consumer alters `node`.
        upcoming = node.next_sibling
        yield node
        node = upcoming
@property
def self_and_next_siblings(self) -> Iterator[PageElement]:
    """Yield this PageElement (see `_self_and`), followed by everything
    from `next_siblings`.
    """
    following = self.next_siblings
    return self._self_and(following)
@property
def previous_elements(self) -> Iterator[PageElement]:
    """Yield every PageElement that was parsed before this one, by
    following the chain of `previous_element` links until it ends.

    :yield: A sequence of PageElements.
    """
    node = self.previous_element
    while node is not None:
        # Read the preceding link before yielding; presumably this
        # keeps the walk intact if the consumer alters `node`.
        earlier = node.previous_element
        yield node
        node = earlier
@property
def self_and_previous_elements(self) -> Iterator[PageElement]:
    """Yield this PageElement (see `_self_and`), followed by everything
    from `previous_elements`.
    """
    preceding = self.previous_elements
    return self._self_and(preceding)
@property
def previous_siblings(self) -> Iterator[PageElement]:
    """Yield every sibling of this PageElement that was parsed earlier,
    by following the chain of `previous_sibling` links until it ends.

    :yield: A sequence of PageElements.
    """
    node = self.previous_sibling
    while node is not None:
        # Read the preceding link before yielding; presumably this
        # keeps the walk intact if the consumer alters `node`.
        earlier = node.previous_sibling
        yield node
        node = earlier
@property
def self_and_previous_siblings(self) -> Iterator[PageElement]:
    """Yield this PageElement (see `_self_and`), followed by everything
    from `previous_siblings`.
    """
    preceding = self.previous_siblings
    return self._self_and(preceding)
@property
def parents(self) -> Iterator[Tag]:
    """Yield each parent of this PageElement in turn, by following
    the chain of `parent` links until it ends.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    ancestor = self.parent
    while ancestor is not None:
        # Read the next link before yielding; presumably this keeps
        # the walk intact if the consumer alters `ancestor`.
        higher = ancestor.parent
        yield ancestor
        ancestor = higher
@property
def self_and_parents(self) -> Iterator[PageElement]:
    """Yield this element (see `_self_and`), followed by everything
    from `parents`.

    :yield: A sequence of PageElements, ending with a BeautifulSoup object.
    """
    ancestors = self.parents
    return self._self_and(ancestors)
def _self_and(self, other_generator: Iterator[PageElement]) -> Iterator[PageElement]:
    """Prefix another generator's output with this element.

    This element is yielded first unless its `hidden` attribute is
    true; after that, everything produced by `other_generator` is
    yielded in order.
    """
    if not self.hidden:
        yield self
    yield from other_generator
@property
def decomposed(self) -> bool:
    """Report whether this PageElement has been decomposed.

    Reads the `_decomposed` attribute; a missing or falsy value is
    reported as False.
    """
    flag = getattr(self, "_decomposed", False)
    return flag if flag else False
@_deprecated("next_elements", "4.0.0")
def nextGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of the `next_elements` property.

    :meta private:
    """
    return self.next_elements
@_deprecated("next_siblings", "4.0.0")
def nextSiblingGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of the `next_siblings` property.

    :meta private:
    """
    return self.next_siblings
@_deprecated("previous_elements", "4.0.0")
def previousGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of the `previous_elements` property.

    :meta private:
    """
    return self.previous_elements
@_deprecated("previous_siblings", "4.0.0")
def previousSiblingGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of the `previous_siblings` property.

    :meta private:
    """
    return self.previous_siblings
@_deprecated("parents", "4.0.0")
def parentGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of the `parents` property.

    :meta private:
    """
    return self.parents
class NavigableString(str, PageElement):
    """A Python string that is part of a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    #: A string prepended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '<!--'
    #: in an HTML comment.
    PREFIX: str = ""

    #: A string appended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '-->'
    #: in an HTML comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        # Anything that is not already a str is decoded using the
        # default output encoding.
        decode_args = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *decode_args)
        instance.hidden = False
        instance.setup()
        return instance

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """Build a new string of the same class and contents that is
        not connected to the parse tree.

        :param recursive: This parameter is ignored; it's only defined
           so that NavigableString.__deepcopy__ implements the same
           signature as Tag.__deepcopy__.
        """
        own_class = type(self)
        return own_class(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Return the plain-`str` contents as the sole constructor argument."""
        plain = str(self)
        return (plain,)

    # TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
    # is introduced in 3.8. This can be changed once 3.7 support is dropped.
    def __getitem__(self, key: Union[int|slice]) -> str:  # type:ignore
        """Index or slice like an ordinary string.

        :raises TypeError: If ``key`` is a string, with a message
            hinting that the caller may have expected a `Tag`.
        """
        if not isinstance(key, str):
            return super().__getitem__(key)
        raise TypeError("string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(key.__class__.__name__))

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: This property always returns the `NavigableString` it was
           called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Format the string with the given formatter, then wrap it in
        this class's `PREFIX` and `SUFFIX`.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self) -> None:
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Prevent NavigableString.name from ever being set.

        :raises AttributeError: Always.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This makes it easy for NavigableString to implement methods
        like get_text() as conveniences, creating a consistent
        text-extraction API across all PageElements.

        :param strip: If True, all strings will be stripped before being
            yielded.

        :param types: A tuple of NavigableString subclasses. If this
            NavigableString isn't one of those subclasses, the
            sequence will be empty. By default, the subclasses
            considered are NavigableString and CData objects. That
            means no comments, processing instructions, etc.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # This is kept in Tag because it's full of subclasses of
            # this class, which aren't defined until later in the file.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        # Do nothing if the caller is looking for specific types of
        # string, and we're of a different type.
        #
        # We check specific types instead of using isinstance(self,
        # types) because all of these classes subclass
        # NavigableString. Anyone who's using this feature probably
        # wants generic NavigableStrings but not other stuff.
        own_type = type(self)
        if types is not None:
            if isinstance(types, type):
                # Looking for a single type.
                wanted = own_type is types
            else:
                # Looking for one of a list of types.
                wanted = own_type in types
            if not wanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        This is defined the way it is for compatibility with
        `Tag.strings`. See `Tag` for information on which strings are
        interesting in a given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()
class PreformattedString(NavigableString):
    """A `NavigableString` not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (`Comment`) and CDATA blocks (`CData`).
    """

    PREFIX: str = ""
    SUFFIX: str = ""

    def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
        """Wrap this string in the class's `PREFIX` and `SUFFIX`.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters. The string will be passed into the
            `Formatter`, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # Called for its side effects only; the string itself is
            # emitted unformatted below.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
    """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_.

    Output is wrapped in ``<![CDATA[`` and ``]]>``.
    """

    PREFIX: str = "<![CDATA["
    SUFFIX: str = "]]>"
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction.

    Output is wrapped in ``<?`` and ``>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = ">"
class XMLProcessingInstruction(ProcessingInstruction):
    """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_.

    Output is wrapped in ``<?`` and ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"
class Comment(PreformattedString):
    """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_.

    Output is wrapped in ``<!--`` and ``-->``.
    """

    PREFIX: str = "<!--"
    SUFFIX: str = "-->"
class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_.

    Output is wrapped in ``<?`` and ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"
class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_."""

    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Build a `Doctype` for the given root element name, public ID
        and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        """
        declaration = cls._string_for_name_and_ids(name, pub_id, system_id)
        return Doctype(declaration)

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Generate a string to be used as the basis of a Doctype object.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :return: The name (or an empty string if the name is falsy),
            followed by a PUBLIC clause when ``pub_id`` is given, or a
            SYSTEM clause when only ``system_id`` is given.
        """
        declaration = name or ""
        if pub_id is None:
            # No public ID: at most a SYSTEM clause is added.
            if system_id is not None:
                declaration += ' SYSTEM "%s"' % system_id
            return declaration

        declaration += ' PUBLIC "%s"' % pub_id
        if system_id is not None:
            # After a PUBLIC clause the system ID is appended bare.
            declaration += ' "%s"' % system_id
        return declaration
class Stylesheet(NavigableString):
    """A `NavigableString` holding the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    Having a dedicated class lets embedded stylesheets be told apart
    from textual content.
    """
class Script(NavigableString):
    """A `NavigableString` holding the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    Having a dedicated class lets executable code be told apart from
    textual content.
    """
class TemplateString(NavigableString):
    """A `NavigableString` for a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    Having a dedicated class lets such strings be told apart from the
    main body of the document.
    """
class RubyTextString(NavigableString):
    """A `NavigableString` holding the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    Having a dedicated class lets such strings be told apart from the
    strings they're annotating.
    """
class RubyParenthesisString(NavigableString):
    """A `NavigableString` holding the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.
    """
1764class Tag(PageElement):
1765 """An HTML or XML tag that is part of a parse tree, along with its
1766 attributes, contents, and relationships to other parts of the tree.
1768 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
1769 create a `Tag` object representing the ``<b>`` tag. You can
1770 instantiate `Tag` objects directly, but it's not necessary unless
1771 you're adding entirely new markup to a parsed document. Most of
1772 the constructor arguments are intended for use by the `TreeBuilder`
1773 that's parsing a document.
1775 :param parser: A `BeautifulSoup` object representing the parse tree this
1776 `Tag` will be part of.
1777 :param builder: The `TreeBuilder` being used to build the tree.
1778 :param name: The name of the tag.
1779 :param namespace: The URI of this tag's XML namespace, if any.
1780 :param prefix: The prefix for this tag's XML namespace, if any.
1781 :param attrs: A dictionary of attribute values.
1782 :param parent: The `Tag` to use as the parent of this `Tag`. May be
1783 the `BeautifulSoup` object itself.
1784 :param previous: The `PageElement` that was parsed immediately before
1785 parsing this tag.
1786 :param is_xml: If True, this is an XML tag. Otherwise, this is an
1787 HTML tag.
1788 :param sourceline: The line number where this tag was found in its
1789 source document.
1790 :param sourcepos: The character position within ``sourceline`` where this
1791 tag was found.
1792 :param can_be_empty_element: If True, this tag should be
1793 represented as <tag/>. If False, this tag should be represented
1794 as <tag></tag>.
1795 :param cdata_list_attributes: A dictionary of attributes whose values should
1796 be parsed as lists of strings if they ever show up on this tag.
1797 :param preserve_whitespace_tags: Names of tags whose contents
1798 should have their whitespace preserved if they are encountered inside
1799 this tag.
1800 :param interesting_string_types: When iterating over this tag's
1801 string contents in methods like `Tag.strings` or
1802 `PageElement.get_text`, these are the types of strings that are
1803 interesting enough to be considered. By default,
1804 `NavigableString` (normal strings) and `CData` (CDATA
1805 sections) are the only interesting string subtypes.
1806 :param namespaces: A dictionary mapping currently active
1807 namespace prefixes to URIs, as of the point in the parsing process when
1808 this tag was encountered. This can be used later to
1809 construct CSS selectors.
1811 """
def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Initialize a `Tag`. The arguments are described in the class
    docstring.

    When a ``builder`` is given, the attribute container classes, the
    XML flag, and the empty-element / cdata-list / whitespace /
    string-container settings are all taken from it. Otherwise the
    corresponding constructor arguments are stored as given.

    :raises ValueError: If ``name`` is None.
    """
    # We don't actually store the parser object: that lets extracted
    # chunks be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__

    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # NOTE(review): this used to be an if/else keyed on
    # builder.store_line_numbers whose two branches made these same
    # assignments, so the values were always stored as passed in.
    # Confirm whether the position was meant to be dropped when the
    # builder does not store line numbers.
    self.sourceline = sourceline
    self.sourcepos = sourcepos

    # Choose the container classes for attributes and for
    # multi-valued attribute values.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        attr_dict_class = XMLAttributeDict if is_xml else HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    elif builder is not None and builder.cdata_list_attributes:
        self.attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs
        )
    else:
        self.attrs = attr_dict_class()
        # Make sure that the values of any multi-valued
        # attributes (e.g. when a Tag is copied) are stored in
        # new lists.
        for key, value in attrs.items():
            if isinstance(value, list):
                value = value.__class__(value)
            self.attrs[key] = value

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    self.known_xml = builder.is_xml if builder else is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
        return

    # Set up any substitutions for this tag, such as the charset in a META tag.
    # (attribute_value_list_class was already taken from the builder above.)
    builder.set_up_substitutions(self)

    # Ask the TreeBuilder whether this tag might be an empty-element tag.
    self.can_be_empty_element = builder.can_be_empty_element(name)

    # Keep track of the list of attributes of this tag that
    # might need to be treated as a list.
    #
    # For performance reasons, we store the whole data structure
    # rather than asking the question of every tag. Asking would
    # require building a new data structure every time, and
    # (unlike can_be_empty_element), we almost never need
    # to check this.
    self.cdata_list_attributes = builder.cdata_list_attributes

    # Keep track of the names that might cause this tag to be treated as a
    # whitespace-preserved tag.
    self.preserve_whitespace_tags = builder.preserve_whitespace_tags

    if self.name in builder.string_containers:
        # This sort of tag uses a special string container
        # subclass for most of its strings. We need to be able
        # to look up the proper container subclass.
        self.interesting_string_types = {builder.string_containers[self.name]}
    else:
        self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES
# Type declarations for the attributes carried by every Tag instance.
parser_class: Optional[type[BeautifulSoup]]
name: str
namespace: Optional[str]
# Namespace prefix; _format_tag renders it as "prefix:" before the name.
prefix: Optional[str]
attrs: _AttributeValues
sourceline: Optional[int]
sourcepos: Optional[int]
# Taken from builder.is_xml when a builder was supplied, otherwise from
# the is_xml constructor argument.
known_xml: Optional[bool]
# The direct children of this tag, in order.
contents: List[PageElement]
# When True, _format_tag renders nothing for the tag itself.
hidden: bool
# Default set of string classes yielded by _all_strings; None means
# "fall back to MAIN_CONTENT_STRING_TYPES".
interesting_string_types: Optional[Set[Type[NavigableString]]]

# Consulted by is_empty_element: only a value of exactly True allows a
# childless tag to count as an empty-element tag.
can_be_empty_element: Optional[bool]
# Copied from the builder; describes attributes that might need to be
# treated as a list of values.
cdata_list_attributes: Optional[Dict[str, Set[str]]]
# Tag names for which _should_pretty_print returns False.
preserve_whitespace_tags: Optional[Set[str]]

#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """Return a new Tag, detached from any parse tree, that copies this one.

    :param memo: The memo dictionary of the deepcopy protocol; it is
        forwarded unchanged to each descendant's ``__deepcopy__``.
    :param recursive: If False, only this tag is copied (via
        `copy_self`) and its contents are left out.
    :return: The copy.
    """
    clone = self.copy_self()
    if not recursive:
        return clone

    # Walk the descendants as a flat event stream so that deep trees are
    # copied without recursive function calls. The last entry of
    # open_clones is the copy currently receiving children.
    open_clones: List[Tag] = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # That tag is finished; later elements belong to its parent.
            open_clones.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        open_clones[-1].append(copied)
        if event is Tag.START_ELEMENT_EVENT:
            # The children of this tag come next in the stream.
            open_clones.append(cast(Tag, copied))
    return clone
def copy_self(self) -> Self:
    """Build a childless copy of this Tag that belongs to no parse tree.

    The deepcopy process starts with this step, but it is also useful
    on its own when the contents of the Tag should not be copied.

    :return: A new object of the same class as this one.
    """
    # No parser and no builder are passed, so the per-tag settings are
    # handed over explicitly as keyword arguments.
    settings = dict(
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    tag_class = type(self)
    clone = tag_class(
        None, None, self.name, self.namespace, self.prefix, self.attrs, **settings
    )
    clone.can_be_empty_element = self.can_be_empty_element
    clone.hidden = self.hidden
    return clone
@property
def is_empty_element(self) -> bool:
    """Whether this tag is an empty-element (self-closing) tag.

    A tag with contents is never an empty-element tag. A tag without
    contents is one only when `can_be_empty_element` is exactly
    ``True``; that flag normally comes from the `TreeBuilder` that
    created the tag. Builders with a designated list of empty-element
    tags (usually HTML) allow it only for listed names, while builders
    without such a list (usually XML) allow it for any childless tag.
    """
    if self.contents:
        return False
    return self.can_be_empty_element is True
@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    ": :meta private:"
    # Deprecated spelling; the answer comes from is_empty_element.
    answer = self.is_empty_element
    return answer
@property
def string(self) -> Optional[str]:
    """The single string inside this `Tag`, if there is exactly one.

    :return: When the only child is a `NavigableString`, that string.
       When the only child is a `Tag`, that child's own `Tag.string`
       (so the lookup continues downwards). With no children, or with
       more than one, ``None``.

    An unexpected ``None`` usually means the `Tag` holds more than
    one thing.
    """
    children = self.contents
    if len(children) != 1:
        return None
    (only_child,) = children
    if isinstance(only_child, NavigableString):
        return only_child
    if isinstance(only_child, Tag):
        return only_child.string
    return None

@string.setter
def string(self, string: str) -> None:
    """Discard the current `Tag.contents` and store a single string instead."""
    self.clear()
    # A NavigableString (or subclass) keeps its own class; a plain str
    # is wrapped in a NavigableString.
    wrapper = string.__class__ if isinstance(string, NavigableString) else NavigableString
    self.append(wrapper(string))
#: :meta private:
MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}

def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Yield the strings beneath this tag that belong to selected classes.

    :param strip: If True, each string is stripped before it is
        yielded, and strings that end up empty are skipped.

    :param types: A single NavigableString subclass, or a collection
        of them; strings whose exact class is not selected are
        ignored. When left at its default, the classes in
        self.interesting_string_types are used, and if that is None,
        MAIN_CONTENT_STRING_TYPES (NavigableString and CData). In
        that case comments, processing instructions and the like
        are left out.
    """
    if types is self.default:
        types = self.interesting_string_types
        if types is None:
            types = self.MAIN_CONTENT_STRING_TYPES

    for node in self.descendants:
        if not isinstance(node, NavigableString):
            continue

        # Selection is by exact class, not by isinstance().
        node_class = type(node)
        if isinstance(types, type):
            selected = node_class is types
        else:
            selected = types is None or node_class in types
        if not selected:
            continue

        if not strip:
            yield node
            continue
        text = node.strip()
        if text:
            yield text

strings = property(_all_strings)
def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Make one or more PageElements children of this `Tag`.

    The behaviour resembles :py:meth:`list.insert`, but several
    elements may be given in one call.

    :param position: The index in this Tag's `Tag.children` that the
        first new `PageElement` should occupy. Each following element
        is requested one index further along.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    for offset, child in enumerate(new_children):
        inserted.extend(self._insert(position + offset, child))
    return inserted
def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element into this Tag's contents.

    Besides placing ``new_child`` in ``self.contents``, this rewires
    ``parent``, ``previous_sibling``/``next_sibling`` and
    ``previous_element``/``next_element`` on the affected elements.

    :param position: The requested index; values past the end are
        clamped to ``len(self.contents)``.
    :param new_child: The element to insert. A plain ``str`` is wrapped
        in a `NavigableString`. A `BeautifulSoup` object is never
        inserted itself; its children are inserted instead.
    :return: A list of the elements that were inserted.
    :raise ValueError: If ``new_child`` is None or is this Tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    # Imported here rather than at module level.
    # NOTE(review): presumably to avoid a circular import -- confirm.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the BeautifulSoup's children and
        # return them.
        return self.insert(position, *list(new_child.contents))
    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
            elif current_index == position:
                # We're 'inserting' an element into its current location.
                # This is a no-op.
                return [new_child]
        # The element already has a parent (this tag or another one);
        # take it out of its current place before linking it in here.
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: no previous sibling, and the element that
        # precedes it in document order is this tag itself.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        # In document order the new child follows the last descendant
        # of its previous sibling.
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(
        is_initialized=False, accept_self=True
    )
    # new_childs_last_element can't be None because we passed
    # accept_self=True into _last_descendant. Worst case,
    # new_childs_last_element will be new_child itself. Making
    # this cast removes several mypy complaints later on as we
    # manipulate new_childs_last_element.
    new_childs_last_element = cast(PageElement, new_childs_last_element)

    if position >= len(self.contents):
        # Appending at the end: there is no next sibling.
        new_child.next_sibling = None

        # Climb through this tag and its ancestors until one of them
        # has a next sibling; that sibling is what follows the new
        # subtree in document order.
        parent: Optional[Tag] = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # Inserting before an existing child: that child becomes both
        # the next sibling and the next element after the new subtree.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    # Close the document-order chain in the backwards direction too.
    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = (
            new_childs_last_element
        )
    self.contents.insert(position, new_child)

    return [new_child]
def unwrap(self) -> Self:
    """Put the contents of this `PageElement` in its place.

    :return: This object, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = parent.index(self)
    self.extract(_self_index=slot)

    # Work on a snapshot of the children and insert them last-to-first
    # at the same slot, which leaves them in their original order.
    pending = list(self.contents)
    while pending:
        parent.insert(slot, pending.pop())
    return self

replace_with_children = unwrap
@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    ": :meta private:"
    # Deprecated spelling; the work is done by unwrap().
    unwrapped = self.unwrap()
    return unwrapped
def append(self, tag: _InsertableElement) -> PageElement|List[PageElement]:
    """Add the given `PageElement` at the end of this `Tag`'s contents.

    :param tag: A PageElement. When it is another BeautifulSoup
        object, its contents are inserted into this `Tag` instead,
        because one BeautifulSoup object can't contain another one.

    :return: The appended object, or, when `tag` was a BeautifulSoup
        object, the list of all objects that were appended.
    """
    inserted = self.insert(len(self.contents), tag)
    # TODO: can't reference BeautifulSoup class in this module
    came_from_soup = isinstance(tag, Tag) and tag.name == "[document]"
    return inserted if came_from_soup else inserted[0]
def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If an iterable of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`.

    :return: The list of PageElements that were appended.
    :raise TypeError: If ``tags`` is neither a `Tag`, a single
        `PageElement` or string, nor something that can be iterated
        over.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    else:
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        #
        # list() is used unconditionally instead of an
        # isinstance(tags, Iterable) test: it accepts everything iter()
        # accepts (including sequences that only define __getitem__)
        # and raises a clear TypeError for anything else, where the
        # isinstance test would have left tag_list unassigned.
        tag_list = list(tags)

    results: List[PageElement] = []
    for tag in tag_list:
        appended = self.append(tag)
        if isinstance(appended, list):
            # This can happen if you pass in a mixture of Tag and BeautifulSoup objects.
            results.extend(appended)
        else:
            results.append(appended)

    return results
def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    :param decompose: If False (the default), `PageElement.extract`
       is called on each child. If True, the more destructive
       `PageElement.decompose` is called instead.
    """
    # Iterate over a snapshot of the children rather than the live list.
    for child in list(self.contents):
        remove = child.decompose if decompose else child.extract
        remove()
def smooth(self) -> None:
    """Consolidate runs of consecutive strings among this `Tag`'s
    children, and do the same inside every child `Tag`.

    After many tree modifications, calling this method can make
    pretty-printed output look more natural.
    """

    def mergeable(node: PageElement) -> bool:
        # Ordinary strings can be merged; PreformattedString objects
        # are left alone.
        return isinstance(node, NavigableString) and not isinstance(
            node, PreformattedString
        )

    # Record the index of the first member of every adjacent pair that
    # has to be merged. This avoids copying self.contents, since in
    # most cases very few strings are affected.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Child tags are smoothed recursively.
            current.smooth()
        if position == len(self.contents) - 1:
            # The final child has no right-hand neighbour to merge with.
            continue
        neighbour = self.contents[position + 1]
        if mergeable(current) and mergeable(neighbour):
            merge_points.append(position)

    # Merge from the back so that the recorded positions ahead of the
    # current one stay valid while items leave .contents.
    for position in merge_points[::-1]:
        left = cast(NavigableString, self.contents[position])
        right = cast(NavigableString, self.contents[position + 1])
        right.extract()
        left.replace_with(NavigableString(left + right))
def index(self, element: PageElement) -> int:
    """Return the position of a child of this `Tag`, matched by identity.

    Identity (rather than equality) is used so that two children that
    compare equal as strings are not confused with each other.

    :param element: The `PageElement` to look for in this object's contents.
    :raise ValueError: If `element` is not one of the children.
    """
    positions = (i for i, child in enumerate(self.contents) if child is element)
    found = next(positions, None)
    if found is None:
        raise ValueError("Tag.index: element not in tag")
    return found
def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Look up the value of the attribute named 'key' on this tag.

    :param key: The attribute to look for.
    :param default: Returned when this `Tag` has no such attribute.
    """
    attributes = self.attrs
    return attributes.get(key, default)
def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Like get(), except that the result is always a list.

    :param key: The attribute to look for.
    :param default: Used when this `Tag` has no such attribute.
    :return: A list of strings, usually empty or holding a single
        value. A value that is already a list is returned as is.
    """
    value = self.get(key, default)
    if isinstance(value, list):
        return value
    if value is None:
        return self.attribute_value_list_class()
    if not isinstance(value, str):
        # Only informs the type checker; the value itself is unchanged.
        value = cast(str, value)
    return self.attribute_value_list_class([value])
def has_attr(self, key: str) -> bool:
    """Report whether this `Tag` carries an attribute with the given name."""
    attributes = self.attrs
    return key in attributes
def __hash__(self) -> int:
    # The hash is that of the tag's string rendering.
    return hash(str(self))
def __getitem__(self, key: str) -> _AttributeValue:
    """tag[key] gives the value of the Tag's 'key' attribute; a missing
    attribute raises the exception of the underlying attribute mapping."""
    attributes = self.attrs
    return attributes[key]
def __iter__(self) -> Iterator[PageElement]:
    "Iterating over a Tag walks through its direct contents."
    children = self.contents
    return iter(children)
def __len__(self) -> int:
    "A Tag is as long as its list of contents."
    children = self.contents
    return len(children)
def __contains__(self, x: Any) -> bool:
    # Membership is tested against the direct children only.
    children = self.contents
    return x in children
def __bool__(self) -> bool:
    "A tag is always truthy, even one that has no contents."
    # Defined explicitly so that __len__ does not make a childless tag falsy.
    return True
def __setitem__(self, key: str, value: _AttributeValue) -> None:
    """Assigning to tag[key] stores the value of the tag's 'key'
    attribute."""
    attributes = self.attrs
    attributes[key] = value
def __delitem__(self, key: str) -> None:
    "Deleting tag[key] removes the tag's 'key' attribute, if it has one."
    attributes = self.attrs
    # The default argument keeps a missing key from raising.
    attributes.pop(key, None)
@overload
def __call__(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def __call__(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def __call__(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Calling a Tag like a function delegates to its find_all()
    method. E.g. tag('a') gives a list of all the A tags found within
    this tag."""
    if string is None:
        # Without a string criterion this is a search for tags.
        tags: ResultSet[Tag] = self.find_all(
            name, attrs, recursive, None, limit, _stacklevel, **kwargs
        )
        return tags

    if name is not None or attrs is not None or kwargs:
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        return cast(ResultSet[Tag], self.find_all(name, attrs, recursive, string, limit, _stacklevel, **kwargs))  # type: ignore

    # Only a string criterion was given: this is a search for strings.
    strings: ResultSet[NavigableString] = self.find_all(
        None, None, recursive, string, limit, _stacklevel, **kwargs
    )
    return strings
def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Accessing tag.subtag is the same as calling tag.find(name="subtag")"""
    if len(subtag) > 3 and subtag.endswith("Tag"):
        # BS3 spelling: soup.aTag meant soup.find("a").
        legacy_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % dict(name=legacy_name),
            DeprecationWarning,
            stacklevel=2,
        )
        return self.find(legacy_name)

    # Dunder names are never treated as tag names, and "contents" is
    # special-cased to avoid recursion.
    if subtag.startswith("__") or subtag == "contents":
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, subtag)
        )
    return self.find(subtag)
def __eq__(self, other: Any) -> bool:
    """True iff `other` is a Tag with the same name, the same attributes
    and (recursively) the same contents as this one."""
    if self is other:
        return True
    if not isinstance(other, Tag):
        return False

    has_all_parts = all(
        hasattr(other, part) for part in ("name", "attrs", "contents")
    )
    if not has_all_parts:
        return False
    if self.name != other.name:
        return False
    if self.attrs != other.attrs:
        return False
    if len(self) != len(other):
        return False

    # Same number of children, so they can be compared pairwise.
    for mine, theirs in zip(self.contents, other.contents):
        if mine != theirs:
            return False
    return True
def __ne__(self, other: Any) -> bool:
    """True iff this Tag and `other` are not equal in the sense
    of __eq__."""
    same = self == other
    return not same
def __repr__(self) -> str:
    """The representation of a `Tag` is its rendering as a string."""
    rendered = self.decode()
    return rendered

__str__ = __unicode__ = __repr__
def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` together with its contents as a bytestring.

    :param encoding: The encoding used to produce the bytestring. It
        is also handed to `Tag.decode`, so it may affect the text of
        the document, specifically any encoding declarations inside it.
    :param indent_level: Each line of the rendering is indented this
        many levels. (What a 'level' means in terms of spaces or
        other characters is decided by the ``formatter``.) Used
        internally in recursive calls while pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. The value is passed on to
        :py:meth:`str.encode` and should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    # Render to Unicode first, then encode the Unicode.
    return self.decode(indent_level, encoding, formatter).encode(encoding, errors)
def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing. ``None`` turns pretty-printing off.
    :param eventual_encoding: The encoding you intend to use when
        converting the string to a bytestring. decode() is *not*
        responsible for performing that encoding. This information
        is needed so that a real encoding can be substituted in if
        the document contains an encoding declaration (e.g. in a
        <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
        naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :return: The rendered markup.
    """
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # A literal True is treated as "pretty-print starting at level 0".
    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        # Step 1: render the bare markup for this event.
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            # A closing tag is indented at the level of its opening tag.
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # We are about to enter string literal mode. Add
            # whitespace before this tag, but not after. We
            # will stay in string literal mode until this tag
            # is closed.
            indent_before = True
            indent_after = False
            string_literal_tag = element
        elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # We are about to exit string literal mode by closing
            # the tag that sent us into that mode. Add whitespace
            # after this tag, but not before.
            indent_before = False
            indent_after = True
            string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if indent_before or indent_after:
                # Strings are stripped before indentation is added, and
                # pieces that are (or become) empty get no whitespace.
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            # Everything after an opening tag is one level deeper.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)
class _TreeTraversalEvent:
    """Internal marker class: each instance stands for one kind of event
    encountered while traversing a parse tree.

    :meta private:
    """

# The event kinds yielded by _event_stream. They are plain sentinel
# objects, recognised by identity.
START_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
        the tree. Defaults to ``self.self_and_descendants``.
    :return: An iterator of ``(event, element)`` pairs, where ``event``
        is one of the ``Tag.*_EVENT`` sentinels.
    """
    # The tags that have been opened but not yet closed, outermost first.
    tag_stack: List[Tag] = []

    iterator = iterator or self.self_and_descendants

    for c in iterator:
        # If the parent of the element we're about to yield is not
        # the tag currently on the stack, it means that the tag on
        # the stack closed before this element appeared.
        #
        # This must be an identity test. ``!=`` would invoke the
        # structural, recursive Tag.__eq__, which is needlessly
        # expensive and would reintroduce the recursion that this
        # method exists to avoid.
        while tag_stack and c.parent is not tag_stack[-1]:
            now_closed_tag = tag_stack.pop()
            yield Tag.END_ELEMENT_EVENT, now_closed_tag

        if not isinstance(c, Tag):
            yield Tag.STRING_ELEMENT_EVENT, c
        elif c.is_empty_element:
            # Rendered as a single void tag, so it is never pushed and
            # never gets an END event.
            yield Tag.EMPTY_ELEMENT_EVENT, c
        else:
            yield Tag.START_ELEMENT_EVENT, c
            tag_stack.append(c)

    # The input is exhausted; close whatever is still open.
    while tag_stack:
        now_closed_tag = tag_stack.pop()
        yield Tag.END_ELEMENT_EVENT, now_closed_tag
def _indent_string(
    self,
    s: str,
    indent_level: int,
    formatter: Formatter,
    indent_before: bool,
    indent_after: bool,
) -> str:
    """Surround a string with indentation whitespace as requested.

    :param s: The string to amend with whitespace.
    :param indent_level: The indentation level; determines how many
        copies of ``formatter.indent`` go before the string.
    :param formatter: Supplies the ``indent`` unit.
    :param indent_before: Whether or not to add whitespace
        before the string.
    :param indent_after: Whether or not to add whitespace
        (a newline) after the string.
    """
    # Level 0 (or a false indent_before) produces no leading whitespace.
    leading = formatter.indent * indent_level if indent_before and indent_level else ""
    trailing = "\n" if indent_after else ""
    return leading + s + trailing
def _format_tag(
    self, eventual_encoding: str, formatter: Formatter, opening: bool
) -> str:
    """Render the opening or the closing markup of this tag.

    :param eventual_encoding: The encoding the document will finally
        be converted to; substituted into attribute values that are
        `AttributeValueWithCharsetSubstitution` objects.
    :param formatter: Supplies the attributes to render, the attribute
        value substitution and quoting, and the void-element prefix.
    :param opening: True for the opening tag, False for the closing tag.
    :return: The markup, or an empty string for a hidden tag.
    """
    if self.hidden:
        # A hidden tag is invisible, although its contents
        # are visible.
        return ""

    # The pieces are collected in output order: "<", then "/" for a
    # closing tag, then an optional namespace prefix, ...
    parts = ["<", "" if opening else "/"]
    parts.append(self.prefix + ":" if self.prefix else "")

    # ... then (opening tags only) the attributes ...
    rendered_attributes = []
    if opening:
        for key, val in formatter.attributes(self):
            if val is None:
                # A valueless attribute is rendered as its bare name.
                rendered_attributes.append(key)
                continue
            if isinstance(val, (list, tuple)):
                val = " ".join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.substitute_encoding(eventual_encoding)

            text = formatter.attribute_value(val)
            rendered_attributes.append(
                str(key) + "=" + formatter.quoted_attribute_value(text)
            )

    attribute_string = ""
    if rendered_attributes:
        attribute_string = " " + " ".join(rendered_attributes)

    # ... and an optional closing slash (for a void element in an
    # XML document).
    void_marker = ""
    if self.is_empty_element:
        void_marker = formatter.void_element_close_prefix or ""

    parts.extend([self.name, attribute_string, void_marker, ">"])
    return "".join(parts)
2838 def _should_pretty_print(self, indent_level: int = 1) -> bool:
2839 """Should this tag be pretty-printed?
2841 Most of them should, but some (such as <pre> in HTML
2842 documents) should not.
2843 """
2844 return indent_level is not None and (
2845 not self.preserve_whitespace_tags
2846 or self.name not in self.preserve_whitespace_tags
2847 )
@overload
def prettify(
    self,
    encoding: None = None,
    formatter: _FormatterOrName = "minimal",
) -> str:
    ...


@overload
def prettify(
    self,
    encoding: _Encoding,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    ...


def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Return a pretty-printed rendering of this `Tag`.

    :param encoding: If given, the rendering is produced by
        ``encode()`` in this encoding; if None, it is produced by
        ``decode()``.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string when ``encoding`` is None, a bytestring
        otherwise.
    """
    # Pretty-printing always starts at indentation level zero.
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)
def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render everything inside this tag as a Unicode string.

    This delegates to ``decode()``, handing it this tag's descendants
    as the elements to render.

    :param indent_level: Each line of the rendering will be indented
        this many levels. (The formatter decides what a 'level' means
        in terms of spaces or other characters output.) Used
        internally in recursive calls while pretty-printing.

    :param eventual_encoding: The encoding the result is destined
        for. decode_contents() does *not* perform that encoding; the
        value is needed so that a real encoding can be substituted in
        if the document contains an encoding declaration (e.g. in a
        <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    """
    inner_elements = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=inner_elements
    )
def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render everything inside this element as a bytestring.

    The contents are first rendered with ``decode_contents()`` (which
    receives ``encoding`` as the eventual encoding) and the resulting
    string is then encoded.

    :param indent_level: Each line of the rendering will be indented
        this many levels. (The ``formatter`` decides what a 'level'
        means, in terms of spaces or other characters output.) This
        is used internally in recursive calls while pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: Either a `Formatter` object, or a string naming
        one of the standard formatters.
    """
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)
@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated BS3-era spelling of ``encode_contents()``.

    ``indentLevel`` is only honoured when ``prettyPrint`` is true;
    otherwise the contents are rendered without pretty-printing.

    :meta private:
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_indent, encoding=encoding)
2946 # Soup methods
@overload
def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...


@overload
def find(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
) -> _AtMostOneNavigableString:
    ...


def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Return the first element below this one that matches the given
    criteria, or None if nothing matches.

    This is ``find_all()`` with a limit of one result. All find_*
    methods take a common set of arguments; see the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    """
    # Every branch calls find_all() directly from this frame, so the
    # same _stacklevel is passed in each.
    if string is None:
        # No string filter: this is a search for tags.
        tags = self.find_all(name, attrs, recursive, None, 1, _stacklevel=3, **kwargs)
        return cast(Tag, tags[0]) if tags else None

    if name is not None or attrs is not None or kwargs:
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        elements = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs)  # type:ignore
        return cast(Tag, elements[0]) if elements else None

    # Only a string filter was supplied: this is a search for strings.
    strings = self.find_all(None, None, recursive, string, 1, _stacklevel=3, **kwargs)
    return cast(NavigableString, strings[0]) if strings else None


findChild = _deprecated_function_alias("findChild", "find", "3.0.0")
@overload
def find_all(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...


@overload
def find_all(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...


def find_all(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Return every element below this one that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will search all of
        this element's descendants. Otherwise, only the direct
        children will be considered.
    :param string: A filter on strings. When it is the only filter
        given, the search is for strings rather than tags.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # One more frame sits between the caller and _find_all().
    _stacklevel += 1
    generator = self.descendants if recursive else self.children

    if string is None:
        # If string is None, we're searching for tags.
        return cast(
            ResultSet[Tag],
            self._find_all(
                name, attrs, None, limit, generator, _stacklevel=_stacklevel, **kwargs
            ),
        )

    if name is not None or attrs is not None or kwargs:
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        return cast(
            ResultSet[Tag],
            self._find_all(
                name, attrs, string, limit, generator, _stacklevel=_stacklevel, **kwargs
            ),
        )

    # Otherwise, we're searching for strings.
    return cast(
        ResultSet[NavigableString],
        self._find_all(
            None, None, string, limit, generator, _stacklevel=_stacklevel, **kwargs
        ),
    )


findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")
3089 # Generator methods
@property
def children(self) -> Iterator[PageElement]:
    """Iterate over the direct children of this `PageElement`, i.e.
    the items of ``self.contents``, in order.
    """
    # A generator expression reads self.contents as soon as the
    # property is accessed, not on the first next() call.
    return (child for child in self.contents)
@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` followed by everything beneath it.

    The descendants come from ``self.descendants`` and are combined
    with this tag by ``self._self_and()``.
    """
    below = self.descendants
    return self._self_and(below)
@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over every element beneath this `Tag`.

    Iteration starts at the first child and follows the
    ``next_element`` links until it reaches the element just past
    this tag's last descendant. The order is whatever those links
    define (presumably document order, i.e. depth-first -- confirm
    against ``PageElement.next_element``).
    """
    if not self.contents:
        return

    # _last_descendant() can't return None here because accept_self
    # is True. Worst case, it returns this tag itself.
    final = cast(PageElement, self._last_descendant(accept_self=True))
    stop_at = final.next_element

    node: _AtMostOneElement = self.contents[0]
    while node is not None and node is not stop_at:
        # Read the successor before yielding, so the walk continues
        # from the link as it was when this node was reached.
        following = node.next_element
        yield node
        node = following
3120 # CSS selector code
def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Run a CSS selector against this element via ``self.css`` and
    return that interface's ``select_one()`` result.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)
def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Run a CSS selector against this element via ``self.css`` and
    return that interface's ``select()`` result.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)
@property
def css(self) -> CSS:
    """The CSS selector API for this element.

    A new `CSS` object wrapping this element is built on every
    access.
    """
    selector_api = CSS(self)
    return selector_api
3168 # Old names for backwards compatibility
@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated method form of the ``children`` property.

    :meta private:
    """
    direct_children = self.children
    return direct_children
@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated method form of the ``descendants`` property.

    :meta private:
    """
    everything_below = self.descendants
    return everything_below
@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated spelling of ``has_attr()``.

    The old name was misleading: has_key() consulted the tag's
    attributes, whereas membership tests consult its contents.
    has_key() is gone from Python 3 dictionaries in any case.

    :meta private:
    """
    attribute_present = self.has_attr(key)
    return attribute_present
_PageElementT = TypeVar("_PageElementT", bound=PageElement)


class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects holding search results.

    A ResultSet is gathered as the result of matching an
    :py:class:`ElementFilter` against a parse tree. It behaves as an
    ordinary list, and additionally remembers the filter that
    produced it.
    """

    # The filter whose matches this list holds, if any.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Build the list from ``result`` and record ``source``.

        :param source: The filter that produced these results, or None.
        :param result: The matching elements, in order.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        This is only reached for attributes that normal lookup fails
        to find, and it always raises.

        :raises AttributeError: Always; the message points out the
            likely find_all()/find() mix-up.
        """
        message = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(message)
3219# Now that all the classes used by SoupStrainer have been defined,
3220# import SoupStrainer itself into this module to preserve the
3221# backwards compatibility of anyone who imports
3222# bs4.element.SoupStrainer.
3223from bs4.filter import SoupStrainer # noqa: E402