Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 32%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from __future__ import annotations
3# Use of this source code is governed by the MIT license.
4__license__ = "MIT"
6import re
7import warnings
9from bs4.css import CSS
10from bs4._deprecation import (
11 _deprecated,
12 _deprecated_alias,
13 _deprecated_function_alias,
14)
15from bs4.formatter import (
16 Formatter,
17 HTMLFormatter,
18 XMLFormatter,
19)
20from bs4._warnings import AttributeResemblesVariableWarning
22from typing import (
23 Any,
24 Callable,
25 Dict,
26 Generic,
27 Iterable,
28 Iterator,
29 List,
30 Mapping,
31 MutableSequence,
32 Optional,
33 Pattern,
34 Set,
35 TYPE_CHECKING,
36 Tuple,
37 Type,
38 TypeVar,
39 Union,
40 cast,
41 overload,
42)
43from typing_extensions import (
44 Self,
45 TypeAlias,
46)
48if TYPE_CHECKING:
49 from bs4 import BeautifulSoup
50 from bs4.builder import TreeBuilder
51 from bs4.filter import ElementFilter
52 from bs4.formatter import (
53 _EntitySubstitutionFunction,
54 _FormatterOrName,
55 )
56 from bs4._typing import (
57 _AtMostOneElement,
58 _AtMostOneTag,
59 _AtMostOneNavigableString,
60 _AttributeValue,
61 _AttributeValues,
62 _Encoding,
63 _InsertableElement,
64 _OneElement,
65 _QueryResults,
66 _RawOrProcessedAttributeValues,
67 _StrainableElement,
68 _StrainableAttribute,
69 _StrainableAttributes,
70 _StrainableString,
71 _SomeNavigableStrings,
72 _SomeTags,
73 )
75_OneOrMoreStringTypes: TypeAlias = Union[
76 Type["NavigableString"], Iterable[Type["NavigableString"]]
77]
79_FindMethodName: TypeAlias = Optional[Union["_StrainableElement", "ElementFilter"]]
81# Deprecated module-level attributes.
82# See https://peps.python.org/pep-0562/
83_deprecated_names = dict(
84 whitespace_re="The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy."
85)
86#: :meta private:
87_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+")
def __getattr__(name: str) -> Any:
    """Module-level attribute hook (PEP 562) for deprecated names.

    :param name: The attribute that was not found on the module by
        the normal lookup.
    :return: The global ``_deprecated_<name>`` object, after a
        `DeprecationWarning` has been issued.
    :raises AttributeError: If ``name`` is not listed in
        ``_deprecated_names``.
    """
    if name not in _deprecated_names:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    template = _deprecated_names[name]
    # stacklevel=2 attributes the warning to the code that touched the
    # deprecated attribute rather than to this hook.
    warnings.warn(template.format(name=name), DeprecationWarning, stacklevel=2)
    return globals()[f"_deprecated_{name}"]
#: Documents output by Beautiful Soup will be encoded with
#: this encoding unless you specify otherwise.
DEFAULT_OUTPUT_ENCODING: str = "utf-8"

#: A regular expression matching one run of non-whitespace characters.
#: Scanning a string with it yields its whitespace-separated tokens.
nonwhitespace_re: Pattern[str] = re.compile(r"\S+")

#: These encodings are recognized by Python (so `Tag.encode`
#: could theoretically support them) but XML and HTML don't recognize
#: them (so they should not show up in an XML or HTML document as that
#: document's encoding).
#:
#: If an XML document is encoded in one of these encodings, no encoding
#: will be mentioned in the XML declaration. If an HTML document is
#: encoded in one of these encodings, and the HTML document has a
#: <meta> tag that mentions an encoding, the encoding will be given as
#: the empty string.
#:
#: Source:
#: Python documentation, `Python Specific Encodings <https://docs.python.org/3/library/codecs.html#python-specific-encodings>`_
PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
class NamespacedAttribute(str):
    """A namespaced attribute (e.g. the 'xml:lang' in 'xml:lang="en"')
    which remembers the namespace prefix ('xml') and the name ('lang')
    that were used to create it.

    The string value is ``prefix:name`` when both parts are present,
    the single part that is present when only one is, and the empty
    string when neither is.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        """Build the attribute name and record its components.

        :param prefix: The namespace prefix, if any.
        :param name: The local name. An empty name is stored as None.
        :param namespace: The namespace, stored unchanged on the
            resulting object.
        """
        if not name:
            # This is the default namespace. Its name "has no value"
            # per https://www.w3.org/TR/xml-names/#defaulting
            name = None
            # Fall back to "" so that a missing prefix does not become
            # the literal text "None" (which is what str(None) gives).
            text = prefix or ""
        elif not prefix:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name

        obj = str.__new__(cls, text)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj
class AttributeValueWithCharsetSubstitution(str):
    """Abstract base for a string that stands in for a character
    encoding named inside an HTML ``<meta>`` tag.

    There is one subclass per place such an encoding can appear: the
    ``charset`` attribute (`CharsetMetaAttributeValue`) and the
    ``content`` attribute (`ContentMetaAttributeValue`).

    This lets Beautiful Soup swap in a different encoding name for
    that part of the HTML file when outputting a tree as a string.
    """

    # The original, un-encoded value of the attribute.
    #: :meta private:
    original_value: str

    def substitute_encoding(self, eventual_encoding: str) -> str:
        """Return this attribute value rewritten to mention
        ``eventual_encoding``.

        Subclasses must override this; the base implementation
        always raises `NotImplementedError`.
        """
        raise NotImplementedError()
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``charset``
    attribute.

    When Beautiful Soup parses the markup ``<meta charset="utf8">``, the
    value of the ``charset`` attribute becomes one of these objects, so
    that the ``<meta>`` tag can name whatever encoding the document is
    eventually written out in.
    """

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from."""
        instance = str.__new__(cls, original_value)
        # Nothing here reads original_value again; it is stored in
        # case it is useful to the user.
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the text the ``charset`` attribute should carry when
        the document is encoded as ``eventual_encoding``.

        :return: The encoding name itself, or the empty string when the
            encoding is one of `PYTHON_SPECIFIC_ENCODINGS`.
        """
        return "" if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS else eventual_encoding
class AttributeValueList(List[str]):
    """The list type that holds the values of a multi-valued attribute
    (such as HTML's 'class').

    It adds nothing to a regular list. It exists so that you can
    subclass it and pass your subclass to the TreeBuilder constructor
    as attribute_value_list_class, to have your subclass instantiated
    instead.
    """
class AttributeDict(Dict[Any, Any]):
    """Base class for the dictionary that holds a tag's attributes.

    It can be used directly, but it behaves exactly like a regular
    dict: it defines no special logic of its own.
    """
class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the XML spec.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Store an attribute value, converting common non-string
        values to strings first.

        XML attributes may have "any literal string as a value", so:

        * ``None`` is stored as the empty string.
        * ``int`` and ``float`` values are stored as ``str(value)``.
        * ``bool`` values are stored unchanged. XML does not define
          any rules for boolean attributes, so the old Beautiful Soup
          behavior (a bool that gets converted to a string on output)
          is preserved rather than guessing what the value should be.
        * Everything else is stored unchanged.
        """
        if value is None:
            value = ""
        elif isinstance(value, (int, float)) and not isinstance(value, bool):
            # Only plain numbers are stringified. Converting _every_
            # value would be dangerous, since an attribute value may be
            # a more sophisticated string-like object
            # (e.g. CharsetMetaAttributeValue). bool is excluded
            # explicitly because it is a subclass of int.
            value = str(value)

        super().__setitem__(key, value)
class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply
        with the HTML spec.

        * ``None`` and ``False`` remove the attribute (if present) and
          store nothing.
        * ``True`` stores the attribute's own name as its value.
        * ``int`` and ``float`` values, including ``0`` and ``0.0``,
          are stored as ``str(value)``.
        * Everything else is stored unchanged.
        """
        # Identity checks on purpose. A membership test such as
        # ``value in (False, None)`` compares with ``==``, and
        # ``0 == False`` is true, so a numeric zero would be treated as
        # "remove this attribute" instead of being stored as "0".
        if value is None or value is False:
            # 'The values "true" and "false" are not allowed on
            # boolean attributes. To represent a false value, the
            # attribute has to be omitted altogether.'
            if key in self:
                del self[key]
            return
        if isinstance(value, bool):
            # Only True can reach this branch.
            #
            # 'If the [boolean] attribute is present, its value must
            # either be the empty string or a value that is an ASCII
            # case-insensitive match for the attribute's canonical
            # name, with no leading or trailing whitespace.'
            #
            # [fixme] It's not clear to me whether "canonical name"
            # means fully-qualified name, unqualified name, or
            # (probably not) name with namespace prefix. For now I'm
            # going with unqualified name.
            if isinstance(key, NamespacedAttribute):
                value = key.name
            else:
                value = key
        elif isinstance(value, (int, float)):
            # See note in XMLAttributeDict for the reasoning why we
            # only do this to numbers.
            value = str(value)
        super().__setitem__(key, value)
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``content``
    attribute.

    When Beautiful Soup parses the markup:
     ``<meta http-equiv="content-type" content="text/html; charset=utf8">``

    the value of the ``content`` attribute becomes one of these objects,
    so that the ``charset=`` part can name whatever encoding the
    document is eventually written out in.
    """

    #: Match the 'charset' argument inside the 'content' attribute
    #: of a <meta> tag. Group 1 is everything up to and including
    #: ``charset=``; group 3 is the encoding name that follows.
    #: :meta private:
    CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from."""
        # NOTE(review): the match result is discarded, so this call has
        # no effect on the object that is built. It is kept because it
        # still raises for input the regex cannot search -- confirm
        # whether anything relies on that before removing it.
        cls.CHARSET_RE.search(original_value)
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the ``content`` text to use when the document is
        encoded as ``eventual_encoding``.

        :return: ``original_value`` with the encoding name after
            ``charset=`` replaced by ``eventual_encoding``. When the
            encoding is one of `PYTHON_SPECIFIC_ENCODINGS`, the whole
            ``charset=...`` portion is removed instead.
        """
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            return self.CHARSET_RE.sub("", self.original_value)

        # Keep the "charset=" prefix (group 1) and swap in the new name.
        return self.CHARSET_RE.sub(
            lambda match: match.group(1) + eventual_encoding,
            self.original_value,
        )
352class PageElement(object):
353 """An abstract class representing a single element in the parse tree.
355 `NavigableString`, `Tag`, etc. are all subclasses of
356 `PageElement`. For this reason you'll see a lot of methods that
357 return `PageElement`, but you'll never see an actual `PageElement`
358 object. For the most part you can think of `PageElement` as
359 meaning "a `Tag` or a `NavigableString`."
360 """
362 #: In general, we can't tell just by looking at an element whether
363 #: it's contained in an XML document or an HTML document. But for
364 #: `Tag` objects (q.v.) we can store this information at parse time.
365 #: :meta private:
366 known_xml: Optional[bool] = None
368 #: Whether or not this element has been decomposed from the tree
369 #: it was created in.
370 _decomposed: bool
372 parent: Optional[Tag]
373 next_element: _AtMostOneElement
374 previous_element: _AtMostOneElement
375 next_sibling: _AtMostOneElement
376 previous_sibling: _AtMostOneElement
378 #: Whether or not this element is hidden from generated output.
379 #: Only the `BeautifulSoup` object itself is hidden.
380 hidden: bool = False
382 def setup(
383 self,
384 parent: Optional[Tag] = None,
385 previous_element: _AtMostOneElement = None,
386 next_element: _AtMostOneElement = None,
387 previous_sibling: _AtMostOneElement = None,
388 next_sibling: _AtMostOneElement = None,
389 ) -> None:
390 """Sets up the initial relations between this element and
391 other elements.
393 :param parent: The parent of this element.
395 :param previous_element: The element parsed immediately before
396 this one.
398 :param next_element: The element parsed immediately after
399 this one.
401 :param previous_sibling: The most recently encountered element
402 on the same level of the parse tree as this one.
404 :param previous_sibling: The next element to be encountered
405 on the same level of the parse tree as this one.
406 """
407 self.parent = parent
409 self.previous_element = previous_element
410 if self.previous_element is not None:
411 self.previous_element.next_element = self
413 self.next_element = next_element
414 if self.next_element is not None:
415 self.next_element.previous_element = self
417 self.next_sibling = next_sibling
418 if self.next_sibling is not None:
419 self.next_sibling.previous_sibling = self
421 if (
422 previous_sibling is None
423 and self.parent is not None
424 and self.parent.contents
425 ):
426 previous_sibling = self.parent.contents[-1]
428 self.previous_sibling = previous_sibling
429 if self.previous_sibling is not None:
430 self.previous_sibling.next_sibling = self
def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str:
    """Format the given string using the given formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters. If None, ``s`` is returned unchanged.
    :return: The result of the formatter's ``substitute`` method.
    """
    if formatter is None:
        return s

    if isinstance(formatter, Formatter):
        resolved = formatter
    else:
        resolved = self.formatter_for_name(formatter)
    return resolved.substitute(s)
def formatter_for_name(
    self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction]
) -> Formatter:
    """Look up or create a Formatter for the given identifier,
    if necessary.

    :param formatter_name: Can be a `Formatter` object (used as-is), a
        function (used as the entity substitution hook for an
        `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter`), or a string (used to look
        up an `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter` in the appropriate registry).
        The XML variants are chosen when this element's ``_is_xml``
        is true, the HTML variants otherwise.
    """
    if isinstance(formatter_name, Formatter):
        return formatter_name

    formatter_class: type[Formatter] = XMLFormatter if self._is_xml else HTMLFormatter
    if callable(formatter_name):
        return formatter_class(entity_substitution=formatter_name)

    registry: Mapping[Optional[str], Formatter] = formatter_class.REGISTRY
    return registry[formatter_name]
@property
def _is_xml(self) -> bool:
    """Is this element part of an XML tree or an HTML tree?

    This is used in formatter_for_name, when deciding whether an
    XMLFormatter or HTMLFormatter is more appropriate. It can be
    inefficient, but it should be called very rarely.
    """
    known = self.known_xml
    if known is not None:
        # Most of the time this was determined when the document
        # was parsed.
        return known

    # Otherwise, it's likely that this element was created by
    # direct invocation of the constructor from within the user's
    # Python code. Defer to the parent when there is one.
    parent = self.parent
    if parent is not None:
        return parent._is_xml

    # This is the top-level object. It should have .known_xml set
    # from tree creation. If not, take a guess--BS is usually
    # used on HTML markup.
    return getattr(self, "is_xml", False)
# camelCase spellings of next_sibling / previous_sibling.
# NOTE(review): `_deprecated_alias` is imported from bs4._deprecation and is
# not shown here; presumably its arguments are (old name, current name,
# version in which the old name was deprecated) -- confirm.
nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0")
previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0")
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
    """Deep-copy hook; not implemented at this level.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError()
def __copy__(self) -> Self:
    """Return a copy of this element by delegating to ``__deepcopy__``.

    A copy of a PageElement can only be a deep copy, because
    only one PageElement can occupy a given place in a parse tree.
    """
    memo: Dict[Any, Any] = {}
    return self.__deepcopy__(memo)
# Shared default for the ``types`` parameter of the string-gathering
# methods: an empty tuple.
default: Iterable[type[NavigableString]] = tuple()  #: :meta private:

def _all_strings(
    self, strip: bool = False, types: Iterable[type[NavigableString]] = default
) -> Iterator[str]:
    """Yield all strings of certain classes, possibly stripping them.

    This is implemented differently in `Tag` and `NavigableString`;
    this base version only raises.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError()
@property
def stripped_strings(self) -> Iterator[str]:
    """Yield all interesting strings in this PageElement, stripping them
    first.

    This is ``_all_strings`` called with stripping turned on.

    See `Tag` for information on which strings are considered
    interesting in a given context.
    """
    yield from self._all_strings(True)
def get_text(
    self,
    separator: str = "",
    strip: bool = False,
    types: Iterable[Type[NavigableString]] = default,
) -> str:
    """Get all child strings of this PageElement, concatenated using the
    given separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    pieces = list(self._all_strings(strip, types=types))
    return separator.join(pieces)

# Alternate spellings: a camelCase method name and a read-only property.
getText = get_text
text = property(get_text)
def replace_with(self, *args: _InsertableElement) -> Self:
    """Replace this `PageElement` with one or more other elements,
    objects, keeping the rest of the tree the same.

    :param args: The replacements, inserted in order at the position
        this element occupied in its parent.

    :return: This `PageElement`, no longer part of the tree.

    :raises ValueError: If this element has no parent, or if one of
        the replacements is this element's parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree."
        )
    if len(args) == 1 and args[0] is self:
        # Replacing an element with itself is a no-op.
        return self
    for candidate in args:
        if candidate is parent:
            raise ValueError("Cannot replace a Tag with its parent.")

    # Look the position up once and hand it to extract() so that it
    # does not have to search the parent again.
    position = parent.index(self)
    self.extract(_self_index=position)
    for replacement in args:
        parent.insert(position, replacement)
        position += 1
    return self
# camelCase spelling of replace_with.
# NOTE(review): `_deprecated_function_alias` is imported from
# bs4._deprecation and is not shown here; presumably its arguments are
# (old name, current name, version in which the old name was deprecated)
# -- confirm.
replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0")
def wrap(self, wrap_inside: Tag) -> Tag:
    """Wrap this `PageElement` inside a `Tag`.

    :param wrap_inside: The tag that takes this element's place and
        then receives this element as its last-appended child.

    :return: ``wrap_inside``, occupying the position in the tree that used
        to be occupied by this object, and with this object now inside it.
    """
    # replace_with() hands back the element that was replaced, which
    # is then appended inside the wrapper.
    wrap_inside.append(self.replace_with(wrap_inside))
    return wrap_inside
def extract(self, _self_index: Optional[int] = None) -> Self:
    """Destructively rips this element out of the tree.

    The element is removed from its parent's contents, the elements
    on either side of it (and of its descendants) are linked to each
    other, and all of this element's own outward links are cleared.

    :param _self_index: The location of this element in its parent's
        .contents, if known. Passing this in allows for a performance
        optimization.

    :return: this `PageElement`, no longer part of the tree.
    """
    container = self.parent
    if container is not None:
        if _self_index is None:
            _self_index = container.index(self)
        del container.contents[_self_index]

    # Find the two elements that would be next to each other if
    # this element (and any children) hadn't been parsed. Connect
    # the two.
    #
    # _last_descendant() is called with its defaults, which accept
    # self as the answer, so the result is not None; worst case it is
    # self. The cast only exists to tell the type checker that.
    tail = cast("PageElement", self._last_descendant())
    before = self.previous_element
    after = tail.next_element

    if before is not None and before is not after:
        before.next_element = after
    if after is not None and after is not before:
        after.previous_element = before
    self.previous_element = None
    tail.next_element = None

    self.parent = None

    # Close the gap in the sibling chain the same way.
    left = self.previous_sibling
    right = self.next_sibling
    if left is not None and left is not right:
        left.next_sibling = right
    if right is not None and right is not left:
        right.previous_sibling = left
    self.previous_sibling = self.next_sibling = None
    return self
def decompose(self) -> None:
    """Recursively destroys this `PageElement` and its children.

    The element will be removed from the tree and wiped out; so
    will everything beneath it. Each wiped element has its instance
    dictionary cleared and its ``_decomposed`` flag set to True.

    The behavior of a decomposed `PageElement` is undefined and you
    should never use one for anything, but if you need to *check*
    whether an element has been decomposed, you can use the
    `PageElement.decomposed` property.
    """
    self.extract()
    current: _AtMostOneElement = self
    while current is not None:
        # Grab the successor first: clearing __dict__ erases the link.
        following = current.next_element
        current.__dict__.clear()
        if isinstance(current, Tag):
            # Give a wiped Tag an empty name and no contents.
            current.name = ""
            current.contents = []
        current._decomposed = True
        current = following
def _last_descendant(
    self, is_initialized: bool = True, accept_self: bool = True
) -> _AtMostOneElement:
    """Finds the last element beneath this object to be parsed.

    Special note to help you figure things out if your type
    checking is tripped up by the fact that this method returns
    _AtMostOneElement instead of PageElement: the only time
    this method returns None is if `accept_self` is False and the
    `PageElement` has no children--either it's a NavigableString
    or an empty Tag.

    :param is_initialized: Has `PageElement.setup` been called on
        this `PageElement` yet? When False, ``next_sibling`` is not
        read at all.

    :param accept_self: Is ``self`` an acceptable answer to the
        question?
    """
    sibling = self.next_sibling if is_initialized else None
    deepest: _AtMostOneElement
    if sibling is not None:
        # Whatever was parsed just before the next sibling is the
        # last thing parsed beneath this element.
        deepest = sibling.previous_element
    else:
        # No sibling to consult: walk down through the last child of
        # each Tag until reaching something with no contents.
        deepest = self
        while isinstance(deepest, Tag) and deepest.contents:
            deepest = deepest.contents[-1]

    if deepest is self and not accept_self:
        return None
    return deepest

_lastRecursiveChild = _deprecated_alias(
    "_lastRecursiveChild", "_last_descendant", "4.0.0"
)
def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate predecessor of this one.

    All the elements will have the same `PageElement.parent` as
    this one, and the given elements will occur immediately before
    this one, in the order given.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.

    :raises ValueError: If this element has no parent, or if it is
        itself one of the arguments.
    """
    # Do all error checking before modifying the tree.
    container = self.parent
    if container is None:
        raise ValueError("Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")

    inserted: List[PageElement] = []
    for predecessor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        # Recomputed every time: each insertion pushes this element
        # further along in the parent's contents.
        position = container.index(self)
        inserted.extend(container.insert(position, predecessor))

    return inserted
def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate successor of this one.

    The elements will have the same `PageElement.parent` as this
    one, and the given elements will occur immediately after this
    one, in the order given.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.

    :raises ValueError: If this element has no parent, or if it is
        itself one of the arguments.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError("Element has no parent, so 'after' has no meaning.")
    if any(x is self for x in args):
        raise ValueError("Can't insert an element after itself.")

    results: List[PageElement] = []
    for successor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        index = parent.index(self)
        # Skip past everything placed after this element so far.
        # The count of inserted elements is used rather than the
        # count of arguments processed, because parent.insert()
        # returns a list and so may report more than one element for
        # a single argument; counting arguments would then put the
        # next one in the middle of the previous insertion.
        # NOTE(review): parent.insert() is not shown here -- confirm
        # which inputs make it insert several elements.
        already_inserted = len(results)
        results.extend(parent.insert(index + 1 + already_inserted, successor))

    return results
# The reason for suppressing this pyright warning is discussed here:
# https://github.com/microsoft/pyright/issues/10929
@overload
def find_next(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag: ...

@overload
def find_next(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString: ...

def find_next(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Find the first PageElement that matches the given criteria and
    appears later in the document than this PageElement.

    This is the single-result form of `find_all_next`: the work is
    handed to ``_find_one`` with `find_all_next` as the search method.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: Additional filters on attribute values.
    """
    search = self.find_all_next
    return self._find_one(search, name, attrs, string, **kwargs)
# camelCase spelling of find_next.
# NOTE(review): `_deprecated_function_alias` is imported from
# bs4._deprecation and is not shown here; presumably its arguments are
# (old name, current name, version in which the old name was deprecated)
# -- confirm.
findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0")
@overload
def find_all_next(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags: ...

@overload
def find_all_next(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings: ...

def find_all_next(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Find all `PageElement` objects that match the given criteria and
    appear later in the document than this `PageElement`.

    The search is run by ``_find_all`` over this element's
    ``next_elements``.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
        It is passed on incremented by one to account for this call.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.next_elements
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
# camelCase spelling of find_all_next.
# NOTE(review): `_deprecated_function_alias` is imported from
# bs4._deprecation and is not shown here; presumably its arguments are
# (old name, current name, version in which the old name was deprecated)
# -- confirm.
findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0")
@overload
def find_next_sibling(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag: ...

@overload
def find_next_sibling(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString: ...

def find_next_sibling(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Find the closest sibling to this PageElement that matches the
    given criteria and appears later in the document.

    This is the single-result form of `find_next_siblings`: the work
    is handed to ``_find_one`` with `find_next_siblings` as the search
    method.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    search = self.find_next_siblings
    return self._find_one(search, name, attrs, string, **kwargs)
# camelCase spelling of find_next_sibling.
# NOTE(review): `_deprecated_function_alias` is imported from
# bs4._deprecation and is not shown here; presumably its arguments are
# (old name, current name, version in which the old name was deprecated)
# -- confirm.
findNextSibling = _deprecated_function_alias(
    "findNextSibling", "find_next_sibling", "4.0.0"
)
@overload
def find_next_siblings(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags: ...

@overload
def find_next_siblings(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings: ...

def find_next_siblings(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Find all siblings of this `PageElement` that match the given criteria
    and appear later in the document.

    The search is run by ``_find_all`` over this element's
    ``next_siblings``.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
        It is passed on incremented by one to account for this call.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.next_siblings
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )
# Older spellings of find_next_siblings: a camelCase name and a
# "fetch"-prefixed name.
# NOTE(review): `_deprecated_function_alias` is imported from
# bs4._deprecation and is not shown here; presumably its arguments are
# (old name, current name, version in which the old name was deprecated)
# -- confirm.
findNextSiblings = _deprecated_function_alias(
    "findNextSiblings", "find_next_siblings", "4.0.0"
)
fetchNextSiblings = _deprecated_function_alias(
    "fetchNextSiblings", "find_next_siblings", "3.0.0"
)
@overload
def find_previous(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag: ...

@overload
def find_previous(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString: ...

def find_previous(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Look backwards in the document from this `PageElement` and find the
    first `PageElement` that matches the given criteria.

    This is the single-result form of `find_all_previous`: the work
    is handed to ``_find_one`` with `find_all_previous` as the search
    method.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    search = self.find_all_previous
    return self._find_one(search, name, attrs, string, **kwargs)
# Legacy camelCase spelling of find_previous, built by
# _deprecated_function_alias from (old name, current name, version).
findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0")
@overload
def find_all_previous(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def find_all_previous(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def find_all_previous(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Collect every matching `PageElement` found when walking backwards
    through the document from this one.

    The candidates are the elements produced by `previous_elements`; the
    actual matching is delegated to `_find_all`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    earlier_elements = self.previous_elements
    # _find_all receives the stack level incremented by one, since this
    # method adds a call frame of its own.
    inner_stacklevel = _stacklevel + 1
    return self._find_all(
        name,
        attrs,
        string,
        limit,
        earlier_elements,
        _stacklevel=inner_stacklevel,
        **kwargs,
    )
# Legacy spellings of find_all_previous, built by
# _deprecated_function_alias from (old name, current name, version).
# NOTE(review): the version string is presumably the release in which
# the spelling was deprecated -- confirm against bs4._deprecation.
findAllPrevious = _deprecated_function_alias(
    "findAllPrevious", "find_all_previous", "4.0.0"
)
fetchAllPrevious = _deprecated_function_alias(
    "fetchAllPrevious", "find_all_previous", "3.0.0"
)
@overload
def find_previous_sibling(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

@overload
def find_previous_sibling(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
    ...

def find_previous_sibling(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Return the nearest earlier sibling of this `PageElement` that
    matches the given criteria, or None.

    This is the single-result counterpart of `find_previous_siblings`:
    the bound `find_previous_siblings` method is handed to `_find_one`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    multi_result_finder = self.find_previous_siblings
    return self._find_one(multi_result_finder, name, attrs, string, **kwargs)
# Legacy camelCase spelling of find_previous_sibling, built by
# _deprecated_function_alias from (old name, current name, version).
findPreviousSibling = _deprecated_function_alias(
    "findPreviousSibling", "find_previous_sibling", "4.0.0"
)
@overload
def find_previous_siblings(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def find_previous_siblings(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def find_previous_siblings(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Collect every earlier sibling of this `PageElement` that matches
    the given criteria.

    The candidates are the elements produced by `previous_siblings`; the
    actual matching is delegated to `_find_all`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    earlier_siblings = self.previous_siblings
    # _find_all receives the stack level incremented by one, since this
    # method adds a call frame of its own.
    inner_stacklevel = _stacklevel + 1
    return self._find_all(
        name,
        attrs,
        string,
        limit,
        earlier_siblings,
        _stacklevel=inner_stacklevel,
        **kwargs,
    )
# Legacy spellings of find_previous_siblings, built by
# _deprecated_function_alias from (old name, current name, version).
# NOTE(review): the version string is presumably the release in which
# the spelling was deprecated -- confirm against bs4._deprecation.
findPreviousSiblings = _deprecated_function_alias(
    "findPreviousSiblings", "find_previous_siblings", "4.0.0"
)
fetchPreviousSiblings = _deprecated_function_alias(
    "fetchPreviousSiblings", "find_previous_siblings", "3.0.0"
)
def find_parent(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    """Return the nearest ancestor of this PageElement that matches the
    given criteria, or None when no ancestor matches.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.
    """
    # _find_one is not usable here: it forwards a `string` argument,
    # and find_parents does not take one.
    matches = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None
# Legacy camelCase spelling of find_parent, built by
# _deprecated_function_alias from (old name, current name, version).
findParent = _deprecated_function_alias("findParent", "find_parent", "4.0.0")
def find_parents(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    """Collect every ancestor of this `PageElement` that matches the
    given criteria.

    The candidates are the elements produced by `parents`. No string
    filter is available: None is always passed to `_find_all` in that
    position.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # Only Tags can have children, so every ancestor is a Tag and the
    # result can be narrowed for type checkers.
    return cast(
        ResultSet[Tag],
        self._find_all(
            name,
            attrs,
            None,
            limit,
            self.parents,
            _stacklevel=_stacklevel + 1,
            **kwargs,
        ),
    )
# Legacy spellings of find_parents, built by
# _deprecated_function_alias from (old name, current name, version).
findParents = _deprecated_function_alias("findParents", "find_parents", "4.0.0")
fetchParents = _deprecated_function_alias("fetchParents", "find_parents", "3.0.0")
@property
def next(self) -> _AtMostOneElement:
    """Shorthand for `next_element`: the `PageElement`, if any, that was
    parsed just after this one."""
    following = self.next_element
    return following
@property
def previous(self) -> _AtMostOneElement:
    """Shorthand for `previous_element`: the `PageElement`, if any, that
    was parsed just before this one."""
    preceding = self.previous_element
    return preceding
1246 # These methods do the real heavy lifting.
def _find_one(
    self,
    # TODO-TYPING: "There is no syntax to indicate optional or
    # keyword arguments; such function types are rarely used
    # as callback types." - So, not sure how to get more
    # specific here.
    method: Callable,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes],
    string: Optional[_StrainableString],
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Run a multi-result find method with a limit of one and unwrap it.

    :param method: The find_all-style callable to invoke. It is called as
        ``method(name, attrs, string, 1, _stacklevel=4, **kwargs)``.
    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    :return: The first element of the result, or None when the result
        is empty.
    """
    matches: _QueryResults = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
    if not matches:
        return None
    return matches[0]
def _find_all(
    self,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes],
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Consume ``generator`` and return the elements that match.

    :param name: A filter on tag name, or an `ElementFilter` that is
        used as the matcher directly.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The candidate elements to examine.
    :param _stacklevel: The ``stacklevel`` given to `warnings.warn` for
        the two warnings this method can emit.
    :kwargs: Additional filters on attribute values. The legacy ``text``
        keyword is accepted as a deprecated stand-in for ``string``.
    """
    # Legacy spelling: 'text' is honoured only when 'string' was not given.
    if string is None and "text" in kwargs:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    # '_class' is flagged with a warning; it still stays in kwargs.
    if "_class" in kwargs:
        warnings.warn(
            AttributeResemblesVariableWarning.MESSAGE
            % dict(
                original="_class",
                autocorrect="class_",
            ),
            AttributeResemblesVariableWarning,
            stacklevel=_stacklevel,
        )

    from bs4.filter import ElementFilter

    matcher = (
        name
        if isinstance(name, ElementFilter)
        else SoupStrainer(name, attrs, string, **kwargs)
    )

    # The fast paths below apply only when the tag name is the sole criterion.
    name_is_only_criterion = (
        string is None and not limit and not attrs and not kwargs
    )

    if name_is_only_criterion and (name is True or name is None):
        # Optimization to find all tags.
        every_tag = [
            candidate for candidate in generator if isinstance(candidate, Tag)
        ]
        return ResultSet(matcher, every_tag)

    if name_is_only_criterion and isinstance(name, str):
        # Optimization to find all tags with a given name.
        if name.count(":") == 1:
            # This is a name with a prefix. If this is a namespace-aware
            # document, we need to match the local name against tag.name.
            # If not, we need to match the fully-qualified name against
            # tag.name.
            prefix, local_name = name.split(":", 1)
        else:
            prefix, local_name = None, name
        same_name = [
            candidate
            for candidate in generator
            if isinstance(candidate, Tag)
            and (
                candidate.name == name
                or (
                    candidate.name == local_name
                    and (prefix is None or candidate.prefix == prefix)
                )
            )
        ]
        return ResultSet(matcher, same_name)

    # General case: let the matcher walk the candidates itself.
    return matcher.find_all(generator, limit)
1332 # These generators can be used to navigate starting from both
1333 # NavigableStrings and Tags.
@property
def next_elements(self) -> Iterator[PageElement]:
    """Yield, in order, every PageElement that was parsed after this
    one, following the `next_element` links until one is None."""
    current = self.next_element
    while current is not None:
        # The following link is read before yielding, so the walk
        # continues from the link as it was before the consumer ran.
        upcoming = current.next_element
        yield current
        current = upcoming
@property
def self_and_next_elements(self) -> Iterator[PageElement]:
    """This PageElement first, then everything from `next_elements`
    (combined by `_self_and`)."""
    following = self.next_elements
    return self._self_and(following)
@property
def next_siblings(self) -> Iterator[PageElement]:
    """Yield, in order, every sibling of this PageElement that was
    parsed later, following the `next_sibling` links until one is None.
    """
    current = self.next_sibling
    while current is not None:
        # The following link is read before yielding, so the walk
        # continues from the link as it was before the consumer ran.
        upcoming = current.next_sibling
        yield current
        current = upcoming
@property
def self_and_next_siblings(self) -> Iterator[PageElement]:
    """This PageElement first, then everything from `next_siblings`
    (combined by `_self_and`)."""
    following = self.next_siblings
    return self._self_and(following)
@property
def previous_elements(self) -> Iterator[PageElement]:
    """Yield every PageElement that was parsed before this one, nearest
    first, following the `previous_element` links until one is None.

    :yield: A sequence of PageElements.
    """
    current = self.previous_element
    while current is not None:
        # The preceding link is read before yielding, so the walk
        # continues from the link as it was before the consumer ran.
        earlier = current.previous_element
        yield current
        current = earlier
@property
def self_and_previous_elements(self) -> Iterator[PageElement]:
    """This PageElement first, then everything from `previous_elements`
    (combined by `_self_and`)."""
    preceding = self.previous_elements
    return self._self_and(preceding)
@property
def previous_siblings(self) -> Iterator[PageElement]:
    """Yield every sibling of this PageElement that was parsed earlier,
    nearest first, following the `previous_sibling` links until one is
    None.

    :yield: A sequence of PageElements.
    """
    current = self.previous_sibling
    while current is not None:
        # The preceding link is read before yielding, so the walk
        # continues from the link as it was before the consumer ran.
        earlier = current.previous_sibling
        yield current
        current = earlier
@property
def self_and_previous_siblings(self) -> Iterator[PageElement]:
    """This PageElement first, then everything from `previous_siblings`
    (combined by `_self_and`)."""
    preceding = self.previous_siblings
    return self._self_and(preceding)
@property
def parents(self) -> Iterator[Tag]:
    """Yield each ancestor of this PageElement, nearest first, following
    the `parent` links until one is None.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    ancestor = self.parent
    while ancestor is not None:
        # The next ancestor is read before yielding, so the walk
        # continues from the link as it was before the consumer ran.
        grand_ancestor = ancestor.parent
        yield ancestor
        ancestor = grand_ancestor
@property
def self_and_parents(self) -> Iterator[PageElement]:
    """This element first, then everything from `parents` (combined by
    `_self_and`).

    :yield: A sequence of PageElements, ending with a BeautifulSoup object.
    """
    ancestors = self.parents
    return self._self_and(ancestors)
def _self_and(self, other_generator: Iterator[PageElement]) -> Iterator[PageElement]:
    """Wrap a generator so that this element is yielded first, followed
    by everything the wrapped generator yields.

    This element is left out when its ``hidden`` attribute is truthy.

    :param other_generator: The iterator whose items follow this element.
    """
    include_self = not self.hidden
    if include_self:
        yield self
    for element in other_generator:
        yield element
@property
def decomposed(self) -> bool:
    """Report whether this PageElement has been decomposed.

    Reads the ``_decomposed`` attribute; a missing or falsy value is
    reported as False.
    """
    flag = getattr(self, "_decomposed", False)
    return flag if flag else False
@_deprecated("next_elements", "4.0.0")
def nextGenerator(self) -> Iterator[PageElement]:
    """Legacy method form of `next_elements`.

    :meta private:
    """
    replacement = self.next_elements
    return replacement
@_deprecated("next_siblings", "4.0.0")
def nextSiblingGenerator(self) -> Iterator[PageElement]:
    """Legacy method form of `next_siblings`.

    :meta private:
    """
    replacement = self.next_siblings
    return replacement
@_deprecated("previous_elements", "4.0.0")
def previousGenerator(self) -> Iterator[PageElement]:
    """Legacy method form of `previous_elements`.

    :meta private:
    """
    replacement = self.previous_elements
    return replacement
@_deprecated("previous_siblings", "4.0.0")
def previousSiblingGenerator(self) -> Iterator[PageElement]:
    """Legacy method form of `previous_siblings`.

    :meta private:
    """
    replacement = self.previous_siblings
    return replacement
@_deprecated("parents", "4.0.0")
def parentGenerator(self) -> Iterator[PageElement]:
    """Legacy method form of `parents`.

    :meta private:
    """
    replacement = self.parents
    return replacement
class NavigableString(str, PageElement):
    """A Python string that is also a node of a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    #: Text placed in front of the string body when it is formatted as
    #: part of a document, such as the '<!--' of an HTML comment.
    PREFIX: str = ""

    #: Text placed after the string body when it is formatted as part
    #: of a document, such as the '-->' of an HTML comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Build a new NavigableString from text or encoded bytes.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't
        know how to handle non-ASCII characters.
        """
        # Only non-str input needs an explicit encoding for str.__new__.
        decode_args = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *decode_args)
        instance.hidden = False
        instance.setup()
        return instance

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """Return a new object of the same class with the same text.

        The copy is built from the text alone, so it is not connected
        to the parse tree.

        :param recursive: This parameter is ignored; it's only defined
            so that NavigableString.__deepcopy__ implements the same
            signature as Tag.__deepcopy__.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Return the constructor arguments for pickling: the plain text."""
        plain_text = str(self)
        return (plain_text,)

    # TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
    # is introduced in 3.8. This can be changed once 3.7 support is dropped.
    def __getitem__(self, key: Union[int|slice]) -> str:  # type:ignore
        """Index or slice like a normal string.

        :raise TypeError: If ``key`` is a string, which suggests the
            caller is treating this object like a Tag.
        """
        if not isinstance(key, str):
            return super().__getitem__(key)
        raise TypeError(
            "string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(
                key.__class__.__name__
            )
        )

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: This property always returns the `NavigableString` it
           was called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Format the string for inclusion in an HTML or XML document.

        The text is passed through `format_string` and then wrapped in
        this class's PREFIX and SUFFIX.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        """
        formatted_body = self.format_string(self, formatter)
        return self.PREFIX + formatted_body + self.SUFFIX

    @property
    def name(self) -> None:
        """Always None: a NavigableString is not a Tag and has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Refuse any attempt to assign NavigableString.name.

        :raise AttributeError: Always.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield this string if it is of a wanted class, optionally
        stripped; otherwise yield nothing.

        This makes it easy for NavigableString to implement methods
        like get_text() as conveniences, creating a consistent
        text-extraction API across all PageElements.

        :param strip: If True, the string is stripped before being
            yielded.

        :param types: A single NavigableString subclass or a collection
            of them. If this string's exact class is not among them,
            the sequence will be empty. When left at the default,
            `Tag.MAIN_CONTENT_STRING_TYPES` is used. None disables the
            type check.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # This is kept in Tag because it's full of subclasses of
            # this class, which aren't defined until later in the file.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        # The exact class is compared rather than using isinstance(),
        # because every special string class subclasses
        # NavigableString. Anyone who's using this feature probably
        # wants generic NavigableStrings but not other stuff.
        own_type = type(self)
        if types is not None:
            if isinstance(types, type):
                # Looking for a single type.
                wanted = own_type is types
            else:
                # Looking for one of a collection of types.
                wanted = own_type in types
            if not wanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        This is defined the way it is for compatibility with
        `Tag.strings`. See `Tag` for information on which strings are
        interesting in a given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()
class PreformattedString(NavigableString):
    """A `NavigableString` whose text is output verbatim rather than
    being rewritten by a formatter.

    This is an abstract class used for special kinds of strings such
    as comments (`Comment`) and CDATA blocks (`CData`).
    """

    PREFIX: str = ""
    SUFFIX: str = ""

    def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
        """Return the string wrapped in the subclass-specific prefix and
        suffix.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters. The string will be passed into the
            `Formatter`, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        formatter_given = formatter is not None
        if formatter_given:
            # Called for its side effects only; the result is discarded.
            self.format_string(self, formatter)
        with_prefix = self.PREFIX + self
        return with_prefix + self.SUFFIX
class CData(PreformattedString):
    """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_.

    `PreformattedString.output_ready` wraps the text in the PREFIX and
    SUFFIX markers defined here.
    """

    #: Opening marker of a CDATA section.
    PREFIX: str = "<![CDATA["
    #: Closing marker of a CDATA section.
    SUFFIX: str = "]]>"
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction.

    `PreformattedString.output_ready` wraps the text in the PREFIX and
    SUFFIX markers defined here. Note that the closing marker is a bare
    ``>``, with no question mark.
    """

    #: Opening marker of a processing instruction.
    PREFIX: str = "<?"
    #: Closing marker of a processing instruction.
    SUFFIX: str = ">"
class XMLProcessingInstruction(ProcessingInstruction):
    """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_.

    Differs from `ProcessingInstruction` only in its closing marker,
    which is ``?>`` instead of ``>``.
    """

    #: Opening marker of an XML processing instruction.
    PREFIX: str = "<?"
    #: Closing marker of an XML processing instruction.
    SUFFIX: str = "?>"
class Comment(PreformattedString):
    """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_.

    `PreformattedString.output_ready` wraps the text in the PREFIX and
    SUFFIX markers defined here.
    """

    #: Opening marker of a comment.
    PREFIX: str = "<!--"
    #: Closing marker of a comment.
    SUFFIX: str = "-->"
class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_.

    `PreformattedString.output_ready` wraps the text in the PREFIX and
    SUFFIX markers defined here.
    """

    #: Opening marker of a declaration.
    PREFIX: str = "<?"
    #: Closing marker of a declaration.
    SUFFIX: str = "?>"
class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_."""

    #: Opening marker of a document type declaration.
    PREFIX: str = "<!DOCTYPE "
    #: Closing marker of a document type declaration, newline included.
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Build a `Doctype` for the given root element name, public ID
        and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        """
        declaration = cls._string_for_name_and_ids(name, pub_id, system_id)
        return Doctype(declaration)

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Compose the text that a Doctype object is built from.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        The result is ``name`` (or an empty string), followed by either
        ``PUBLIC "<pub_id>"`` with an optional quoted system ID, or
        ``SYSTEM "<system_id>"`` when only a system ID is given.
        """
        declaration = name or ""
        has_public_id = pub_id is not None
        has_system_id = system_id is not None
        if has_public_id:
            declaration += ' PUBLIC "%s"' % pub_id
            if has_system_id:
                declaration += ' "%s"' % system_id
        elif has_system_id:
            declaration += ' SYSTEM "%s"' % system_id
        return declaration
class Stylesheet(NavigableString):
    """A `NavigableString` representing the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    Used to distinguish embedded stylesheets from textual content. The
    class adds no attributes or methods of its own; it serves only as a
    type marker.
    """
class Script(NavigableString):
    """A `NavigableString` representing the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    Used to distinguish executable code from textual content. The class
    adds no attributes or methods of its own; it serves only as a type
    marker.
    """
class TemplateString(NavigableString):
    """A `NavigableString` representing a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    The class adds no attributes or methods of its own; it serves only
    as a type marker.
    """
class RubyTextString(NavigableString):
    """A `NavigableString` representing the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    Can be used to distinguish such strings from the strings they're
    annotating. The class adds no attributes or methods of its own; it
    serves only as a type marker.
    """
class RubyParenthesisString(NavigableString):
    """A `NavigableString` representing the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.

    The class adds no attributes or methods of its own; it serves only
    as a type marker.
    """
1764class Tag(PageElement):
1765 """An HTML or XML tag that is part of a parse tree, along with its
1766 attributes, contents, and relationships to other parts of the tree.
1768 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
1769 create a `Tag` object representing the ``<b>`` tag. You can
1770 instantiate `Tag` objects directly, but it's not necessary unless
1771 you're adding entirely new markup to a parsed document. Most of
1772 the constructor arguments are intended for use by the `TreeBuilder`
1773 that's parsing a document.
1775 :param parser: A `BeautifulSoup` object representing the parse tree this
1776 `Tag` will be part of.
1777 :param builder: The `TreeBuilder` being used to build the tree.
1778 :param name: The name of the tag.
1779 :param namespace: The URI of this tag's XML namespace, if any.
1780 :param prefix: The prefix for this tag's XML namespace, if any.
1781 :param attrs: A dictionary of attribute values.
1782 :param parent: The `Tag` to use as the parent of this `Tag`. May be
1783 the `BeautifulSoup` object itself.
1784 :param previous: The `PageElement` that was parsed immediately before
1785 parsing this tag.
1786 :param is_xml: If True, this is an XML tag. Otherwise, this is an
1787 HTML tag.
1788 :param sourceline: The line number where this tag was found in its
1789 source document.
1790 :param sourcepos: The character position within ``sourceline`` where this
1791 tag was found.
1792 :param can_be_empty_element: If True, this tag should be
1793 represented as <tag/>. If False, this tag should be represented
1794 as <tag></tag>.
1795 :param cdata_list_attributes: A dictionary of attributes whose values should
1796 be parsed as lists of strings if they ever show up on this tag.
1797 :param preserve_whitespace_tags: Names of tags whose contents
1798 should have their whitespace preserved if they are encountered inside
1799 this tag.
1800 :param interesting_string_types: When iterating over this tag's
1801 string contents in methods like `Tag.strings` or
1802 `PageElement.get_text`, these are the types of strings that are
1803 interesting enough to be considered. By default,
1804 `NavigableString` (normal strings) and `CData` (CDATA
1805 sections) are the only interesting string subtypes.
1806 :param namespaces: A dictionary mapping currently active
1807 namespace prefixes to URIs, as of the point in the parsing process when
1808 this tag was encountered. This can be used later to
1809 construct CSS selectors.
1811 """
def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Set up a new tag and link it into the tree via ``setup()``.

    When a ``builder`` is supplied, it decides the attribute container
    classes, whether this is XML, and the per-tag settings
    (``can_be_empty_element``, ``cdata_list_attributes``,
    ``preserve_whitespace_tags``, ``interesting_string_types``); the
    corresponding constructor arguments are then ignored. Without a
    builder, those arguments are stored as given.

    :raise ValueError: If ``name`` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # The position is stored exactly as passed in. This used to sit
    # inside an if/else on builder.store_line_numbers whose two branches
    # were identical, so the condition never affected the outcome.
    # NOTE(review): if the intent was to drop positions when the builder
    # does not store line numbers, that needs a deliberate change.
    self.sourceline = sourceline
    self.sourcepos = sourcepos

    # Choose the container classes for attributes and for multi-valued
    # attribute values.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    else:
        if builder is not None and builder.cdata_list_attributes:
            self.attrs = builder._replace_cdata_list_attribute_values(
                self.name, attrs
            )
        else:
            self.attrs = attr_dict_class()
            # Make sure that the values of any multi-valued
            # attributes (e.g. when a Tag is copied) are stored in
            # new lists.
            for k, v in attrs.items():
                if isinstance(v, list):
                    v = v.__class__(v)
                self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        # (attribute_value_list_class was already taken from the builder
        # above, so it is not assigned a second time here.)
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES
# Declared types for the attributes of this class. These are annotations
# only; no value is assigned at class level.

# The BeautifulSoup class associated with this element, if any.
parser_class: Optional[type[BeautifulSoup]]
# Tag name, plus its optional namespace and namespace prefix.
name: str
namespace: Optional[str]
prefix: Optional[str]
# Mapping of attribute names to values.
attrs: _AttributeValues
# Optional position information for this element.
sourceline: Optional[int]
sourcepos: Optional[int]
# Whether this element is known to be XML; the constructor takes this
# from the builder when there is one, else from its is_xml argument.
known_xml: Optional[bool]
# Child elements, in document order.
contents: List[PageElement]
# A hidden tag renders no markup of its own (see _format_tag).
hidden: bool
# String classes considered by _all_strings when no explicit types
# are requested.
interesting_string_types: Optional[Set[Type[NavigableString]]]
# Values taken from the builder when there is one, otherwise from the
# constructor arguments.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]

#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """Build a detached copy of this Tag.

    The copy is created with `copy_self`, so it belongs to no parse
    tree. When ``recursive`` is true, every descendant is copied as
    well and attached to the matching parent inside the copy.

    :param memo: The memo dictionary of the copy protocol. It is
        passed along unchanged to each descendant's ``__deepcopy__``.
    :param recursive: If False, only this Tag is copied and the
        result has no contents.
    :return: The new Tag.
    """
    clone = self.copy_self()
    if not recursive:
        return clone

    # Walk the descendants as a flat stream of events instead of
    # recursing. open_tags[-1] is always the copy that is currently
    # collecting children.
    open_tags: List[Tag] = [clone]
    for event, original in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # The tag on top of the stack is finished.
            open_tags.pop()
            continue

        copied = original.__deepcopy__(memo, recursive=False)
        open_tags[-1].append(copied)
        if event is Tag.START_ELEMENT_EVENT:
            # Upcoming elements belong inside this new copy.
            open_tags.append(cast(Tag, copied))
    return clone
def copy_self(self) -> Self:
    """Make a childless, unattached duplicate of this Tag.

    The duplicate is an instance of the same class, built without a
    parser or builder from this tag's name, namespace, prefix,
    attributes and builder-derived settings. `__deepcopy__` uses this
    as its first step, but it is also useful on its own when the
    contents should not be copied.

    :return: The new, empty Tag.
    """
    tag_class = type(self)
    clone = tag_class(
        None,  # no parser
        None,  # no builder
        self.name,
        self.namespace,
        self.prefix,
        self.attrs,
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    # Carry over state directly as well, whatever the constructor
    # decided to do with its arguments.
    clone.can_be_empty_element = self.can_be_empty_element
    clone.hidden = self.hidden
    return clone
@property
def is_empty_element(self) -> bool:
    """Whether this tag is an empty-element (self-closing) tag.

    Two conditions must both hold:

    * the tag currently has no contents, and
    * ``can_be_empty_element`` is exactly ``True``.

    ``can_be_empty_element`` normally comes from the `TreeBuilder`
    that created the tag. A builder with a designated list of
    empty-element tags (the usual case for HTML) only allows tags on
    that list; a builder without such a list (the usual case for
    XML) allows any tag that has no contents.
    """
    if self.contents:
        # A tag that contains anything is never an empty-element tag.
        return False
    return self.can_be_empty_element is True
@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    ": :meta private:"
    # Deprecated spelling that simply reports the is_empty_element property.
    empty = self.is_empty_element
    return empty
@property
def string(self) -> Optional[str]:
    """The single string inside this `Tag`, if there is exactly one.

    :return: When this `Tag` has exactly one child and that child is
       a `NavigableString`, that string. When the only child is
       another `Tag`, that child's own `Tag.string` (so the lookup
       continues downwards). In every other case -- no children, or
       more than one child -- ``None``.

    An unexpected ``None`` usually means the `Tag` contains more than
    one thing.
    """
    if len(self.contents) == 1:
        only_child = self.contents[0]
        if isinstance(only_child, NavigableString):
            return only_child
        if isinstance(only_child, Tag):
            # Delegate to the nested tag.
            return only_child.string
    return None
@string.setter
def string(self, string: str) -> None:
    """Make a single string the only content of this `Tag`.

    All existing children are removed first. A value that is already
    a `NavigableString` is re-created using its own class; any other
    value is wrapped in a plain `NavigableString`.
    """
    self.clear()
    string_class = (
        string.__class__ if isinstance(string, NavigableString) else NavigableString
    )
    self.append(string_class(string))
#: String classes treated as main content when a tag does not name
#: its own interesting string types.
#:
#: :meta private:
MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}

def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Generate the strings found among this tag's descendants.

    :param strip: If True, each string is stripped before being
        yielded, and strings that are empty after stripping are
        skipped.

    :param types: Either a single NavigableString subclass or a
        collection of them. Only strings whose exact class matches
        are yielded. When left at its default, the tag's
        ``interesting_string_types`` are used, falling back to
        ``MAIN_CONTENT_STRING_TYPES`` (NavigableString and CData)
        when that is None. Passing None explicitly disables the
        filtering.
    """
    if types is self.default:
        types = self.interesting_string_types
        if types is None:
            types = self.MAIN_CONTENT_STRING_TYPES

    # Whether `types` is one class rather than a collection of classes.
    single_class = isinstance(types, type)

    for descendant in self.descendants:
        if not isinstance(descendant, NavigableString):
            continue

        # Matching is on the exact class, so subclasses of a wanted
        # type are not included.
        descendant_type = type(descendant)
        if single_class:
            wanted = descendant_type is types
        else:
            wanted = types is None or descendant_type in types
        if not wanted:
            continue

        if not strip:
            yield descendant
            continue
        stripped = descendant.strip()
        if len(stripped) != 0:
            yield stripped

strings = property(_all_strings)
def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once.

    :param position: The numeric position that should be occupied
       in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert. They end up next
       to each other, in the order given.

    :return The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    for new_child in new_children:
        newly_inserted = self._insert(position, new_child)
        inserted.extend(newly_inserted)
        # One argument can expand into several elements (_insert
        # replaces a BeautifulSoup object with its children), or into
        # none. Advance by the number of elements that were actually
        # placed so the next argument goes right after them instead
        # of landing in the middle of the group.
        position += len(newly_inserted)
    return inserted
def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element into this tag's contents.

    Besides updating ``.contents``, this rewires the sibling links
    (``previous_sibling``/``next_sibling``) and the document-order
    links (``previous_element``/``next_element``) around the new
    child.

    :param position: Index the new child should occupy. Values past
        the end are clamped to the end.
    :param new_child: The element to insert. A plain string is
        wrapped in a `NavigableString`; a `BeautifulSoup` object is
        replaced by its children.
    :return: A list of the elements that were inserted.
    :raise ValueError: If ``new_child`` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # One BeautifulSoup object must never contain another, so
        # insert the children of the incoming object instead and
        # report those.
        return self.insert(position, *list(new_child.contents))

    position = min(position, len(self.contents))

    if hasattr(new_child, "parent") and new_child.parent is not None:
        if new_child.parent is self:
            # The element is already one of our children, so this is
            # really a move.
            current_index = self.index(new_child)
            if current_index < position:
                # Extracting the element below shifts everything
                # after it, including the target slot, down by one.
                position -= 1
            elif current_index == position:
                # It already sits where it was asked to go.
                return [new_child]
        new_child.extract()

    new_child.parent = self

    # Link the new child to whatever precedes it.
    previous_child = None
    if position == 0:
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        previous_child.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    # accept_self=True guarantees a non-None result (at worst it is
    # new_child itself); the cast spares the type checker several
    # complaints below.
    last_of_new = cast(
        PageElement,
        new_child._last_descendant(is_initialized=False, accept_self=True),
    )

    # Link the new child (and its last descendant) to whatever follows.
    if position >= len(self.contents):
        new_child.next_sibling = None

        # Nothing follows inside this tag, so the next element in the
        # document is the next sibling of the nearest ancestor
        # (starting with this tag) that has one. If no ancestor has
        # one, the new content is the end of the document.
        ancestor: Optional[Tag] = self
        following = None
        while ancestor is not None:
            following = ancestor.next_sibling
            if following is not None:
                break
            ancestor = ancestor.parent
        last_of_new.next_element = following
    else:
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if next_child is not None:
            next_child.previous_sibling = new_child
        last_of_new.next_element = next_child

    if last_of_new.next_element is not None:
        last_of_new.next_element.previous_element = last_of_new

    self.contents.insert(position, new_child)
    return [new_child]
def unwrap(self) -> Self:
    """Remove this element from the tree, leaving its children in its place.

    :return: This object, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )

    slot = parent.index(self)
    self.extract(_self_index=slot)

    # Work from a snapshot, because each insert takes the child out
    # of self.contents. Going last-to-first while always inserting at
    # the same index keeps the children in their original order.
    for child in self.contents[::-1]:
        parent.insert(slot, child)
    return self

replace_with_children = unwrap
@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    ": :meta private:"
    # Deprecated spelling that simply forwards to unwrap().
    unwrapped = self.unwrap()
    return unwrapped
def append(self, tag: _InsertableElement) -> PageElement:
    """Add the given `PageElement` after this `Tag`'s current last child.

    :param tag: A PageElement.

    :return The newly appended PageElement.
    """
    end = len(self.contents)
    appended = self.insert(end, tag)
    return appended[0]
def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`.

    :return The list of PageElements that were appended.
    :raise TypeError: If ``tags`` is neither a `Tag`, a single
        element or string, nor an iterable.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        # Copy the list: appending moves each child out of
        # tags.contents while we iterate.
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        tag_list = list(tags)
    else:
        # Without this branch tag_list would be unbound and the loop
        # below would fail with a confusing UnboundLocalError.
        raise TypeError(
            "Tag.extend expects a Tag or an iterable of elements, not %s."
            % type(tags).__name__
        )

    return [self.append(tag) for tag in tag_list]
def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    :param decompose: If this is True, each child is removed with
        `PageElement.decompose` (a more destructive method);
        otherwise `PageElement.extract` is used.
    """
    # Iterate over a snapshot, since removing a child changes
    # self.contents.
    for child in list(self.contents):
        remove = child.decompose if decompose else child.extract
        remove()
def smooth(self) -> None:
    """Merge runs of adjacent strings among this `Tag`'s descendants.

    After many tree modifications a tag can end up with several
    strings next to each other; calling this method afterwards makes
    pretty-printed output look more natural. `PreformattedString`
    objects are never merged.
    """
    # First pass: smooth child tags recursively and note the index of
    # the first string of every adjacent pair that should be joined.
    # Noting indexes avoids copying self.contents, which would be
    # wasteful since usually very few strings are affected.
    final_index = len(self.contents) - 1
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            current.smooth()
        if position == final_index:
            # The last child has no successor to merge with.
            continue
        following = self.contents[position + 1]
        if all(
            isinstance(node, NavigableString)
            and not isinstance(node, PreformattedString)
            for node in (current, following)
        ):
            merge_points.append(position)

    # Second pass: join the pairs back-to-front, so that shrinking
    # .contents never invalidates an index still to be processed.
    for position in reversed(merge_points):
        first = cast(NavigableString, self.contents[position])
        second = cast(NavigableString, self.contents[position + 1])
        second.extract()
        first.replace_with(NavigableString(first + second))
def index(self, element: PageElement) -> int:
    """Return the position of a child of this `Tag`, matched by identity.

    Comparing by identity rather than by value gives the right
    answer even when the `Tag` has several children that compare
    equal as strings.

    :param element: Look for this `PageElement` in this object's contents.
    :raise ValueError: If ``element`` is not one of the children.
    """
    found = next(
        (
            position
            for position, child in enumerate(self.contents)
            if child is element
        ),
        None,
    )
    if found is None:
        raise ValueError("Tag.index: element not in tag")
    return found
def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Look up an attribute of this tag, with a fallback value.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: The attribute's value, or ``default``.
    """
    attributes = self.attrs
    return attributes.get(key, default)
def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Like `get`, but the result is always a list (possibly empty).

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: The value itself when it is already a list; otherwise a
        new list from this tag's ``attribute_value_list_class`` that
        is empty (no value) or holds the single value.
    """
    value = self.get(key, default)
    if isinstance(value, list):
        return value
    if value is None:
        return self.attribute_value_list_class()
    if not isinstance(value, str):
        # For the type checker only; cast() does nothing at runtime.
        value = cast(str, value)
    return self.attribute_value_list_class([value])
def has_attr(self, key: str) -> bool:
    """Report whether this `Tag` has an attribute called ``key``."""
    present = key in self.attrs
    return present
def __hash__(self) -> int:
    """Hash this tag by the string it renders to."""
    rendered = str(self)
    return hash(rendered)
def __getitem__(self, key: str) -> _AttributeValue:
    """Return the value of the ``key`` attribute for ``tag[key]``.

    Unlike `get`, a missing attribute raises an exception.
    """
    attributes = self.attrs
    return attributes[key]
def __iter__(self) -> Iterator[PageElement]:
    "Iterating over a Tag walks its direct children, i.e. its contents."
    children = self.contents
    return iter(children)
def __len__(self) -> int:
    "A Tag is as long as its list of contents."
    children = self.contents
    return len(children)
def __contains__(self, x: Any) -> bool:
    "``x in tag`` checks whether x is among the tag's contents."
    children = self.contents
    return x in children
def __bool__(self) -> bool:
    "A tag is always truthy, even when it has no contents."
    # Without this, truthiness would fall back to __len__ and a
    # childless tag would count as false.
    return True
def __setitem__(self, key: str, value: _AttributeValue) -> None:
    """Store ``value`` as the ``key`` attribute for ``tag[key] = value``."""
    attributes = self.attrs
    attributes[key] = value
def __delitem__(self, key: str) -> None:
    "``del tag[key]`` removes the 'key' attribute; a missing key is ignored."
    attributes = self.attrs
    attributes.pop(key, None)
@overload
def __call__(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def __call__(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def __call__(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Calling a Tag like a function runs its find_all() method.

    For example, tag('a') returns a list of all the A tags found
    within this tag. Every argument is handed to find_all().
    """
    if string is None:
        # No string criterion: this is a search for tags.
        tags: ResultSet[Tag] = self.find_all(
            name, attrs, recursive, None, limit, _stacklevel, **kwargs
        )
        return tags

    if name is not None or attrs is not None or kwargs:
        # A string criterion combined with tag criteria.
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        return cast(ResultSet[Tag], self.find_all(name, attrs, recursive, string, limit, _stacklevel, **kwargs))  # type: ignore

    # Only a string criterion: this is a search for strings.
    strings: ResultSet[NavigableString] = self.find_all(
        None, None, recursive, string, limit, _stacklevel, **kwargs
    )
    return strings
def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Treat tag.subtag as shorthand for tag.find(name="subtag").

    Python only calls this for attributes that normal lookup failed
    to find.

    :raise AttributeError: For names starting with a double
        underscore, and for ``contents``.
    """
    if len(subtag) > 3 and subtag.endswith("Tag"):
        # BS3 spelling: soup.aTag meant soup.find("a").
        tag_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % {"name": tag_name},
            DeprecationWarning,
            stacklevel=2,
        )
        return self.find(tag_name)

    # Names starting with a double underscore are never treated as
    # tag names, and "contents" is special-cased to avoid recursion.
    if subtag.startswith("__") or subtag == "contents":
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, subtag)
        )
    return self.find(subtag)
def __eq__(self, other: Any) -> bool:
    """Compare two Tags by structure.

    Returns true iff `other` is a Tag with the same name, the same
    attributes, and the same contents (recursively) as this one.
    """
    if self is other:
        return True
    if not isinstance(other, Tag):
        return False

    comparable = (
        hasattr(other, "name")
        and hasattr(other, "attrs")
        and hasattr(other, "contents")
    )
    if not comparable:
        return False
    if (
        self.name != other.name
        or self.attrs != other.attrs
        or len(self) != len(other)
    ):
        return False

    # The lengths match, so the children can be compared pairwise.
    return not any(
        mine != theirs for mine, theirs in zip(self.contents, other.contents)
    )
def __ne__(self, other: Any) -> bool:
    """The exact opposite of ``==``: true iff this Tag is not equal
    to `other` in the sense defined by __eq__."""
    equal = self == other
    return not equal
def __repr__(self) -> str:
    """Render this `Tag` as a string, using decode() with its defaults."""
    rendered = self.decode()
    return rendered

# str() and the legacy __unicode__ hook share the same rendering.
__str__ = __unicode__ = __repr__
def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    The tag is first rendered as a Unicode string with `decode`, and
    that string is then encoded.

    :param encoding: The encoding to use when converting to
       a bytestring. This may also affect the text of the document,
       specifically any encoding declarations within the document.
    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        :py:meth:`str.encode` and its value should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    rendered = self.decode(indent_level, encoding, formatter)
    return rendered.encode(encoding, errors)
def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing. ``None`` turns pretty-printing off.
    :param eventual_encoding: The encoding you intend to use when
       converting the string to a bytestring. decode() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
       naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    """
    rendered = []

    # Resolve a formatter name once, up front, so the lookup is not
    # repeated for every element.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    if indent_level is True:
        indent_level = 0

    # The tag that switched on "string literal mode", if any. While
    # it is open, its children are emitted verbatim instead of being
    # pretty-printed. The mode starts right after that tag opens and
    # ends right before it closes, so whitespace still surrounds the
    # tag itself.
    literal_owner = None

    for event, element in self._event_stream(iterator):
        is_start = event is Tag.START_ELEMENT_EVENT
        is_end = event is Tag.END_ELEMENT_EVENT

        # Step 1: the raw markup for this event.
        if is_start or event is Tag.EMPTY_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif is_end:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Step 2: decide on whitespace around the piece. Some tags,
        # like <pre> and <script>, can't be prettified, because added
        # whitespace would change the meaning of their content.
        # Default: whitespace on both sides outside string literal
        # mode, none at all inside it.
        indent_before = indent_after = not literal_owner

        # The exceptions are the tags that enter or leave string
        # literal mode.
        if (
            is_start
            and not literal_owner
            and not cast(Tag, element)._should_pretty_print()
        ):
            # Entering the mode: whitespace before this tag only. The
            # mode lasts until this very tag is closed.
            indent_before, indent_after = True, False
            literal_owner = element
        elif is_end and element is literal_owner:
            # Leaving the mode: whitespace after this tag only.
            indent_before, indent_after = False, True
            literal_owner = None

        # Step 3: apply the whitespace when pretty-printing.
        if indent_level is not None:
            if indent_before or indent_after:
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            if is_start:
                indent_level += 1
        rendered.append(piece)
    return "".join(rendered)
class _TreeTraversalEvent:
    """Marker type for the events produced while walking a parse tree.

    Instances carry no data; they are only told apart from each
    other.

    :meta private:
    """

# One distinct marker per kind of event yielded by _event_stream.
START_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    Every Tag that is not an empty-element tag produces a
    START_ELEMENT_EVENT and, once everything inside it has been
    seen, a matching END_ELEMENT_EVENT. Empty-element tags produce a
    single EMPTY_ELEMENT_EVENT, and everything else a
    STRING_ELEMENT_EVENT.

    :param iterator: An alternate iterator to use when traversing
     the tree. Defaults to this element followed by its descendants.
    """
    # Tags that have been opened but not yet closed, innermost last.
    tag_stack: List[Tag] = []

    iterator = iterator or self.self_and_descendants

    for c in iterator:
        # If the parent of the element we're about to yield is not
        # the tag currently on the stack, it means that the tag on
        # the stack closed before this element appeared.
        #
        # This must be an identity check. Comparing with != would go
        # through Tag.__eq__, which compares whole subtrees
        # recursively: slow on large documents, and able to exhaust
        # the recursion limit on deeply nested ones.
        while tag_stack and c.parent is not tag_stack[-1]:
            now_closed_tag = tag_stack.pop()
            yield Tag.END_ELEMENT_EVENT, now_closed_tag

        if isinstance(c, Tag):
            if c.is_empty_element:
                yield Tag.EMPTY_ELEMENT_EVENT, c
            else:
                yield Tag.START_ELEMENT_EVENT, c
                tag_stack.append(c)
        else:
            yield Tag.STRING_ELEMENT_EVENT, c

    # The input is exhausted; close whatever is still open.
    while tag_stack:
        now_closed_tag = tag_stack.pop()
        yield Tag.END_ELEMENT_EVENT, now_closed_tag
def _indent_string(
    self,
    s: str,
    indent_level: int,
    formatter: Formatter,
    indent_before: bool,
    indent_after: bool,
) -> str:
    """Surround a string with pretty-printing whitespace.

    :param s: The string to amend with whitespace.
    :param indent_level: The indentation level; the formatter's
        ``indent`` string is repeated this many times in front of
        the string.
    :param formatter: Supplies the ``indent`` string.
    :param indent_before: Whether or not to add whitespace
        before the string.
    :param indent_after: Whether or not to add whitespace
        (a newline) after the string.
    """
    leading = formatter.indent * indent_level if indent_before and indent_level else ""
    trailing = "\n" if indent_after else ""
    return leading + s + trailing
def _format_tag(
    self, eventual_encoding: str, formatter: Formatter, opening: bool
) -> str:
    """Render the opening or closing markup of this tag on its own.

    :param eventual_encoding: The encoding the document will
        eventually be converted to; substituted into attribute values
        that are `AttributeValueWithCharsetSubstitution` objects.
    :param formatter: Supplies the attributes to output, their
        escaping and quoting, and the void-element closing prefix.
    :param opening: True for the opening tag (which includes the
        attributes), False for the closing tag.
    :return: The markup, or an empty string for a hidden tag.
    """
    if self.hidden:
        # A hidden tag outputs nothing itself; its contents are
        # rendered separately.
        return ""

    # Pieces of the tag, in output order (the "<" and ">" are added
    # at the very end).
    closing_slash = "" if opening else "/"
    prefix = self.prefix + ":" if self.prefix else ""

    # Attributes appear on opening tags only.
    attribute_string = ""
    if opening:
        rendered_attributes = []
        for key, val in formatter.attributes(self):
            if val is None:
                # An attribute without a value is output as its bare name.
                rendered_attributes.append(key)
                continue

            if isinstance(val, (list, tuple)):
                val = " ".join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.substitute_encoding(eventual_encoding)

            text = formatter.attribute_value(val)
            rendered_attributes.append(
                str(key) + "=" + formatter.quoted_attribute_value(text)
            )
        if rendered_attributes:
            attribute_string = " " + " ".join(rendered_attributes)

    # A void element may get a closing slash of its own (as in an
    # XML document).
    void_element_closing_slash = ""
    if self.is_empty_element:
        void_element_closing_slash = formatter.void_element_close_prefix or ""

    return (
        "<"
        + closing_slash
        + prefix
        + self.name
        + attribute_string
        + void_element_closing_slash
        + ">"
    )
def _should_pretty_print(self, indent_level: Optional[int] = 1) -> bool:
    """Should this tag be pretty-printed?

    Most of them should, but some (such as <pre> in HTML
    documents) should not.

    :param indent_level: The current indentation level, or None when
        pretty-printing is turned off.
    :return: False if ``indent_level`` is None, or if this tag's name
        is listed in ``preserve_whitespace_tags``; True otherwise.
    """
    if indent_level is None:
        # No indentation level means no pretty-printing at all.
        return False
    preserved = self.preserve_whitespace_tags
    # An empty or missing collection means no tag needs protecting.
    return not preserved or self.name not in preserved
@overload
def prettify(
    self,
    encoding: None = None,
    formatter: _FormatterOrName = "minimal",
) -> str:
    ...

@overload
def prettify(
    self,
    encoding: _Encoding,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    ...

def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Return a pretty-printed rendering of this `Tag`.

    Rendering starts at indentation level 0.

    :param encoding: Encoding for a bytestring result; leave as None
        to get a Unicode string instead.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A bytestring when ``encoding`` is given, otherwise a
        string.
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)
def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render what is inside this tag as a Unicode string.

    This delegates to ``decode()``, passing ``self.descendants`` as
    the iterator of elements to render.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param eventual_encoding: The encoding the output is destined
        for. No encoding is performed here; the value is needed so
        that a real encoding can be substituted into any encoding
        declaration the document contains (e.g. in a <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    :return: The rendered contents.
    """
    inner_elements = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=inner_elements
    )
def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render what is inside this element as a bytestring.

    The contents are first rendered as text by ``decode_contents()``
    and the text is then encoded with ``str.encode(encoding)``.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :return: The encoded contents.
    """
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)
@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated method for BS3 compatibility.

    Forwards to ``encode_contents()``. ``indentLevel`` is only
    honored when ``prettyPrint`` is true; otherwise None is passed
    as the indent level.

    :meta private:
    """
    effective_level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_level, encoding=encoding)
2934 # Soup methods
@overload
def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None=None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

@overload
def find(
    self,
    name: None=None,
    attrs: None=None,
    recursive: bool = True,
    string: _StrainableString="",
) -> _AtMostOneNavigableString:
    ...

def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
    """Return the first `PageElement` beneath this one that matches
    the given criteria, or None if nothing matches.

    This runs `find_all` with a limit of 1 and returns the single
    result, if there is one.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    """
    # The three paths below differ only in the static type of the
    # result; each one performs a find_all() with limit=1.
    if string is None:
        # No string filter: the search is for tags.
        tags = self.find_all(name, attrs, recursive, None, 1, _stacklevel=3, **kwargs)
        return cast(Tag, tags[0]) if tags else None

    if name is not None or attrs is not None or kwargs:
        # A string filter combined with tag criteria.
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        elements = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs) # type:ignore
        return cast(Tag, elements[0]) if elements else None

    # Only a string filter: the search is for strings.
    strings = self.find_all(None, None, recursive, string, 1, _stacklevel=3, **kwargs)
    return cast(NavigableString, strings[0]) if strings else None

findChild = _deprecated_function_alias("findChild", "find", "3.0.0")
@overload
def find_all(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def find_all(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def find_all(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Return every `PageElement` beneath this one that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    :return: The matching elements, as a `ResultSet`.
    """
    # Search all descendants, or only the direct children.
    candidates = self.descendants if recursive else self.children
    # Passed down to _find_all() one level deeper than it was received.
    _stacklevel += 1

    # The three paths below differ only in the static type of the
    # result; each one hands the effective criteria to _find_all().
    if string is None:
        # If string is None, we're searching for tags.
        return cast(ResultSet[Tag], self._find_all(
            name, attrs, None, limit, candidates, _stacklevel=_stacklevel, **kwargs
        ))

    if name is not None or attrs is not None or kwargs:
        # A string filter combined with tag criteria.
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        return cast(ResultSet[Tag], self._find_all(
            name, attrs, string, limit, candidates, _stacklevel=_stacklevel, **kwargs
        ))

    # Otherwise, we're searching for strings.
    return cast(ResultSet[NavigableString], self._find_all(
        None, None, string, limit, candidates, _stacklevel=_stacklevel, **kwargs
    ))

findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")
3077 # Generator methods
@property
def children(self) -> Iterator[PageElement]:
    """Iterate over the direct children of this `PageElement`, i.e.
    the items of ``self.contents``, in order.
    """
    return (child for child in self.contents)
@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` and its descendants.

    The descendants are taken from `Tag.descendants`, which walks the
    tree by following ``next_element`` links. That is document order
    (depth-first), not breadth-first.
    """
    return self._self_and(self.descendants)
@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`.

    The walk starts at the first child and follows ``next_element``
    links until it moves past this tag's last descendant. Elements
    therefore come out in document order (depth-first), not
    breadth-first. Nothing is yielded if the tag has no contents.
    """
    if not self.contents:
        return
    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, last_descendant will end up
    # as self.
    last_descendant = cast(PageElement, self._last_descendant(accept_self=True))
    # The first element that is no longer beneath this tag.
    stop_node = last_descendant.next_element
    current: _AtMostOneElement = self.contents[0]
    while current is not stop_node and current is not None:
        # Look up the successor before yielding; presumably so that
        # the walk can continue even if the consumer modifies
        # `current` while it holds it.
        successor = current.next_element
        yield current
        current = successor
3108 # CSS selector code
def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Run a CSS selector against the current element and return a
    single result.

    Delegates to ``select_one()`` on the object returned by
    ``self.css``.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)
def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Run a CSS selector against the current element and return all
    results.

    This uses the SoupSieve library, by delegating to ``select()`` on
    the object returned by ``self.css``.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)
@property
def css(self) -> CSS:
    """An interface to the CSS selector API for this element.

    A new `CSS` object wrapping this element is built on every access.
    """
    selector_api = CSS(self)
    return selector_api
3156 # Old names for backwards compatibility
@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns ``self.children``.

    :meta private:
    """
    direct_children = self.children
    return direct_children
@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns ``self.descendants``.

    :meta private:
    """
    all_descendants = self.descendants
    return all_descendants
@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated method; forwards to ``has_attr()``.

    This was kind of misleading because has_key() checked attributes
    while the ``in`` operator checks contents.

    has_key() is gone in Python 3, anyway.

    :meta private:
    """
    attribute_present = self.has_attr(key)
    return attribute_present
_PageElementT = TypeVar("_PageElementT", bound=PageElement)

class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects gathered by matching an
    :py:class:`ElementFilter` against a parse tree. Basically, a list of
    search results that also remembers the filter that produced it.
    """

    # The filter that produced these results, if any.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Build the list from ``result`` and record ``source``.

        :param source: The filter that produced the results, or None.
        :param result: The matching elements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        This is only reached for attributes that normal lookup did not
        find, and it always raises.

        :raises AttributeError: Always.
        """
        message = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(message)
3207# Now that all the classes used by SoupStrainer have been defined,
3208# import SoupStrainer itself into this module to preserve the
3209# backwards compatibility of anyone who imports
3210# bs4.element.SoupStrainer.
3211from bs4.filter import SoupStrainer # noqa: E402