Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 41%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from __future__ import annotations
3# Use of this source code is governed by the MIT license.
4__license__ = "MIT"
6import inspect
7import re
8import warnings
10from bs4.css import CSS
11from bs4._deprecation import (
12 _deprecated,
13 _deprecated_alias,
14 _deprecated_function_alias,
15)
16from bs4.formatter import (
17 Formatter,
18 HTMLFormatter,
19 XMLFormatter,
20)
21from bs4._warnings import AttributeResemblesVariableWarning
23from typing import (
24 Any,
25 Callable,
26 Dict,
27 Generic,
28 Iterable,
29 Iterator,
30 List,
31 Mapping,
32 MutableSequence,
33 Optional,
34 Pattern,
35 Set,
36 TYPE_CHECKING,
37 Tuple,
38 Type,
39 TypeVar,
40 Union,
41 cast,
42 overload,
43)
44from typing_extensions import (
45 Self,
46 TypeAlias,
47)
49if TYPE_CHECKING:
50 from bs4 import BeautifulSoup
51 from bs4.builder import TreeBuilder
52 from bs4.filter import ElementFilter
53 from bs4.formatter import (
54 _EntitySubstitutionFunction,
55 _FormatterOrName,
56 )
57 from bs4._typing import (
58 _AtMostOneElement,
59 _AtMostOneNavigableString,
60 _AtMostOneTag,
61 _AttributeValue,
62 _AttributeValues,
63 _Encoding,
64 _InsertableElement,
65 _OneElement,
66 _QueryResults,
67 _RawAttributeValue,
68 _RawAttributeValues,
69 _RawOrProcessedAttributeValues,
70 _SomeNavigableStrings,
71 _SomeTags,
72 _StrainableAttribute,
73 _StrainableAttributes,
74 _StrainableElement,
75 _StrainableString,
76 )
# Either a single NavigableString subclass or an iterable of such
# subclasses.
_OneOrMoreStringTypes: TypeAlias = Union[
    Type["NavigableString"], Iterable[Type["NavigableString"]]
]

# The type accepted for the ``name`` argument of the find_* methods:
# either a strainable element or an ElementFilter.
_FindMethodName: TypeAlias = Union["_StrainableElement", "ElementFilter"]
# Same as _FindMethodName, but None is also allowed.
_OptionalFindMethodName: TypeAlias = Optional[_FindMethodName]
# Module-level attributes that have been deprecated, keyed by attribute
# name. Each value is the warning template shown when the attribute is
# accessed.
# See https://peps.python.org/pep-0562/
_deprecated_names = {
    "whitespace_re": "The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy."
}
#: :meta private:
_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+")


def __getattr__(name: str) -> Any:
    """Resolve access to a deprecated module-level attribute (PEP 562).

    :param name: The attribute name that normal module lookup failed to find.
    :return: The value stored in the module global ``_deprecated_<name>``.
    :raises AttributeError: If ``name`` is not a known deprecated attribute.
    """
    template = _deprecated_names.get(name)
    if template is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # stacklevel=2 attributes the warning to the code that accessed the
    # attribute rather than to this function.
    warnings.warn(template.format(name=name), DeprecationWarning, stacklevel=2)
    return globals()[f"_deprecated_{name}"]
103#: Documents output by Beautiful Soup will be encoded with
104#: this encoding unless you specify otherwise.
105DEFAULT_OUTPUT_ENCODING: str = "utf-8"
#: A regular expression that matches runs of non-whitespace characters; it can be used (e.g. with ``findall``) to split a string on whitespace.
108nonwhitespace_re: Pattern[str] = re.compile(r"\S+")
110#: These encodings are recognized by Python (so `Tag.encode`
111#: could theoretically support them) but XML and HTML don't recognize
112#: them (so they should not show up in an XML or HTML document as that
113#: document's encoding).
114#:
115#: If an XML document is encoded in one of these encodings, no encoding
116#: will be mentioned in the XML declaration. If an HTML document is
117#: encoded in one of these encodings, and the HTML document has a
118#: <meta> tag that mentions an encoding, the encoding will be given as
119#: the empty string.
120#:
121#: Source:
122#: Python documentation, `Python Specific Encodings <https://docs.python.org/3/library/codecs.html#python-specific-encodings>`_
PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
class NamespacedAttribute(str):
    """A string naming an attribute that may carry a namespace prefix.

    For the attribute ``xml:lang``, the string value is ``"xml:lang"``,
    while `prefix` is ``"xml"`` and `name` is ``"lang"``. The namespace
    passed to the constructor is stored unchanged in `namespace`.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        """Build the attribute name from its parts.

        :param prefix: The namespace prefix, if any.
        :param name: The local name. An empty or missing name is stored
            as None.
        :param namespace: The namespace, stored as given.
        """
        # An empty name is treated as "no name": this is the default
        # namespace, whose name "has no value" per
        # https://www.w3.org/TR/xml-names/#defaulting
        local_name = name or None

        if local_name is None:
            text = prefix
        elif not prefix:
            # Not really namespaced.
            text = local_name
        else:
            text = prefix + ":" + local_name

        attribute = str.__new__(cls, text)
        attribute.prefix = prefix
        attribute.name = local_name
        attribute.namespace = namespace
        return attribute
class AttributeValueWithCharsetSubstitution(str):
    """An abstract class standing in for a character encoding specified
    inside an HTML ``<meta>`` tag.

    Subclasses exist for each place such a character encoding might be
    found: either inside the ``charset`` attribute
    (`CharsetMetaAttributeValue`) or inside the ``content`` attribute
    (`ContentMetaAttributeValue`)

    This allows Beautiful Soup to replace that part of the HTML file
    with a different encoding when outputting a tree as a string.
    """

    # The original, un-encoded value of the ``content`` attribute.
    #: :meta private:
    original_value: str

    def substitute_encoding(self, eventual_encoding: str) -> str:
        """Do whatever's necessary in this implementation-specific
        portion of an HTML document to substitute in a specific encoding.

        :param eventual_encoding: The name of the encoding to substitute in.
        :raises NotImplementedError: Always; this base implementation
            does nothing else.
        """
        raise NotImplementedError()
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Placeholder for the value of a ``<meta>`` tag's ``charset``
    attribute.

    Parsing ``<meta charset="utf8">`` produces one of these objects as
    the value of ``charset``. When the document is encoded, the
    attribute's value is replaced with the name of the target encoding.
    """

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from.

        :param original_value: The attribute value found in the markup.
        """
        # The original value is not needed for substitution, but it
        # might be useful for the user to know.
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the text to use as the ``charset`` value when the
        document is encoded with ``eventual_encoding``.

        :param eventual_encoding: The encoding the document is being
            encoded to.
        :return: The encoding name, or the empty string if the encoding
            is listed in `PYTHON_SPECIFIC_ENCODINGS`.
        """
        return "" if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS else eventual_encoding
class AttributeValueList(List[str]):
    """Class for the list used to hold the values of attributes which
    have multiple values (such as HTML's 'class'). It's just a regular
    list of strings and adds no behavior of its own, but you can
    subclass it and pass it in to the TreeBuilder constructor as
    attribute_value_list_class, to have your subclass instantiated
    instead.
    """
class AttributeDict(Dict[Any,Any]):
    """Superclass for the dictionary used to hold a tag's
    attributes. You can use this, but it's just a regular dict with no
    special logic; it defines no methods of its own.
    """
class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which normalizes
    incoming values for consistency with the XML spec.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Store an attribute value, converting common non-string values.

        XML attributes may have "any literal string as a value", so:

        * ``None`` is stored as the empty string.
        * ``int`` and ``float`` values are stored as ``str(value)``.
        * ``bool`` values are stored unchanged.
        * Everything else is stored unchanged.
        """
        if value is None:
            value = ""
        elif isinstance(value, (int, float)) and not isinstance(value, bool):
            # bool is excluded on purpose: XML does not define any rules
            # for boolean attributes, so the old Beautiful Soup behavior
            # (a bool that gets converted to a string on output) is
            # preserved rather than guessing what the value should be.
            #
            # Only numbers are converted. Converting _every_ value into
            # a plain string would be dangerous, since a value may be a
            # more sophisticated string-like object
            # (e.g. CharsetMetaAttributeValue).
            value = str(value)

        super().__setitem__(key, value)
class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply
        with the HTML spec.

        * ``False`` and ``None`` remove the attribute (if present) and
          store nothing.
        * ``True`` stores the attribute's own name as its value (the
          unqualified name, for a `NamespacedAttribute` key).
        * ``int`` and ``float`` values, including ``0`` and ``0.0``, are
          stored as ``str(value)``.
        * Everything else is stored unchanged.
        """
        # Use identity checks rather than ``value in (False, None)``.
        # The membership test compares with ==, and since 0 == False it
        # would treat the numbers 0 and 0.0 as "false" and silently drop
        # the attribute instead of storing "0".
        if value is False or value is None:
            # 'The values "true" and "false" are not allowed on
            # boolean attributes. To represent a false value, the
            # attribute has to be omitted altogether.'
            if key in self:
                del self[key]
            return
        if isinstance(value, bool):
            # 'If the [boolean] attribute is present, its value must
            # either be the empty string or a value that is an ASCII
            # case-insensitive match for the attribute's canonical
            # name, with no leading or trailing whitespace.'
            #
            # [fixme] It's not clear to me whether "canonical name"
            # means fully-qualified name, unqualified name, or
            # (probably not) name with namespace prefix. For now I'm
            # going with unqualified name.
            if isinstance(key, NamespacedAttribute):
                value = key.name
            else:
                value = key
        elif isinstance(value, (int, float)):
            # See note in XMLAttributeDict for the reasoning why we
            # only do this to numbers.
            value = str(value)
        super().__setitem__(key, value)
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a ``<meta>`` tag's ``content``
    attribute.

    When Beautiful Soup parses the markup:
     ``<meta http-equiv="content-type" content="text/html; charset=utf8">``

    The value of the ``content`` attribute will become one of these objects.

    If the document is later encoded to an encoding other than UTF-8, its
    ``<meta>`` tag will mention the new encoding instead of ``utf8``.
    """

    #: Match the 'charset' argument inside the 'content' attribute
    #: of a <meta> tag.
    #: :meta private:
    CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from.

        :param original_value: The ``content`` attribute value found in
            the markup.
        """
        # No regular-expression search is done here: its result was
        # never used, and CHARSET_RE is applied to original_value in
        # substitute_encoding.
        obj = str.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """When an HTML document is being encoded to a given encoding, the
        value of the ``charset=`` in a ``<meta>`` tag's ``content`` becomes
        the name of the encoding.

        :param eventual_encoding: The encoding the document is being
            encoded to.
        :return: The original value with each ``charset=...`` argument
            pointing at ``eventual_encoding``, or with those arguments
            removed entirely if the encoding is listed in
            `PYTHON_SPECIFIC_ENCODINGS`.
        """
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            return self.CHARSET_RE.sub("", self.original_value)

        def rewrite(match: re.Match[str]) -> str:
            # Group 1 is everything up to and including "charset=";
            # only the encoding name after it is replaced.
            return match.group(1) + eventual_encoding

        return self.CHARSET_RE.sub(rewrite, self.original_value)
356class PageElement(object):
357 """An abstract class representing a single element in the parse tree.
359 `NavigableString`, `Tag`, etc. are all subclasses of
360 `PageElement`. For this reason you'll see a lot of methods that
361 return `PageElement`, but you'll never see an actual `PageElement`
362 object. For the most part you can think of `PageElement` as
363 meaning "a `Tag` or a `NavigableString`."
364 """
366 #: In general, we can't tell just by looking at an element whether
367 #: it's contained in an XML document or an HTML document. But for
368 #: `Tag` objects (q.v.) we can store this information at parse time.
369 #: :meta private:
370 known_xml: Optional[bool] = None
372 #: Whether or not this element has been decomposed from the tree
373 #: it was created in.
374 _decomposed: bool
376 parent: Optional[Tag]
377 next_element: _AtMostOneElement
378 previous_element: _AtMostOneElement
379 next_sibling: _AtMostOneElement
380 previous_sibling: _AtMostOneElement
382 #: Whether or not this element is hidden from generated output.
383 #: Only the `BeautifulSoup` object itself is hidden.
384 hidden: bool = False
    def setup(
        self,
        parent: Optional[Tag] = None,
        previous_element: _AtMostOneElement = None,
        next_element: _AtMostOneElement = None,
        previous_sibling: _AtMostOneElement = None,
        next_sibling: _AtMostOneElement = None,
    ) -> None:
        """Sets up the initial relations between this element and
        other elements.

        Each non-None neighbor is also updated to point back at this
        element.

        :param parent: The parent of this element.

        :param previous_element: The element parsed immediately before
            this one.

        :param next_element: The element parsed immediately after
            this one.

        :param previous_sibling: The most recently encountered element
            on the same level of the parse tree as this one. If omitted
            and the parent already has contents, the last item of the
            parent's contents is used.

        :param next_sibling: The next element to be encountered
            on the same level of the parse tree as this one.
        """
        self.parent = parent

        self.previous_element = previous_element
        if self.previous_element is not None:
            self.previous_element.next_element = self

        self.next_element = next_element
        if self.next_element is not None:
            self.next_element.previous_element = self

        self.next_sibling = next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self

        # No explicit previous sibling: fall back to whatever is
        # currently last in the parent's contents.
        if (
            previous_sibling is None
            and self.parent is not None
            and self.parent.contents
        ):
            previous_sibling = self.parent.contents[-1]

        self.previous_sibling = previous_sibling
        if self.previous_sibling is not None:
            self.previous_sibling.next_sibling = self
436 def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str:
437 """Format the given string using the given formatter.
439 :param s: A string.
440 :param formatter: A Formatter object, or a string naming one of the standard formatters.
441 """
442 if formatter is None:
443 return s
444 if not isinstance(formatter, Formatter):
445 formatter = self.formatter_for_name(formatter)
446 output = formatter.substitute(s)
447 return output
449 def formatter_for_name(
450 self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction]
451 ) -> Formatter:
452 """Look up or create a Formatter for the given identifier,
453 if necessary.
455 :param formatter: Can be a `Formatter` object (used as-is), a
456 function (used as the entity substitution hook for an
457 `bs4.formatter.XMLFormatter` or
458 `bs4.formatter.HTMLFormatter`), or a string (used to look
459 up an `bs4.formatter.XMLFormatter` or
460 `bs4.formatter.HTMLFormatter` in the appropriate registry.
462 """
463 if isinstance(formatter_name, Formatter):
464 return formatter_name
465 c: type[Formatter]
466 registry: Mapping[Optional[str], Formatter]
467 if self._is_xml:
468 c = XMLFormatter
469 registry = XMLFormatter.REGISTRY
470 else:
471 c = HTMLFormatter
472 registry = HTMLFormatter.REGISTRY
473 if callable(formatter_name):
474 return c(entity_substitution=formatter_name)
475 return registry[formatter_name]
477 @property
478 def _is_xml(self) -> bool:
479 """Is this element part of an XML tree or an HTML tree?
481 This is used in formatter_for_name, when deciding whether an
482 XMLFormatter or HTMLFormatter is more appropriate. It can be
483 inefficient, but it should be called very rarely.
484 """
485 if self.known_xml is not None:
486 # Most of the time we will have determined this when the
487 # document is parsed.
488 return self.known_xml
490 # Otherwise, it's likely that this element was created by
491 # direct invocation of the constructor from within the user's
492 # Python code.
493 if self.parent is None:
494 # This is the top-level object. It should have .known_xml set
495 # from tree creation. If not, take a guess--BS is usually
496 # used on HTML markup.
497 return getattr(self, "is_xml", False)
498 return self.parent._is_xml
500 nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0")
501 previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0")
    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """Placeholder for deep copying; this base implementation
        always raises.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError()
506 def __copy__(self) -> Self:
507 """A copy of a PageElement can only be a deep copy, because
508 only one PageElement can occupy a given place in a parse tree.
509 """
510 return self.__deepcopy__({})
512 default: Iterable[type[NavigableString]] = tuple() #: :meta private:
    def _all_strings(
        self, strip: bool = False, types: Iterable[type[NavigableString]] = default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This is implemented differently in `Tag` and `NavigableString`.

        :param strip: Whether the strings should be stripped.
        :param types: The string classes to consider.
        :raises NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()
523 @property
524 def stripped_strings(self) -> Iterator[str]:
525 """Yield all interesting strings in this PageElement, stripping them
526 first.
528 See `Tag` for information on which strings are considered
529 interesting in a given context.
530 """
531 for string in self._all_strings(True):
532 yield string
534 def get_text(
535 self,
536 separator: str = "",
537 strip: bool = False,
538 types: Iterable[Type[NavigableString]] = default,
539 ) -> str:
540 """Get all child strings of this PageElement, concatenated using the
541 given separator.
543 :param separator: Strings will be concatenated using this separator.
545 :param strip: If True, strings will be stripped before being
546 concatenated.
548 :param types: A tuple of NavigableString subclasses. Any
549 strings of a subclass not found in this list will be
550 ignored. Although there are exceptions, the default
551 behavior in most cases is to consider only NavigableString
552 and CData objects. That means no comments, processing
553 instructions, etc.
555 :return: A string.
556 """
557 return separator.join([s for s in self._all_strings(strip, types=types)])
559 getText = get_text
    @property
    def text(self) -> str:
        """The result of calling `get_text` with its default arguments."""
        return self.get_text()
565 def replace_with(self, *args: _InsertableElement) -> Self:
566 """Replace this `PageElement` with one or more other elements,
567 objects, keeping the rest of the tree the same.
569 :return: This `PageElement`, no longer part of the tree.
570 """
571 if self.parent is None:
572 raise ValueError(
573 "Cannot replace one element with another when the "
574 "element to be replaced is not part of a tree."
575 )
576 if len(args) == 1 and args[0] is self:
577 # Replacing an element with itself is a no-op.
578 return self
579 if any(x is self.parent for x in args):
580 raise ValueError("Cannot replace a Tag with its parent.")
581 old_parent = self.parent
582 my_index = self.parent.index(self)
583 self.extract(_self_index=my_index)
584 for idx, replace_with in enumerate(args, start=my_index):
585 old_parent.insert(idx, replace_with)
586 return self
588 replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0")
590 def wrap(self, wrap_inside: Tag) -> Tag:
591 """Wrap this `PageElement` inside a `Tag`.
593 :return: ``wrap_inside``, occupying the position in the tree that used
594 to be occupied by this object, and with this object now inside it.
595 """
596 me = self.replace_with(wrap_inside)
597 wrap_inside.append(me)
598 return wrap_inside
    def extract(self, _self_index: Optional[int] = None) -> Self:
        """Destructively rips this element out of the tree.

        The element keeps its own descendants; only its links to the
        surrounding tree (parent, siblings, and the elements before and
        after its subtree) are severed.

        :param _self_index: The location of this element in its parent's
           .contents, if known. Passing this in allows for a performance
           optimization.

        :return: this `PageElement`, no longer part of the tree.
        """
        if self.parent is not None:
            if _self_index is None:
                _self_index = self.parent.index(self)
            del self.parent.contents[_self_index]

        # Find the two elements that would be next to each other if
        # this element (and any children) hadn't been parsed. Connect
        # the two.
        last_child = self._last_descendant()

        # last_child can't be None because accept_self defaults to True
        # in _last_descendant. Worst case, last_child will be
        # self. Making this cast removes several mypy complaints later
        # on as we manipulate last_child.
        last_child = cast(PageElement, last_child)
        next_element = last_child.next_element

        # Splice the element before this subtree to the element after
        # it, skipping the case where both are the same object.
        if self.previous_element is not None:
            if self.previous_element is not next_element:
                self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        # Detach from the parent and join the former siblings together.
        self.parent = None
        if (
            self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling
        ):
            self.previous_sibling.next_sibling = self.next_sibling
        if (
            self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling
        ):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
    def decompose(self) -> None:
        """Recursively destroys this `PageElement` and its children.

        The element will be removed from the tree and wiped out; so
        will everything beneath it.

        The behavior of a decomposed `PageElement` is undefined and you
        should never use one for anything, but if you need to *check*
        whether an element has been decomposed, you can use the
        `PageElement.decomposed` property.
        """
        self.extract()
        e: _AtMostOneElement = self
        next_up: _AtMostOneElement = None
        while e is not None:
            # Remember the successor first: clearing __dict__ discards
            # this element's next_element link.
            next_up = e.next_element
            e.__dict__.clear()
            if isinstance(e, Tag):
                # Give a Tag back an empty name and an empty contents list.
                e.name = ""
                e.contents = []
            e._decomposed = True
            e = next_up
    def _last_descendant(
        self, is_initialized: bool = True, accept_self: bool = True
    ) -> _AtMostOneElement:
        """Finds the last element beneath this object to be parsed.

        Special note to help you figure things out if your type
        checking is tripped up by the fact that this method returns
        _AtMostOneElement instead of PageElement: the only time
        this method returns None is if `accept_self` is False and the
        `PageElement` has no children--either it's a NavigableString
        or an empty Tag.

        :param is_initialized: Has `PageElement.setup` been called on
           this `PageElement` yet?

        :param accept_self: Is ``self`` an acceptable answer to the
           question?
        """
        if is_initialized and self.next_sibling is not None:
            # Shortcut: use the element that comes immediately before
            # the next sibling.
            last_child = self.next_sibling.previous_element
        else:
            # Otherwise walk down through the last item of each
            # non-empty Tag's contents.
            last_child = self
            while isinstance(last_child, Tag) and last_child.contents:
                last_child = last_child.contents[-1]
        if not accept_self and last_child is self:
            last_child = None
        return last_child
699 _lastRecursiveChild = _deprecated_alias(
700 "_lastRecursiveChild", "_last_descendant", "4.0.0"
701 )
703 def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
704 """Makes the given element(s) the immediate predecessor of this one.
706 All the elements will have the same `PageElement.parent` as
707 this one, and the given elements will occur immediately before
708 this one.
710 :param args: One or more PageElements.
712 :return The list of PageElements that were inserted.
713 """
714 parent = self.parent
715 if parent is None:
716 raise ValueError("Element has no parent, so 'before' has no meaning.")
717 if any(x is self for x in args):
718 raise ValueError("Can't insert an element before itself.")
719 results: List[PageElement] = []
720 for predecessor in args:
721 # Extract first so that the index won't be screwed up if they
722 # are siblings.
723 if isinstance(predecessor, PageElement):
724 predecessor.extract()
725 index = parent.index(self)
726 results.extend(parent.insert(index, predecessor))
728 return results
730 def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
731 """Makes the given element(s) the immediate successor of this one.
733 The elements will have the same `PageElement.parent` as this
734 one, and the given elements will occur immediately after this
735 one.
737 :param args: One or more PageElements.
739 :return The list of PageElements that were inserted.
740 """
741 # Do all error checking before modifying the tree.
742 parent = self.parent
743 if parent is None:
744 raise ValueError("Element has no parent, so 'after' has no meaning.")
745 if any(x is self for x in args):
746 raise ValueError("Can't insert an element after itself.")
748 offset = 0
749 results: List[PageElement] = []
750 for successor in args:
751 # Extract first so that the index won't be screwed up if they
752 # are siblings.
753 if isinstance(successor, PageElement):
754 successor.extract()
755 index = parent.index(self)
756 results.extend(parent.insert(index + 1 + offset, successor))
757 offset += 1
759 return results
761 def new_tag(
762 self,
763 name: str,
764 namespace: Optional[str] = None,
765 nsprefix: Optional[str] = None,
766 attrs: Optional[_RawAttributeValues] = None,
767 sourceline: Optional[int] = None,
768 sourcepos: Optional[int] = None,
769 string: Optional[str] = None,
770 **kwattrs: _RawAttributeValue,
771 ) -> Tag:
772 """Create a new Tag associated with the same BeautifulSoup object as this PageElement is."""
773 root = self._root_object
774 if root is None:
775 raise ValueError("Cannot call new_tag on a PageElement not contained in a BeautifulSoup object")
776 return root.new_tag(name, namespace, nsprefix, attrs, sourceline, sourcepos, string, **kwattrs)
778 def new_string(self, s: str, subclass: Optional[Type[NavigableString]] = None
779 ) -> NavigableString:
780 """Create a new NavigableString associated with the same BeautifulSoup object as this PageElement is."""
781 root = self._root_object
782 if root is None:
783 raise ValueError("Cannot call new_string on a PageElement not contained in a BeautifulSoup object")
784 return root.new_string(s, subclass)
786 @property
787 def _root_object(self) -> Optional[BeautifulSoup]:
788 """Find the BeautifulSoup object used to create this PageElement, assuming it's still attached."""
789 parent:Optional[Tag] = self.parent
790 while parent is not None and not parent._is_root:
791 parent = parent.parent
792 if parent is None:
793 return parent
794 return cast('BeautifulSoup', parent)
    @property
    def _is_root(self) -> bool:
        """No, this object is not the root of its parse tree; only a BeautifulSoup object can be that.

        `_root_object` checks this property on each ancestor to decide
        where to stop.
        """
        return False
    # No name or attrs + string -> string
    @overload
    def find_next(
        self,
        name: None = None,
        attrs: None = None,
        *,
        string: _StrainableString,
        **kwargs: _StrainableAttribute,
    ) -> _AtMostOneNavigableString:
        ...

    # No string -> tag
    @overload
    def find_next(
        self,
        name: _OptionalFindMethodName = None,
        attrs: Optional[_StrainableAttributes] = None,
        string: None=None,
        **kwargs: _StrainableAttribute,
    ) -> _AtMostOneTag:
        ...

    def find_next(
        self,
        name: _OptionalFindMethodName = None,
        attrs: Optional[_StrainableAttributes] = None,
        string: Optional[_StrainableString] = None,
        **kwargs: _StrainableAttribute,
    ) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
        """Find the first PageElement that matches the given criteria and
        appears later in the document than this PageElement.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: Additional filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: Additional filters on attribute values.
        """
        # Delegates to _find_one, handing it find_all_next as the
        # method that performs the search.
        return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
844 findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0")
    # No name or attrs + string -> strings
    @overload
    def find_all_next(
        self,
        name: None = None,
        attrs: None = None,
        *,
        string: _StrainableString,
        limit: Optional[int] = None,
        **kwargs: _StrainableAttribute,
    ) -> _SomeNavigableStrings:
        ...

    # No string -> tags
    @overload
    def find_all_next(
        self,
        name: _OptionalFindMethodName = None,
        attrs: Optional[_StrainableAttributes] = None,
        string: None = None,
        limit: Optional[int] = None,
        **kwargs: _StrainableAttribute,
    ) -> _SomeTags:
        ...

    def find_all_next(
        self,
        name: _OptionalFindMethodName = None,
        attrs: Optional[_StrainableAttributes] = None,
        string: Optional[_StrainableString] = None,
        limit: Optional[int] = None,
        **kwargs: _StrainableAttribute,
    ) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
        """Find all `PageElement` objects that match the given criteria and
        appear later in the document than this `PageElement`.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: Additional filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: Additional filters on attribute values.
        """
        # Delegates to _find_all, with next_elements as the source of
        # candidates.
        return self._find_all(
            name,
            attrs,
            string,
            limit,
            self.next_elements,
            **kwargs,
        )
900 findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0")
    # No name or attrs + string -> strings
    @overload
    def find_next_sibling(
        self,
        name: None = None,
        attrs: None = None,
        *,
        string: _StrainableString,
        **kwargs: _StrainableAttribute,
    ) -> _AtMostOneNavigableString:
        ...

    # No string -> tags
    @overload
    def find_next_sibling(
        self,
        name: _OptionalFindMethodName = None,
        attrs: Optional[_StrainableAttributes] = None,
        string: None = None,
        **kwargs: _StrainableAttribute,
    ) -> _AtMostOneTag:
        ...

    def find_next_sibling(
        self,
        name: _OptionalFindMethodName = None,
        attrs: Optional[_StrainableAttributes] = None,
        string: Optional[_StrainableString] = None,
        **kwargs: _StrainableAttribute,
    ) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
        """Find the closest sibling to this PageElement that matches the
        given criteria and appears later in the document.

        All find_* methods take a common set of arguments. See the
        online documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: Additional filters on attribute values.
        :param string: A filter for a `NavigableString` with specific text.
        :kwargs: Additional filters on attribute values.
        """
        # Delegates to _find_one, handing it find_next_siblings as the
        # method that performs the search.
        return self._find_one(self.find_next_siblings, name, attrs, string, **kwargs)
945 findNextSibling = _deprecated_function_alias(
946 "findNextSibling", "find_next_sibling", "4.0.0"
947 )
    # No name or attrs + string -> strings
    @overload
    def find_next_siblings(
        self,
        name: None = None,
        attrs: None = None,
        *,
        string: _StrainableString,
        limit: Optional[int] = None,
        **kwargs: _StrainableAttribute,
    ) -> _SomeNavigableStrings:
        ...

    # No string -> tags
    @overload
    def find_next_siblings(
        self,
        name: _OptionalFindMethodName = None,
        attrs: Optional[_StrainableAttributes] = None,
        string: None = None,
        limit: Optional[int] = None,
        **kwargs: _StrainableAttribute,
    ) -> _SomeTags:
        ...

    def find_next_siblings(
        self,
        name: _OptionalFindMethodName = None,
        attrs: Optional[_StrainableAttributes] = None,
        string: Optional[_StrainableString] = None,
        limit: Optional[int] = None,
        **kwargs: _StrainableAttribute,
    ) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
        """Find all siblings of this `PageElement` that match the given criteria
        and appear later in the document.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: Additional filters on attribute values.
        :param string: A filter for a `NavigableString` with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: Additional filters on attribute values.
        """
        # Delegates to _find_all, with next_siblings as the source of
        # candidates.
        return self._find_all(
            name,
            attrs,
            string,
            limit,
            self.next_siblings,
            **kwargs,
        )
# Two legacy names, both registered as deprecated aliases of find_next_siblings.
# NOTE(review): the last argument ("4.0.0" / "3.0.0") is presumably the
# version in which each name was deprecated -- confirm against bs4._deprecation.
findNextSiblings = _deprecated_function_alias(
    "findNextSiblings", "find_next_siblings", "4.0.0"
)
fetchNextSiblings = _deprecated_function_alias(
    "fetchNextSiblings", "find_next_siblings", "3.0.0"
)
# Called with only `string` (no name, no attrs): the match is a string.
@overload
def find_previous(
    self,
    name: None = None,
    attrs: None = None,
    *,
    string: _StrainableString,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
    ...

# Called without `string`: the match is a Tag.
@overload
def find_previous(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

def find_previous(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Search backwards from this `PageElement` and return the first
    `PageElement` that satisfies the given criteria, or None.

    This is the single-result form of `find_all_previous`: the search
    is handed to that method through `_find_one`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    plural_finder = self.find_all_previous
    return self._find_one(plural_finder, name, attrs, string, **kwargs)
# Legacy camelCase name, registered as a deprecated alias of find_previous.
# NOTE(review): "3.0.0" is presumably the version in which the name was
# deprecated -- confirm against bs4._deprecation.
findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0")
# Called with only `string` (no name, no attrs): the matches are strings.
@overload
def find_all_previous(
    self,
    name: None = None,
    attrs: None = None,
    *,
    string: _StrainableString,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

# Called without `string`: the matches are Tags.
@overload
def find_all_previous(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

def find_all_previous(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Search backwards from this `PageElement` and return every
    `PageElement` that satisfies the given criteria.

    The candidates are the elements produced by `previous_elements`;
    the filtering itself is done by `_find_all`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: Additional filters on attribute values.
    """
    earlier_elements = self.previous_elements
    return self._find_all(name, attrs, string, limit, earlier_elements, **kwargs)
# Two legacy names, both registered as deprecated aliases of find_all_previous.
# NOTE(review): the last argument ("4.0.0" / "3.0.0") is presumably the
# version in which each name was deprecated -- confirm against bs4._deprecation.
findAllPrevious = _deprecated_function_alias(
    "findAllPrevious", "find_all_previous", "4.0.0"
)
fetchAllPrevious = _deprecated_function_alias(
    "fetchAllPrevious", "find_all_previous", "3.0.0"
)
# Called with only `string` (no name, no attrs): the match is a string.
@overload
def find_previous_sibling(
    self,
    name: None = None,
    attrs: None = None,
    *,
    string: _StrainableString,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
    ...

# Called without `string`: the match is a Tag.
@overload
def find_previous_sibling(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

def find_previous_sibling(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Return the nearest earlier sibling of this `PageElement` that
    satisfies the given criteria, or None when there is none.

    This is the single-result form of `find_previous_siblings`: the
    search is handed to that method through `_find_one`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    plural_finder = self.find_previous_siblings
    return self._find_one(plural_finder, name, attrs, string, **kwargs)
# Legacy camelCase name, registered as a deprecated alias of
# find_previous_sibling.
# NOTE(review): "4.0.0" is presumably the version in which the name was
# deprecated -- confirm against bs4._deprecation.
findPreviousSibling = _deprecated_function_alias(
    "findPreviousSibling", "find_previous_sibling", "4.0.0"
)
# Called with only `string` (no name, no attrs): the matches are strings.
@overload
def find_previous_siblings(
    self,
    name: None = None,
    attrs: None = None,
    *,
    string: _StrainableString,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

# Called without `string`: the matches are Tags.
@overload
def find_previous_siblings(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

def find_previous_siblings(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Return every earlier sibling of this `PageElement` that
    satisfies the given criteria.

    The candidates are the elements produced by `previous_siblings`;
    the filtering itself is done by `_find_all`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: Additional filters on attribute values.
    """
    earlier_siblings = self.previous_siblings
    return self._find_all(name, attrs, string, limit, earlier_siblings, **kwargs)
# Two legacy names, both registered as deprecated aliases of
# find_previous_siblings.
# NOTE(review): the last argument ("4.0.0" / "3.0.0") is presumably the
# version in which each name was deprecated -- confirm against bs4._deprecation.
findPreviousSiblings = _deprecated_function_alias(
    "findPreviousSiblings", "find_previous_siblings", "4.0.0"
)
fetchPreviousSiblings = _deprecated_function_alias(
    "fetchPreviousSiblings", "find_previous_siblings", "3.0.0"
)
def find_parent(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    """Find the closest parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.
    :return: The first result of `find_parents` for these criteria, or
        None if it reports no match.
    """
    # NOTE: _find_one can't be used here: it passes a `string` argument
    # positionally, and find_parents does not accept one. So ask
    # find_parents directly for at most one result.
    matches = self.find_parents(name, attrs, 1, **kwargs)
    return matches[0] if matches else None
# Legacy camelCase name, registered as a deprecated alias of find_parent.
# NOTE(review): "4.0.0" is presumably the version in which the name was
# deprecated -- confirm against bs4._deprecation.
findParent = _deprecated_function_alias("findParent", "find_parent", "4.0.0")
def find_parents(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    """Return every parent of this `PageElement` that satisfies the
    given criteria.

    The candidates are the elements produced by `parents`. No string
    filter is offered here: `_find_all` is always called with
    ``string=None``.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :kwargs: Additional filters on attribute values.
    """
    ancestors = self.parents
    # Anything that has children is a Tag, so every ancestor -- and
    # therefore every result -- is a Tag. The cast only informs the
    # type checker of that.
    return cast(
        ResultSet[Tag],
        self._find_all(name, attrs, None, limit, ancestors, **kwargs),
    )
# Two legacy names, both registered as deprecated aliases of find_parents.
# NOTE(review): the last argument ("4.0.0" / "3.0.0") is presumably the
# version in which each name was deprecated -- confirm against bs4._deprecation.
findParents = _deprecated_function_alias("findParents", "find_parents", "4.0.0")
fetchParents = _deprecated_function_alias("fetchParents", "find_parents", "3.0.0")
@property
def next(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just after this one.

    Shorthand for ``self.next_element``, which is returned unchanged.
    """
    return self.next_element
@property
def previous(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just before this one.

    Shorthand for ``self.previous_element``, which is returned unchanged.
    """
    return self.previous_element
1293 # These methods do the real heavy lifting.
def _find_one(
    self,
    # TODO-TYPING: "There is no syntax to indicate optional or
    # keyword arguments; such function types are rarely used
    # as callback types." - So, not sure how to get more
    # specific here.
    method: Callable,
    name: _OptionalFindMethodName,
    attrs: Optional[_StrainableAttributes],
    string: Optional[_StrainableString],
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Run a multi-result finder with a limit of one and unwrap the result.

    :param method: The finder to call. It is invoked as
        ``method(name, attrs, string, 1, **kwargs)``, so it must take the
        limit as its fourth positional argument.
    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    :return: The first element of what ``method`` returned, or None if
        that result is empty.
    """
    matches: _QueryResults = method(name, attrs, string, 1, **kwargs)
    return matches[0] if matches else None
@property
def _warning_stack_level(self) -> int:
    """The ``stacklevel`` to pass to `warnings.warn` from inside a find* method.

    :return: The number of consecutive frames, counted from this
        property outwards, whose module ``__name__`` is "bs4.element".
    """
    # The find* methods call one another, so the distance between the
    # code issuing a warning and the caller's entry point into
    # bs4.element is not fixed. What is known is that every find*
    # method lives in bs4.element and that nothing in this module
    # triggers these warnings itself. (Test code does, but it lives in
    # bs4.tests.)
    #
    # So count frames outwards until the first one that belongs to a
    # different module; that count is the stacklevel.
    depth = 0
    for info in inspect.stack(context=0):
        frame = info.frame
        left_module = (
            frame is not None
            and frame.f_globals is not None
            and frame.f_globals.get('__name__', '') != "bs4.element"
        )
        if left_module:
            break
        depth += 1
    return depth
def _find_all(
    self,
    name: _OptionalFindMethodName,
    attrs: Optional[_StrainableAttributes],
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Collect the elements produced by ``generator`` that satisfy the
    search criteria.

    :param name: A filter on tag name, or a ready-made `ElementFilter`
        to use as the matcher in place of a new `SoupStrainer`.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Passed on to the matcher's ``find_all``. Any truthy
        value also disables the fast paths below.
    :param generator: The candidate elements, in search order.
    :kwargs: Additional filters on attribute values. A legacy ``text``
        key is used as ``string`` when ``string`` is None (with a
        `DeprecationWarning`); a ``_class`` key triggers an
        `AttributeResemblesVariableWarning`.
    :return: A `ResultSet` built by one of the fast paths, or whatever
        the matcher's ``find_all`` returns.
    """
    # Legacy spelling: 'text' stands in for 'string' if 'string' was
    # not given.
    if string is None and "text" in kwargs:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=self._warning_stack_level,
        )

    # '_class' is almost certainly a misspelling of 'class_'.
    if "_class" in kwargs:
        warnings.warn(
            AttributeResemblesVariableWarning.MESSAGE
            % {"original": "_class", "autocorrect": "class_"},
            AttributeResemblesVariableWarning,
            stacklevel=self._warning_stack_level,
        )

    from bs4.filter import ElementFilter

    if isinstance(name, ElementFilter):
        matcher = name
    else:
        matcher = SoupStrainer(name, attrs, string, **kwargs)

    # The fast paths apply only when the tag name is the sole criterion.
    name_is_sole_criterion = (
        string is None and not limit and not attrs and not kwargs
    )

    if name_is_sole_criterion and (name is True or name is None):
        # Fast path: every Tag matches.
        all_tags = [
            candidate for candidate in generator if isinstance(candidate, Tag)
        ]
        return ResultSet(matcher, all_tags)

    if name_is_sole_criterion and isinstance(name, str):
        # Fast path: every Tag with the given name matches.
        #
        # A name with exactly one colon carries a prefix. In a
        # namespace-aware document the local name is compared against
        # tag.name (and the prefix against tag.prefix); otherwise the
        # fully-qualified name is compared against tag.name.
        prefix, local_name = None, name
        if name.count(":") == 1:
            prefix, local_name = name.split(":", 1)
        named_tags = [
            candidate
            for candidate in generator
            if isinstance(candidate, Tag)
            and (
                candidate.name == name
                or (
                    candidate.name == local_name
                    and (prefix is None or candidate.prefix == prefix)
                )
            )
        ]
        return ResultSet(matcher, named_tags)

    # General case: let the matcher do the filtering.
    return matcher.find_all(generator, limit)
1403 # These generators can be used to navigate starting from both
1404 # NavigableStrings and Tags.
@property
def next_elements(self) -> Iterator[PageElement]:
    """Iterate over every PageElement that was parsed after this one,
    following the ``next_element`` links until one is None.
    """
    current = self.next_element
    while current is not None:
        # Read the link before yielding, so the walk continues
        # correctly even if the consumer rewires the yielded element.
        upcoming = current.next_element
        yield current
        current = upcoming
@property
def self_and_next_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all PageElements that were parsed after it.

    Built by passing `next_elements` through `_self_and`, which leaves
    this element out when it is hidden.
    """
    return self._self_and(self.next_elements)
@property
def next_siblings(self) -> Iterator[PageElement]:
    """Iterate over the siblings of this PageElement that were parsed
    later, following the ``next_sibling`` links until one is None.
    """
    current = self.next_sibling
    while current is not None:
        # Read the link before yielding, so the walk continues
        # correctly even if the consumer rewires the yielded element.
        upcoming = current.next_sibling
        yield current
        current = upcoming
@property
def self_and_next_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its siblings that were parsed later.

    Built by passing `next_siblings` through `_self_and`, which leaves
    this element out when it is hidden.
    """
    return self._self_and(self.next_siblings)
@property
def previous_elements(self) -> Iterator[PageElement]:
    """Iterate over every PageElement that was parsed before this one,
    following the ``previous_element`` links until one is None.

    :yield: A sequence of PageElements.
    """
    current = self.previous_element
    while current is not None:
        # Read the link before yielding, so the walk continues
        # correctly even if the consumer rewires the yielded element.
        upcoming = current.previous_element
        yield current
        current = upcoming
@property
def self_and_previous_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all elements that were parsed
    earlier.

    Built by passing `previous_elements` through `_self_and`, which
    leaves this element out when it is hidden.
    """
    return self._self_and(self.previous_elements)
@property
def previous_siblings(self) -> Iterator[PageElement]:
    """Iterate over the siblings of this PageElement that were parsed
    earlier, following the ``previous_sibling`` links until one is None.

    :yield: A sequence of PageElements.
    """
    current = self.previous_sibling
    while current is not None:
        # Read the link before yielding, so the walk continues
        # correctly even if the consumer rewires the yielded element.
        upcoming = current.previous_sibling
        yield current
        current = upcoming
@property
def self_and_previous_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its siblings that were parsed
    earlier.

    Built by passing `previous_siblings` through `_self_and`, which
    leaves this element out when it is hidden.
    """
    return self._self_and(self.previous_siblings)
@property
def parents(self) -> Iterator[Tag]:
    """Iterate over the ancestors of this PageElement, nearest first,
    following the ``parent`` links until one is None.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    ancestor = self.parent
    while ancestor is not None:
        # Read the link before yielding, so the walk continues
        # correctly even if the consumer rewires the yielded element.
        further_up = ancestor.parent
        yield ancestor
        ancestor = further_up
@property
def self_and_parents(self) -> Iterator[PageElement]:
    """This element, then all of its parents.

    Built by passing `parents` through `_self_and`, which leaves this
    element out when it is hidden.

    :yield: A sequence of PageElements, ending with a BeautifulSoup object.
    """
    return self._self_and(self.parents)
def _self_and(self, other_generator: Iterator[PageElement]) -> Iterator[PageElement]:
    """Yield this element, then everything yielded by another generator.

    :param other_generator: The elements to yield after this one.
    :yield: This element first, unless ``self.hidden`` is true, then
        each item of ``other_generator`` in order.
    """
    if not self.hidden:
        yield self
    # Delegate directly instead of re-yielding item by item.
    yield from other_generator
@property
def decomposed(self) -> bool:
    """Whether this PageElement has been decomposed.

    Reads the ``_decomposed`` attribute. A missing attribute, or any
    falsy value stored in it, is reported as False.
    """
    flag = getattr(self, "_decomposed", False)
    return flag if flag else False
# Legacy method form of the `next_elements` property; the decorator
# names the replacement.
# NOTE(review): "4.0.0" is presumably the deprecating version -- confirm
# against bs4._deprecation.
@_deprecated("next_elements", "4.0.0")
def nextGenerator(self) -> Iterator[PageElement]:
    ":meta private:"
    return self.next_elements
# Legacy method form of the `next_siblings` property; the decorator
# names the replacement.
# NOTE(review): "4.0.0" is presumably the deprecating version -- confirm
# against bs4._deprecation.
@_deprecated("next_siblings", "4.0.0")
def nextSiblingGenerator(self) -> Iterator[PageElement]:
    ":meta private:"
    return self.next_siblings
# Legacy method form of the `previous_elements` property; the decorator
# names the replacement.
# NOTE(review): "4.0.0" is presumably the deprecating version -- confirm
# against bs4._deprecation.
@_deprecated("previous_elements", "4.0.0")
def previousGenerator(self) -> Iterator[PageElement]:
    ":meta private:"
    return self.previous_elements
# Legacy method form of the `previous_siblings` property; the decorator
# names the replacement.
# NOTE(review): "4.0.0" is presumably the deprecating version -- confirm
# against bs4._deprecation.
@_deprecated("previous_siblings", "4.0.0")
def previousSiblingGenerator(self) -> Iterator[PageElement]:
    ":meta private:"
    return self.previous_siblings
# Legacy method form of the `parents` property; the decorator names the
# replacement.
# NOTE(review): "4.0.0" is presumably the deprecating version -- confirm
# against bs4._deprecation.
@_deprecated("parents", "4.0.0")
def parentGenerator(self) -> Iterator[PageElement]:
    ":meta private:"
    return self.parents
class NavigableString(str, PageElement):
    """A piece of document text: a Python string that is also a node
    in a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    #: Text emitted before the string's own content when it is
    #: formatted as part of a document, such as the '<!--' that
    #: opens an HTML comment.
    PREFIX: str = ""

    #: Text emitted after the string's own content when it is
    #: formatted as part of a document, such as the '-->' that
    #: closes an HTML comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When a NavigableString is unpickled, this method receives the
        string encoded in DEFAULT_OUTPUT_ENCODING. The encoding has to
        be handed to the superclass's __new__, which otherwise could
        not handle non-ASCII characters.

        :param value: The text, either as a `str` or as bytes encoded
            in DEFAULT_OUTPUT_ENCODING.
        """
        if isinstance(value, str):
            str_args = (value,)
        else:
            str_args = (value, DEFAULT_OUTPUT_ENCODING)
        instance = str.__new__(cls, *str_args)
        instance.hidden = False
        instance.setup()
        return instance

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """Return a string of the same class and content that is not
        connected to the parse tree.

        :param memo: Unused.
        :param recursive: This parameter is ignored; it's only defined
            so that NavigableString.__deepcopy__ implements the same
            signature as Tag.__deepcopy__.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Return the constructor arguments: this string's text as a plain `str`."""
        return (str(self),)

    # TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
    # is introduced in 3.8. This can be changed once 3.7 support is dropped.
    def __getitem__(self, key: Union[int|slice]) -> str:  # type:ignore
        """Index or slice like an ordinary string.

        :raise TypeError: If ``key`` is a string, which suggests the
            caller is treating this object as a `Tag` and looking up
            an attribute.
        """
        if not isinstance(key, str):
            return super().__getitem__(key)
        raise TypeError("string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(key.__class__.__name__))

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: This property always returns the `NavigableString` it was
           called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Format this string for inclusion in an HTML or XML document.

        The text is passed through ``format_string`` and the result is
        wrapped in `PREFIX` and `SUFFIX`.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self) -> None:
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Prevent NavigableString.name from ever being set.

        :raise AttributeError: Always.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield this string if it is of a wanted class, possibly stripped.

        This lets NavigableString offer get_text()-style conveniences,
        giving every PageElement the same text-extraction API.

        :param strip: If True, the string is stripped before being
            yielded.

        :param types: A single NavigableString subclass, or a
            collection of them. If this string's exact class isn't
            among them, nothing is yielded. None accepts any class.
            The default stands for `Tag.MAIN_CONTENT_STRING_TYPES`.

        :yield: A sequence that either contains this string, or is empty.
        """
        # The default set is kept on Tag because it names subclasses
        # of this class, which aren't defined until later in the file.
        wanted = Tag.MAIN_CONTENT_STRING_TYPES if types is self.default else types

        # Compare exact classes rather than using isinstance(): every
        # special string class subclasses NavigableString, and a
        # caller asking for NavigableString probably wants the generic
        # kind only.
        own_type = type(self)
        if wanted is not None:
            if isinstance(wanted, type):
                # A single class was requested.
                unwanted = own_type is not wanted
            else:
                # One of several classes was requested.
                unwanted = own_type not in wanted
            if unwanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        This is defined the way it is for compatibility with
        `Tag.strings`. See `Tag` for information on which strings are
        interesting in a given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()
class PreformattedString(NavigableString):
    """A `NavigableString` whose text is output verbatim rather than
    being rewritten by a formatter.

    This is an abstract class used for special kinds of strings such
    as comments (`Comment`) and CDATA blocks (`CData`).
    """

    PREFIX: str = ""
    SUFFIX: str = ""

    def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
        """Return this string wrapped in the subclass-specific prefix
        and suffix, ready for output.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters. The string will be passed into the
            `Formatter`, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # Called for its side effects only; the text itself is
            # output unchanged.
            self.format_string(self, formatter)
        opening, closing = self.PREFIX, self.SUFFIX
        return opening + self + closing
class CData(PreformattedString):
    """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_."""

    #: Delimiter written before the section's text on output.
    PREFIX: str = "<![CDATA["
    #: Delimiter written after the section's text on output.
    SUFFIX: str = "]]>"
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction."""

    #: Delimiter written before the instruction's text on output.
    PREFIX: str = "<?"
    #: Delimiter written after the instruction's text on output.
    SUFFIX: str = ">"
class XMLProcessingInstruction(ProcessingInstruction):
    """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_."""

    #: Delimiter written before the instruction's text on output.
    PREFIX: str = "<?"
    #: Delimiter written after the instruction's text on output. Unlike
    #: the parent class's ">", this one ends with "?>".
    SUFFIX: str = "?>"
class Comment(PreformattedString):
    """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_."""

    #: Delimiter written before the comment's text on output.
    PREFIX: str = "<!--"
    #: Delimiter written after the comment's text on output.
    SUFFIX: str = "-->"
class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_."""

    #: Delimiter written before the declaration's text on output.
    PREFIX: str = "<?"
    #: Delimiter written after the declaration's text on output.
    SUFFIX: str = "?>"
class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_."""

    #: Delimiter written before the declaration's text on output.
    PREFIX: str = "<!DOCTYPE "
    #: Delimiter written after the declaration's text on output; note
    #: the trailing newline.
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Build the `Doctype` for a given root element name, public ID
        and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        """
        declaration = cls._string_for_name_and_ids(name, pub_id, system_id)
        return Doctype(declaration)

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Build the text that a Doctype object is created from.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :return: ``name`` (or "" if it is falsy), followed by
            ``PUBLIC "<pub_id>"`` and, if given, ``"<system_id>"`` when
            there is a public ID; otherwise followed by
            ``SYSTEM "<system_id>"`` when there is only a system ID.
        """
        declaration = name or ""
        if pub_id is None:
            # No public ID: a system ID, if present, stands on its own.
            if system_id is not None:
                declaration += ' SYSTEM "%s"' % system_id
            return declaration

        declaration += ' PUBLIC "%s"' % pub_id
        if system_id is not None:
            declaration += ' "%s"' % system_id
        return declaration
class Stylesheet(NavigableString):
    """A `NavigableString` representing the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    Used to distinguish embedded stylesheets from textual content.
    """
    # The class adds no behaviour of its own; only its type differs
    # from NavigableString.
class Script(NavigableString):
    """A `NavigableString` representing the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    Used to distinguish executable code from textual content.
    """
    # The class adds no behaviour of its own; only its type differs
    # from NavigableString.
class TemplateString(NavigableString):
    """A `NavigableString` representing a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    """
    # The class adds no behaviour of its own; only its type differs
    # from NavigableString.
class RubyTextString(NavigableString):
    """A `NavigableString` representing the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    Can be used to distinguish such strings from the strings they're
    annotating.
    """
    # The class adds no behaviour of its own; only its type differs
    # from NavigableString.
class RubyParenthesisString(NavigableString):
    """A `NavigableString` representing the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.
    """
    # The class adds no behaviour of its own; only its type differs
    # from NavigableString.
1835class Tag(PageElement):
1836 """An HTML or XML tag that is part of a parse tree, along with its
1837 attributes, contents, and relationships to other parts of the tree.
1839 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
1840 create a `Tag` object representing the ``<b>`` tag. You can
1841 instantiate `Tag` objects directly, but it's not necessary unless
1842 you're adding entirely new markup to a parsed document. Most of
1843 the constructor arguments are intended for use by the `TreeBuilder`
1844 that's parsing a document.
1846 :param parser: A `BeautifulSoup` object representing the parse tree this
1847 `Tag` will be part of.
1848 :param builder: The `TreeBuilder` being used to build the tree.
1849 :param name: The name of the tag.
1850 :param namespace: The URI of this tag's XML namespace, if any.
1851 :param prefix: The prefix for this tag's XML namespace, if any.
1852 :param attrs: A dictionary of attribute values.
1853 :param parent: The `Tag` to use as the parent of this `Tag`. May be
1854 the `BeautifulSoup` object itself.
1855 :param previous: The `PageElement` that was parsed immediately before
1856 parsing this tag.
1857 :param is_xml: If True, this is an XML tag. Otherwise, this is an
1858 HTML tag.
1859 :param sourceline: The line number where this tag was found in its
1860 source document.
1861 :param sourcepos: The character position within ``sourceline`` where this
1862 tag was found.
1863 :param can_be_empty_element: If True, this tag should be
1864 represented as <tag/>. If False, this tag should be represented
1865 as <tag></tag>.
1866 :param cdata_list_attributes: A dictionary of attributes whose values should
1867 be parsed as lists of strings if they ever show up on this tag.
1868 :param preserve_whitespace_tags: Names of tags whose contents
1869 should have their whitespace preserved if they are encountered inside
1870 this tag.
1871 :param interesting_string_types: When iterating over this tag's
1872 string contents in methods like `Tag.strings` or
1873 `PageElement.get_text`, these are the types of strings that are
1874 interesting enough to be considered. By default,
1875 `NavigableString` (normal strings) and `CData` (CDATA
1876 sections) are the only interesting string subtypes.
1877 :param namespaces: A dictionary mapping currently active
1878 namespace prefixes to URIs, as of the point in the parsing process when
1879 this tag was encountered. This can be used later to
1880 construct CSS selectors.
1882 """
def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Initialize a new `Tag`.

    See the class docstring for the meaning of the parameters.

    When a ``builder`` is given, it decides which attribute container
    classes are used and it supplies ``can_be_empty_element``,
    ``cdata_list_attributes``, ``preserve_whitespace_tags`` and
    ``interesting_string_types``; the arguments of the same name are
    only used when ``builder`` is None.

    :raise ValueError: If ``name`` is None.
    """
    # We don't actually store the parser object: that lets extracted
    # chunks be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # The position is stored exactly as given. This used to be an
    # if/else on builder.store_line_numbers whose two branches performed
    # the same assignments, so the condition never had any effect.
    self.sourceline = sourceline
    self.sourcepos = sourcepos

    # Choose the container classes for attributes and for multi-valued
    # attribute values.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        attr_dict_class = XMLAttributeDict if is_xml else HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    elif builder is not None and builder.cdata_list_attributes:
        self.attrs = builder._replace_cdata_list_attribute_values(self.name, attrs)
    else:
        self.attrs = attr_dict_class()
        # Make sure that the values of any multi-valued
        # attributes (e.g. when a Tag is copied) are stored in
        # new lists.
        for key, value in attrs.items():
            if isinstance(value, list):
                value = value.__class__(value)
            self.attrs[key] = value

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    self.known_xml = builder.is_xml if builder else is_xml
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy
        # of some other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
        return

    # Set up any substitutions for this tag, such as the charset in a
    # META tag. (attribute_value_list_class was already taken from the
    # builder above, so it is not assigned a second time here.)
    builder.set_up_substitutions(self)

    # Ask the TreeBuilder whether this tag might be an empty-element tag.
    self.can_be_empty_element = builder.can_be_empty_element(name)

    # Keep a reference to the builder's whole table of attributes that
    # might need to be treated as lists. For performance reasons we
    # store the data structure rather than asking the question of every
    # tag, because we almost never need to check this.
    self.cdata_list_attributes = builder.cdata_list_attributes

    # Keep track of the names that might cause this tag to be treated
    # as a whitespace-preserved tag.
    self.preserve_whitespace_tags = builder.preserve_whitespace_tags

    if self.name in builder.string_containers:
        # This sort of tag uses a special string container subclass for
        # most of its strings. We need to be able to look up the proper
        # container subclass.
        self.interesting_string_types = {builder.string_containers[self.name]}
    else:
        self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES
# Declared types of the instance attributes that __init__ assigns.
parser_class: Optional[type[BeautifulSoup]]
name: str
namespace: Optional[str]
prefix: Optional[str]
attrs: _AttributeValues
sourceline: Optional[int]
sourcepos: Optional[int]
known_xml: Optional[bool]
contents: List[PageElement]
hidden: bool
interesting_string_types: Optional[Set[Type[NavigableString]]]

# These three come from the TreeBuilder when one is given to __init__,
# and from the constructor arguments otherwise.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]

# Deprecated spelling of ``parser_class``.
#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")
def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """Copy this `Tag` into a new `Tag` that is not part of any parse tree.

    :param memo: The memo dictionary of the deepcopy protocol; it is
        handed on unchanged to each descendant's ``__deepcopy__``.
    :param recursive: If False, only this tag is copied (via
        `Tag.copy_self`) and the copy has no contents.
    :return: The copy.
    """
    clone = self.copy_self()
    if not recursive:
        return clone

    # Rebuild the subtree from the event stream instead of recursing.
    # The last entry of open_clones is the copy that receives the next
    # copied descendant.
    open_clones: List[Tag] = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # This tag is finished; later elements belong to its parent.
            open_clones.pop()
            continue
        copied = element.__deepcopy__(memo, recursive=False)
        open_clones[-1].append(copied)
        if event is Tag.START_ELEMENT_EVENT:
            # The copy of an opened tag receives the copies of its
            # children until its END event arrives.
            open_clones.append(cast(Tag, copied))
    return clone
def copy_self(self) -> Self:
    """Build a childless, detached `Tag` that otherwise matches this one.

    This is the first step in the deepcopy process, but it can also be
    called on its own to copy a `Tag` without copying its contents.

    :return: A new object of the same class as this one.
    """
    settings = {
        "is_xml": self._is_xml,
        "sourceline": self.sourceline,
        "sourcepos": self.sourcepos,
        "can_be_empty_element": self.can_be_empty_element,
        "cdata_list_attributes": self.cdata_list_attributes,
        "preserve_whitespace_tags": self.preserve_whitespace_tags,
        "interesting_string_types": self.interesting_string_types,
        "namespaces": self._namespaces,
    }
    # No parser and no builder: the copy gets its settings from the
    # keyword arguments instead.
    clone = type(self)(
        None, None, self.name, self.namespace, self.prefix, self.attrs, **settings
    )
    # Carry these two over explicitly after construction.
    clone.can_be_empty_element = self.can_be_empty_element
    clone.hidden = self.hidden
    return clone
@property
def is_empty_element(self) -> bool:
    """Whether this tag is an empty-element (self-closing) tag.

    A tag that has contents is never an empty-element tag.

    A tag without contents is an empty-element tag only if its
    ``can_be_empty_element`` flag is exactly True. That flag normally
    comes from the `TreeBuilder` that created the tag: a builder with a
    designated list of empty-element tags (the usual case for HTML)
    allows it only for the names on that list, while a builder without
    such a list (the usual case for XML) allows it for any tag.
    """
    has_no_children = len(self.contents) == 0
    return has_no_children and self.can_be_empty_element is True
@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    ": :meta private:"
    # Deprecated spelling; the answer comes from is_empty_element.
    empty = self.is_empty_element
    return empty
@property
def string(self) -> Optional[str]:
    """The single string inside this `Tag`, if there is exactly one.

    :return: If the only child is a `NavigableString`, that string. If
        the only child is a `Tag`, that child's own `Tag.string`
        (applied recursively). In every other case, including no
        children or more than one child, ``None``.

    If this property is unexpectedly ``None``, the `Tag` most likely
    has more than one thing inside it.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    if isinstance(only_child, NavigableString):
        return only_child
    if isinstance(only_child, Tag):
        return only_child.string
    return None

@string.setter
def string(self, string: str) -> None:
    """Replace the `Tag.contents` of this `Tag` with a single string.

    A plain ``str`` is wrapped in a `NavigableString`; a value that is
    already a `NavigableString` is re-created with its own class.
    """
    self.clear()
    if isinstance(string, NavigableString):
        container_class = string.__class__
    else:
        container_class = NavigableString
    self.append(container_class(string))
# The string classes treated as a tag's main content when the tag has no
# interesting_string_types of its own.
#: :meta private:
MAIN_CONTENT_STRING_TYPES = {
    NavigableString,
    CData,
}
def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Yield the descendant strings whose class is of interest.

    :param strip: If True, each string is stripped before being
        yielded, and strings that are empty after stripping are skipped.

    :param types: Either a single NavigableString subclass or a
        collection of them. A string is yielded only if its exact class
        is that class, or is in that collection. Passing None accepts
        every string. By default the classes in
        self.interesting_string_types are used, or
        self.MAIN_CONTENT_STRING_TYPES if that attribute is None.
    """
    if types is self.default:
        types = self.interesting_string_types
        if types is None:
            types = self.MAIN_CONTENT_STRING_TYPES

    # A lone class is matched by identity, a collection by membership.
    lone_class = isinstance(types, type)

    for descendant in self.descendants:
        if not isinstance(descendant, NavigableString):
            continue
        descendant_type = type(descendant)
        if lone_class:
            wanted = descendant_type is types
        else:
            wanted = types is None or descendant_type in types
        if not wanted:
            # We're not interested in strings of this type.
            continue
        if not strip:
            yield descendant
            continue
        stripped = descendant.strip()
        if len(stripped) == 0:
            continue
        yield stripped

strings = property(_all_strings)
def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once.

    :param position: The numeric position that should be occupied
        in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    for new_child in new_children:
        newly_inserted = self._insert(position, new_child)
        inserted.extend(newly_inserted)
        # One argument can expand into several elements (a BeautifulSoup
        # object contributes all of its children), so advance past
        # everything it added. Advancing by a fixed 1 would place the
        # next argument in the middle of those elements.
        position += len(newly_inserted)
    return inserted
def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element at ``position`` and rewire the tree links.

    :param position: Desired index in ``self.contents``; values past the
        end are clamped to the end.
    :param new_child: The element to insert. A plain ``str`` becomes a
        `NavigableString`. A BeautifulSoup object is never inserted
        itself; its children are inserted in its place.
    :return: The list of elements that were inserted.
    :raise ValueError: If ``new_child`` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup

    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one
        # BeautifulSoup object contains another. Insert the
        # BeautifulSoup's children and return them.
        return self.insert(position, *list(new_child.contents))

    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        if new_child.parent is self:
            # The element is already one of our children, so this is a
            # move within self.contents.
            current_index = self.index(new_child)
            if current_index == position:
                # Already in the requested place: nothing to do.
                return [new_child]
            if current_index < position:
                # Extracting the element below shifts the target index
                # down by one.
                position -= 1
        new_child.extract()

    # Link the new child to its parent and to whatever precedes it.
    new_child.parent = self
    if position == 0:
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        preceding = self.contents[position - 1]
        new_child.previous_sibling = preceding
        preceding.next_sibling = new_child
        new_child.previous_element = preceding._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    # With accept_self=True the result cannot be None: in the worst case
    # it is new_child itself. The cast records that for the type checker.
    tail = cast(
        PageElement,
        new_child._last_descendant(is_initialized=False, accept_self=True),
    )

    # Link the end of the new subtree to whatever follows it.
    if position >= len(self.contents):
        new_child.next_sibling = None
        # Walk up the ancestors until one has a next sibling; that
        # sibling comes next in the document. If none has one, the new
        # subtree ends the document and `following` stays None.
        ancestor: Optional[Tag] = self
        following = None
        while following is None and ancestor is not None:
            following = ancestor.next_sibling
            ancestor = ancestor.parent
        tail.next_element = following
    else:
        following_child = self.contents[position]
        new_child.next_sibling = following_child
        if following_child is not None:
            following_child.previous_sibling = new_child
        tail.next_element = following_child

    if tail.next_element is not None:
        tail.next_element.previous_element = tail
    self.contents.insert(position, new_child)

    return [new_child]
def unwrap(self) -> Self:
    """Replace this `PageElement` with its contents.

    The element is extracted from its parent, and its children are
    inserted into the parent at the index the element used to occupy.

    :return: This object, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = parent.index(self)
    self.extract(_self_index=slot)
    # Insert from last to first at the same index, so the children end
    # up in their original order. The slice is a copy, since each insert
    # may remove the child from self.contents.
    for child in self.contents[::-1]:
        parent.insert(slot, child)
    return self

replace_with_children = unwrap
@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    ": :meta private:"
    # Deprecated spelling; unwrap does the work.
    unwrapped = self.unwrap()
    return unwrapped
def append(self, tag: _InsertableElement) -> PageElement|List[PageElement]:
    """Add the given `PageElement` to the end of this `Tag`'s contents.

    :param tag: A PageElement. If this is another BeautifulSoup
        object, all of its contents will be inserted into this
        `Tag`, since one BeautifulSoup object can't contain another
        one.

    :return: The object that was just appended, or (if `tag` was a
        BeautifulSoup object) the list of all such objects.
    """
    inserted = self.insert(len(self.contents), tag)
    # A BeautifulSoup object is recognized by its tag name, because the
    # BeautifulSoup class can't be referenced from this module.
    is_document = isinstance(tag, Tag) and tag.name == "[document]"
    return inserted if is_document else inserted[0]
def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`.

    :return: The list of PageElements that were appended.
    :raise TypeError: If ``tags`` is neither a `Tag`, a single
        element or string, nor an iterable.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        # Copy the list: appending a child here removes it from
        # tags.contents.
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        tag_list = list(tags)
    else:
        # Without this branch tag_list would be unbound and the loop
        # below would fail with an UnboundLocalError.
        raise TypeError(
            "Tag.extend expects a Tag or an iterable of elements, not %s."
            % type(tags).__name__
        )

    results: List[PageElement] = []
    for tag in tag_list:
        appended = self.append(tag)
        if isinstance(appended, list):
            # This can happen if you pass in a mixture of Tag and
            # BeautifulSoup objects.
            results.extend(appended)
        else:
            results.append(appended)

    return results
def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    :param decompose: If this is True, `PageElement.decompose` (a
        more destructive method) is called on each child instead of
        `PageElement.extract`.
    """
    # Work on a copy: removing a child changes self.contents.
    children = self.contents[:]
    if decompose:
        for child in children:
            child.decompose()
    else:
        for child in children:
            child.extract()
def smooth(self) -> None:
    """Consolidate runs of adjacent strings among this `Tag`'s children.

    Child tags are smoothed recursively. `PreformattedString` objects
    are never merged with their neighbors.

    If you perform a lot of operations that modify the tree,
    calling this method afterwards can make pretty-printed output
    look more natural.
    """

    def mergeable(node: PageElement) -> bool:
        # Only ordinary strings take part in merging.
        return isinstance(node, NavigableString) and not isinstance(
            node, PreformattedString
        )

    # Record the index of the first member of every adjacent pair that
    # needs merging. This avoids copying self.contents, since in most
    # cases very few strings are affected.
    merge_starts = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last child has no right-hand neighbor to merge with.
            continue
        neighbor = self.contents[position + 1]
        if mergeable(current) and mergeable(neighbor):
            merge_starts.append(position)

    # Merge from the end backwards, so that removing items from
    # .contents doesn't shift the positions still to be processed.
    for position in reversed(merge_starts):
        left = cast(NavigableString, self.contents[position])
        right = cast(NavigableString, self.contents[position + 1])
        right.extract()
        merged = NavigableString(left + right)
        left.replace_with(merged)
def index(self, element: PageElement) -> int:
    """Find the index of a child of this `Tag` (by identity, not value).

    Comparing by identity avoids issues when a `Tag` contains two
    children that compare equal.

    :param element: Look for this `PageElement` in this object's contents.
    :return: The position of ``element`` in ``self.contents``.
    :raise ValueError: If ``element`` is not one of the children.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None,
    )
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position
def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Look up an attribute of this tag, with a fallback value.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: The attribute's value, or ``default``.
    """
    attributes = self.attrs
    return attributes.get(key, default)
def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """The same as get(), but always returns a (possibly empty) list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: A list of strings, usually empty or containing only a single
        value. A value that is already a list is returned as-is.
    """
    value = self.get(key, default)
    if value is None:
        return self.attribute_value_list_class()
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        # Type-checker hint only: cast() does nothing at runtime.
        value = cast(str, value)
    return self.attribute_value_list_class([value])
def has_attr(self, key: str) -> bool:
    """Tell whether this `Tag` has an attribute named ``key``."""
    attributes = self.attrs
    return key in attributes
def __hash__(self) -> int:
    """Hash a `Tag` by the hash of its string rendering."""
    rendered = str(self)
    return hash(rendered)
def __getitem__(self, key: str) -> _AttributeValue:
    """Return the value of the ``key`` attribute for ``tag[key]``.

    A missing attribute raises whatever exception ``self.attrs``
    raises for a missing key.
    """
    attributes = self.attrs
    return attributes[key]
def __iter__(self) -> Iterator[PageElement]:
    """Iterate over the direct children of this `Tag`."""
    children = self.contents
    return iter(children)
def __len__(self) -> int:
    """Return the number of direct children of this `Tag`."""
    children = self.contents
    return len(children)
def __contains__(self, x: Any) -> bool:
    """Tell whether ``x`` is among the direct children of this `Tag`."""
    children = self.contents
    return x in children
def __bool__(self) -> bool:
    """A `Tag` is always truthy, even when it has no contents.

    Without this, truth testing would fall back to __len__, and a tag
    with no children would be falsy.
    """
    return True
def __setitem__(self, key: str, value: _AttributeValue) -> None:
    """Set the ``key`` attribute of the tag for ``tag[key] = value``."""
    attributes = self.attrs
    attributes[key] = value
def __delitem__(self, key: str) -> None:
    """Remove the ``key`` attribute from the tag for ``del tag[key]``.

    Deleting an attribute the tag doesn't have is silently ignored.
    """
    attributes = self.attrs
    attributes.pop(key, None)
# Tag.__call__ is effectively PageElement.find_all; see find_all for
# notes on these overloads.

@overload
def __call__(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    *,
    string: _StrainableString,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

@overload
def __call__(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def __call__(
    self,
    name: None,
    attrs: _StrainableAttributes,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def __call__(
    self,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

def __call__(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Calling a Tag like a function is the same as calling its
    find_all() method.

    Eg. tag('a') returns a list of all the A tags found within this tag.
    """
    # `recursive` only selects which generator of candidates is searched.
    candidates = self._generator_for_recursive(recursive)
    return self._find_all(name, attrs, string, limit, candidates, **kwargs)
def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Calling tag.subtag is the same as calling tag.find(name="subtag")

    :param subtag: The attribute name that normal lookup did not find.
    :return: The result of ``self.find`` for that name.
    :raise AttributeError: For names starting with a double underscore,
        and for ``contents``.
    """
    if len(subtag) > 3 and subtag.endswith("Tag"):
        # BS3 compatibility: soup.aTag means soup.find("a").
        tag_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % dict(name=tag_name),
            DeprecationWarning,
            stacklevel=2,
        )
        return self.find(tag_name)

    # We special case contents to avoid recursion.
    if subtag.startswith("__") or subtag == "contents":
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, subtag)
        )
    return self.find(subtag)
def __eq__(self, other: Any) -> bool:
    """Returns true iff this Tag has the same name, the same attributes,
    and the same contents (recursively) as `other`."""
    if self is other:
        return True
    if not isinstance(other, Tag):
        return False

    for required in ("name", "attrs", "contents"):
        if not hasattr(other, required):
            return False
    if self.name != other.name:
        return False
    if self.attrs != other.attrs:
        return False
    if len(self) != len(other):
        return False

    # Both tags have the same number of children here, so a pairwise
    # walk visits every child of both.
    for mine, theirs in zip(self.contents, other.contents):
        if mine != theirs:
            return False
    return True
def __ne__(self, other: Any) -> bool:
    """Returns true iff this Tag is not identical to `other`,
    as defined in __eq__."""
    same = self == other
    return not same
def __repr__(self) -> str:
    """Render this `Tag` as a string, using `Tag.decode` with its defaults."""
    markup = self.decode()
    return markup

# str() and the legacy __unicode__ hook give the same rendering.
__str__ = __unicode__ = __repr__
def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    :param encoding: The encoding to use when converting to
        a bytestring. It is also passed to `Tag.decode`, so it may
        affect the text of the document, specifically any encoding
        declarations within the document.
    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        :py:meth:`str.encode` and its value should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    :return: The encoded markup.
    """
    # Render to text first, then encode the text.
    markup = self.decode(indent_level, encoding, formatter)
    return markup.encode(encoding, errors)
def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing. ``None`` turns pretty-printing off; ``True``
        is treated as level 0.
    :param eventual_encoding: The encoding you intend to use when
        converting the string to a bytestring. decode() is *not*
        responsible for performing that encoding. This information
        is needed so that a real encoding can be substituted in if
        the document contains an encoding declaration (e.g. in a
        <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
        naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :return: The rendered markup.
    """
    # Turn a formatter name into a Formatter object once, so the lookup
    # doesn't happen again for every element.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    if indent_level is True:
        indent_level = 0

    rendered = []

    # The tag that put us into string literal mode, if any. Until that
    # tag is closed, its descendants are emitted verbatim rather than
    # pretty-printed. The mode starts right after the tag opens and ends
    # right before it closes, so whitespace can still be added before
    # the opening tag and after the closing tag.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        # Step 1: render the element itself.
        if event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            if indent_level is not None:
                indent_level -= 1
        elif event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Step 2: decide on the 'prettiness', i.e. extra whitespace
        # before and/or after this piece. By default whitespace is added
        # on both sides outside string literal mode and on neither side
        # inside it. Tags such as <pre> and <script> need this, because
        # added whitespace would change the meaning of their content.
        indent_before = indent_after = not string_literal_tag

        # The exceptions are the tags that switch the mode on or off.
        if event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # Closing the tag that started string literal mode:
            # whitespace after it, but not before.
            indent_before, indent_after = False, True
            string_literal_tag = None
        elif (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # Opening a tag that must not be pretty-printed: whitespace
            # before it, but not after. String literal mode lasts until
            # this tag is closed.
            indent_before, indent_after = True, False
            string_literal_tag = element

        # Step 3: apply the whitespace, if pretty-printing at all.
        if indent_level is not None:
            if indent_before or indent_after:
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        rendered.append(piece)
    return "".join(rendered)
class _TreeTraversalEvent:
    """Marker type for the events produced while walking a parse tree.

    Instances carry no data; they are told apart by identity.

    :meta private:
    """

# The distinct event markers yielded by _event_stream.
START_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
        the tree. Defaults to ``self.self_and_descendants``.
    :return: ``(event, element)`` pairs. A tag that is not an
        empty-element tag produces a START event and, once all of its
        descendants have been seen, a matching END event. An
        empty-element tag produces a single EMPTY event, and anything
        that is not a `Tag` produces a STRING event.
    """
    # Tags that have been opened but not yet closed, innermost last.
    open_tags: List[Tag] = []

    iterator = iterator or self.self_and_descendants

    for element in iterator:
        # If the parent of the element we're about to yield is not
        # the tag currently on the stack, it means that the tag on
        # the stack closed before this element appeared.
        #
        # This must be an identity test. `!=` would run Tag.__eq__,
        # which compares the two tags structurally (names, attributes
        # and, recursively, contents); that is needlessly expensive and
        # would treat two distinct tags that merely look alike as the
        # same tag.
        while open_tags and element.parent is not open_tags[-1]:
            yield Tag.END_ELEMENT_EVENT, open_tags.pop()

        if isinstance(element, Tag):
            if element.is_empty_element:
                yield Tag.EMPTY_ELEMENT_EVENT, element
            else:
                yield Tag.START_ELEMENT_EVENT, element
                open_tags.append(element)
        else:
            yield Tag.STRING_ELEMENT_EVENT, element

    # Close whatever is still open, innermost first.
    while open_tags:
        yield Tag.END_ELEMENT_EVENT, open_tags.pop()
def _indent_string(
    self,
    s: str,
    indent_level: int,
    formatter: Formatter,
    indent_before: bool,
    indent_after: bool,
) -> str:
    """Attach pretty-printing whitespace to one or both ends of a string.

    :param s: The string to amend with whitespace.
    :param indent_level: The indentation level; the formatter's
        indent unit is repeated this many times in front of the string.
    :param formatter: The `Formatter` whose ``indent`` attribute
        supplies one level of indentation.
    :param indent_before: Whether or not to add indentation
        before the string.
    :param indent_after: Whether or not to add a newline
        after the string.
    :return: The string with the requested whitespace attached.
    """
    # A falsy indent level (such as 0) contributes no leading whitespace.
    leading = formatter.indent * indent_level if indent_before and indent_level else ""
    trailing = "\n" if indent_after else ""
    return leading + s + trailing
def _format_tag(
    self, eventual_encoding: str, formatter: Formatter, opening: bool
) -> str:
    """Render the markup for this tag's opening or closing half.

    :param eventual_encoding: The encoding the output is destined
        for. When it is not None, it is substituted into any
        attribute value that is an
        `AttributeValueWithCharsetSubstitution`.
    :param formatter: The `Formatter` that chooses which attributes
        to output, processes and quotes their values, and supplies
        the void-element closing prefix.
    :param opening: True to render the opening tag (attributes
        included); False to render the closing tag.
    :return: The tag markup, or the empty string if this tag is
        hidden.
    """
    if self.hidden:
        # A hidden tag is invisible, although its contents
        # are visible.
        return ""

    # A closing tag carries a slash right after the < character.
    closing_slash = "" if opening else "/"

    # An optional namespace prefix comes before the name.
    prefix = self.prefix + ":" if self.prefix else ""

    # Attributes are rendered only on an opening tag.
    attribute_string = ""
    if opening:
        rendered = []
        for key, val in formatter.attributes(self):
            if val is None:
                # No value at all: output the bare attribute name.
                rendered.append(key)
                continue
            if isinstance(val, (list, tuple)):
                # Several values: join them with spaces.
                val = " ".join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.substitute_encoding(eventual_encoding)

            text = formatter.attribute_value(val)
            rendered.append(str(key) + "=" + formatter.quoted_attribute_value(text))
        if rendered:
            attribute_string = " " + " ".join(rendered)

    # An optional closing slash (for a void element in an
    # XML document) goes right before the > character.
    void_element_closing_slash = ""
    if self.is_empty_element:
        void_element_closing_slash = formatter.void_element_close_prefix or ""

    # Put it all together.
    return "".join(
        (
            "<",
            closing_slash,
            prefix,
            self.name,
            attribute_string,
            void_element_closing_slash,
            ">",
        )
    )
def _should_pretty_print(self, indent_level: int = 1) -> bool:
    """Decide whether this tag should be pretty-printed.

    Most tags should, but some (such as <pre> in HTML
    documents) should not.

    :param indent_level: The current indentation level. Although it
        is annotated as an int, None is also handled and means that
        nothing is being pretty-printed.
    :return: False if ``indent_level`` is None or this tag's name is
        listed in ``preserve_whitespace_tags``; True otherwise.
    """
    if indent_level is None:
        return False
    preserved = self.preserve_whitespace_tags
    # With no whitespace-preserving tags configured, every tag qualifies.
    return not preserved or self.name not in preserved
@overload
def prettify(
    self,
    encoding: None = None,
    formatter: _FormatterOrName = "minimal",
) -> str:
    ...

@overload
def prettify(
    self,
    encoding: _Encoding,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    ...

def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Pretty-print this `Tag` as a string or bytestring.

    Rendering is delegated to `decode` or `encode`, starting at
    indent level 0.

    :param encoding: The encoding of the bytestring, or None if you
        want Unicode.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string (if no ``encoding`` is provided) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)
def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render the contents of this tag as a Unicode string.

    This calls `decode` with `descendants` as the iterator.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) None means no pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is *not*
        responsible for performing that encoding. This information
        is needed so that a real encoding can be substituted in if
        the document contains an encoding declaration (e.g. in a
        <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    :return: Whatever `decode` produces for this tag's descendants.
    """
    contents_only = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=contents_only
    )
def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render the contents of this PageElement as a bytestring.

    The contents are first rendered with `decode_contents` (which is
    told about ``encoding``) and the result is then encoded.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) None means no pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: Either a `Formatter` object, or a string naming
        one of the standard formatters.
    :return: The encoded rendering.
    """
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)
@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated method for BS3 compatibility.

    Forwards to `encode_contents`. ``indentLevel`` is only honored
    when ``prettyPrint`` is true; otherwise no indent level is
    passed along.

    :meta private:
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_indent, encoding=encoding)
3027 # Soup methods
3028 #
3030 # People who call these methods in a type-safe environment
3031 # basically want to know whether the call is going to return
3032 # NavigableStrings or Tags. It's always one or the other, never
3033 # both, but spelling it out requires a number of overloads for
3034 # each method.
3035 #
3036 # If I had it to do over again I'd design this API differently (it
3037 # would look more like ElementFilter), but that's life.
3038 #
3039 # The overloads all look for a clue in the input which restricts
3040 # the method to returning either only strings or only tags. Only
3041 # the most common cases are covered.
# e.g. find(string="foo")
#  -> string information but no tag information
#  -> string
@overload
def find(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    *,
    string: _StrainableString,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
    ...

# e.g. find() -> default behavior -> tag
#      find(attr="value") -> only tags have attrs -> tag
@overload
def find(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

# e.g. find(attrs=dict(attr="value"))
#  -> only tags have attrs
#  -> tag
@overload
def find(
    self,
    name: None,
    attrs: _StrainableAttributes,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

# e.g. find(name="a") -> only tags have names -> tag
#
# The confusing and controversial case of find(name="a", string="foo")
# also hits this overload.
@overload
def find(
    self,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

# Some lesser-used cases are not covered by the overloads. Those
# cases will hit this method directly and return a very general
# type which will need to be cast after the call.
def find(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
    """Look in the children of this PageElement and return the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    :return: The first match, or None if nothing matched.
    """
    candidates = self._generator_for_recursive(recursive)
    # A limit of 1 stops the search at the first hit.
    matches = self._find_all(name, attrs, string, 1, candidates, **kwargs)
    return matches[0] if matches else None
# Deprecated alias kept for old code; built by the
# _deprecated_function_alias helper imported from bs4._deprecation
# (presumably it forwards to find() with a deprecation warning --
# confirm against that helper).
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")
# e.g. find_all(string="foo")
#  -> string information but no tag information
#  -> strings
#
# Also covers unlikely cases like find_all(name=None, string="foo")
#
# "To mark parameters as keyword-only, indicating the parameters
# must be passed by keyword argument, place an * in the arguments
# list just before the first keyword-only parameter."
#
# --https://peps.python.org/pep-0570/#keyword-only-arguments
@overload
def find_all(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    *,
    string: _StrainableString,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

# e.g. find_all() -> default behavior -> tags
#      find_all(attr="value") -> only tags have attrs -> tags
@overload
def find_all(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

# e.g. find_all(attrs=dict(attr="value"))
#  -> only tags have attrs
#  -> tags
@overload
def find_all(
    self,
    name: None,
    attrs: _StrainableAttributes,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

# e.g. find_all(name="a") -> only tags have names -> tags
#
# The confusing and controversial case of find_all(name="a", string="foo")
# also hits this overload.
@overload
def find_all(
    self,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

# Without the clues above, we don't know whether the method will
# return strings or tags. However every common case will trigger one
# of the overloads and give us the clue we need.
def find_all(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings]:
    """Look in the children of this `PageElement` and find all
    `PageElement` objects that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :param limit: Stop looking after finding this many results.
    :kwargs: Additional filters on attribute values.
    :return: The results of the search, typed as tags unless
        ``string`` was the only criterion given.
    """
    generator = self._generator_for_recursive(recursive)

    if string is None or name is not None or attrs is not None or kwargs:
        # Either there is no string filter at all, or it is combined
        # with tag criteria; both are treated as a search for tags.
        #
        # TODO: Using the @overload decorator to express the three ways
        # you could combine a string filter with tag criteria is way
        # too much code for a rarely(?) used feature.
        return cast(
            ResultSet[Tag],
            self._find_all(name, attrs, string, limit, generator, **kwargs),
        )

    # Otherwise the string filter is the only criterion, and we're
    # searching for strings.
    return cast(
        ResultSet[NavigableString],
        self._find_all(None, None, string, limit, generator, **kwargs),
    )
# Deprecated aliases kept for old code; both are built by the
# _deprecated_function_alias helper imported from bs4._deprecation
# and name find_all as their replacement.
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")
3251 # Generator methods
@property
def children(self) -> Iterator[PageElement]:
    """Iterate over all direct children of this `PageElement`.

    :return: A new generator over ``self.contents``.
    """
    direct_children = self.contents
    return (child for child in direct_children)
@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` and then everything beneath it.

    The descendants come from `descendants`, i.e. in document
    order (depth-first), not breadth-first.
    """
    everything_below = self.descendants
    return self._self_and(everything_below)
@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`, in document
    order (depth-first, not breadth-first).

    The walk starts at the first child and follows ``next_element``
    links until it reaches whatever comes after this tag's last
    descendant. A tag with no contents yields nothing.
    """
    if len(self.contents) == 0:
        return

    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, it returns self.
    final = cast(PageElement, self._last_descendant(accept_self=True))
    # The first element past this subtree; reaching it ends the walk.
    boundary = final.next_element

    node: _AtMostOneElement = self.contents[0]
    while node is not None and node is not boundary:
        # Look up the following element before handing this one out
        # (presumably so that whatever the consumer does with the
        # yielded element can't derail the walk).
        upcoming = node.next_element
        yield node
        node = upcoming
def _generator_for_recursive(self, recursive:bool) -> Iterator[PageElement]:
    """Translate the boolean `recursive` argument of the find*
    methods into the iterator to search.

    :param recursive: Whether the search should go below the direct
        children.
    :return: `descendants` if ``recursive`` is true, `children`
        otherwise.
    """
    return self.descendants if recursive else self.children
3292 # CSS selector code
def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Perform a CSS selection operation on the current element,
    by way of the ``select_one`` method of `css`.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)
def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Perform a CSS selection operation on the current element,
    by way of the ``select`` method of `css`.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)
@property
def css(self) -> CSS:
    """Return an interface to the CSS selector API.

    A new `CSS` object wrapping this element is created on every
    access.
    """
    return CSS(self)
3340 # Old names for backwards compatibility
@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator.

    Returns the same iterator as the `children` property.

    :meta private:
    """
    return self.children
@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator.

    Returns the same iterator as the `descendants` property.

    :meta private:
    """
    return self.descendants
@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated method. This was kind of misleading because has_key()
    (attributes) was different from the ``in`` operator, i.e.
    ``__contains__`` (contents).

    has_key() is gone in Python 3, anyway.

    :param key: The attribute name to look for.
    :return: Whatever `has_attr` returns for ``key``.

    :meta private:
    """
    return self.has_attr(key)
# Type variable for the kind of element a ResultSet holds; it is
# bounded by PageElement.
_PageElementT = TypeVar("_PageElementT", bound=PageElement)
class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A ResultSet is a list of `PageElement` objects, gathered as the result
    of matching an :py:class:`ElementFilter` against a parse tree. Basically, a list of
    search results.
    """

    # The filter that produced these results, if there was one.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Create a ResultSet.

        :param source: The filter that produced the results, or None.
        :param result: The matching elements; they become the
            contents of the list.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        Python only calls this when normal attribute lookup has
        failed, so real list attributes are unaffected.

        :param key: The name of the missing attribute.
        :raises AttributeError: Always.
        """
        message = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(message)
3391# Now that all the classes used by SoupStrainer have been defined,
3392# import SoupStrainer itself into this module to preserve the
3393# backwards compatibility of anyone who imports
3394# bs4.element.SoupStrainer.
3395from bs4.filter import SoupStrainer # noqa: E402