Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 55%

1668 """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_."""

1669

1670 PREFIX: str = "<!--"

1671 SUFFIX: str = "-->"

1672

1673

class Declaration(PreformattedString):
    """A string holding an `XML declaration
    <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_.

    The class only overrides the two delimiter constants below;
    presumably `PreformattedString` places them around the string's
    content on output (that base class is not visible from here).
    """

    # Text that opens the declaration.
    PREFIX: str = "<?"
    # Text that closes the declaration.
    SUFFIX: str = "?>"

1679

1680

class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_."""

    # Delimiters for the declaration; note that the suffix ends with a
    # newline.
    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Build a `Doctype` from a root element name plus optional
        public and system identifiers.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        :return: A `Doctype` wrapping the string produced by
            `_string_for_name_and_ids`.
        """
        doctype_text = cls._string_for_name_and_ids(name, pub_id, system_id)
        return Doctype(doctype_text)

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Assemble the text that a `Doctype` object is built from.

        Kept apart from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :param name: Root element name; a falsy value is treated as "".
        :param pub_id: If given, emitted as ``PUBLIC "<pub_id>"``,
            followed by the quoted system ID when that is also given.
        :param system_id: If given without a public ID, emitted as
            ``SYSTEM "<system_id>"``.
        :return: The assembled string, without PREFIX or SUFFIX.
        """
        value = name or ""
        has_public = pub_id is not None
        has_system = system_id is not None
        if has_public:
            value += f' PUBLIC "{pub_id}"'
            if has_system:
                value += f' "{system_id}"'
        elif has_system:
            value += f' SYSTEM "{system_id}"'
        return value

1719

1720

class Stylesheet(NavigableString):
    """The text found inside a `<style> HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (most likely CSS), represented as a `NavigableString` subclass.

    Having a dedicated type makes it possible to tell embedded
    stylesheets apart from ordinary textual content.
    """

1728

1729

class Script(NavigableString):
    """The text found inside a `<script> HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (most likely Javascript), represented as a `NavigableString`
    subclass.

    Having a dedicated type makes it possible to tell executable code
    apart from ordinary textual content.
    """

1738

1739

class TemplateString(NavigableString):
    """A string that was found inside an `HTML <template> tag
    <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document, represented as a `NavigableString`
    subclass.

    Having a dedicated type makes it possible to tell such strings
    apart from the main body of the document.
    """

1747

1748

class RubyTextString(NavigableString):
    """The text found inside an `<rt> HTML tag
    <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_,
    represented as a `NavigableString` subclass.

    Having a dedicated type makes it possible to tell these
    annotation strings apart from the strings they annotate.
    """

1756

1757

class RubyParenthesisString(NavigableString):
    """The text found inside an `<rp> HTML tag
    <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_,
    represented as a `NavigableString` subclass.
    """

1762

1763

1764class Tag(PageElement):

1765 """An HTML or XML tag that is part of a parse tree, along with its

1766 attributes, contents, and relationships to other parts of the tree.

1767

1768 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will

1769 create a `Tag` object representing the ``<b>`` tag. You can

1770 instantiate `Tag` objects directly, but it's not necessary unless

1771 you're adding entirely new markup to a parsed document. Most of

1772 the constructor arguments are intended for use by the `TreeBuilder`

1773 that's parsing a document.

1774

1775 :param parser: A `BeautifulSoup` object representing the parse tree this

1776 `Tag` will be part of.

1777 :param builder: The `TreeBuilder` being used to build the tree.

1778 :param name: The name of the tag.

1779 :param namespace: The URI of this tag's XML namespace, if any.

1780 :param prefix: The prefix for this tag's XML namespace, if any.

1781 :param attrs: A dictionary of attribute values.

1782 :param parent: The `Tag` to use as the parent of this `Tag`. May be

1783 the `BeautifulSoup` object itself.

1784 :param previous: The `PageElement` that was parsed immediately before

1785 parsing this tag.

1786 :param is_xml: If True, this is an XML tag. Otherwise, this is an

1787 HTML tag.

1788 :param sourceline: The line number where this tag was found in its

1789 source document.

1790 :param sourcepos: The character position within ``sourceline`` where this

1791 tag was found.

1792 :param can_be_empty_element: If True, this tag should be

1793 represented as <tag/>. If False, this tag should be represented

1794 as <tag></tag>.

1795 :param cdata_list_attributes: A dictionary of attributes whose values should

1796 be parsed as lists of strings if they ever show up on this tag.

1797 :param preserve_whitespace_tags: Names of tags whose contents

1798 should have their whitespace preserved if they are encountered inside

1799 this tag.

1800 :param interesting_string_types: When iterating over this tag's

1801 string contents in methods like `Tag.strings` or

1802 `PageElement.get_text`, these are the types of strings that are

1803 interesting enough to be considered. By default,

1804 `NavigableString` (normal strings) and `CData` (CDATA

1805 sections) are the only interesting string subtypes.

1806 :param namespaces: A dictionary mapping currently active

1807 namespace prefixes to URIs, as of the point in the parsing process when

1808 this tag was encountered. This can be used later to

1809 construct CSS selectors.

1810

1811 """

1812

def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Initialize a tag from explicit values and/or a `TreeBuilder`.

    When ``builder`` is given, the attribute container classes, the
    XML flag, the empty-element flag, the CDATA-list attributes, the
    whitespace-preserving tag names and the interesting string types
    are all taken from the builder. Otherwise the corresponding
    arguments are stored as passed.

    :raises ValueError: If ``name`` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # These used to be assigned inside an if/else on
    # builder.store_line_numbers whose two arms were identical, so
    # the values are simply stored as given.
    self.sourceline = sourceline
    self.sourcepos = sourcepos

    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    else:
        if builder is not None and builder.cdata_list_attributes:
            self.attrs = builder._replace_cdata_list_attribute_values(
                self.name, attrs
            )
        else:
            self.attrs = attr_dict_class()
            # Make sure that the values of any multi-valued
            # attributes (e.g. when a Tag is copied) are stored in
            # new lists.
            for k, v in attrs.items():
                if isinstance(v, list):
                    v = v.__class__(v)
                self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        self.attribute_value_list_class = builder.attribute_value_list_class
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES

1933

1934 parser_class: Optional[type[BeautifulSoup]]

1935 name: str

1936 namespace: Optional[str]

1937 prefix: Optional[str]

1938 attrs: _AttributeValues

1939 sourceline: Optional[int]

1940 sourcepos: Optional[int]

1941 known_xml: Optional[bool]

1942 contents: List[PageElement]

1943 hidden: bool

1944 interesting_string_types: Optional[Set[Type[NavigableString]]]

1945

1946 can_be_empty_element: Optional[bool]

1947 cdata_list_attributes: Optional[Dict[str, Set[str]]]

1948 preserve_whitespace_tags: Optional[Set[str]]

1949

1950 #: :meta private:

1951 parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")

1952

def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """Return a new Tag, detached from any parse tree, whose contents
    are copies of this Tag's contents.

    :param memo: Passed through unchanged to each descendant's
        ``__deepcopy__``.
    :param recursive: If False, only this Tag is copied (via
        `copy_self`) and its descendants are left out.
    """
    clone = self.copy_self()
    if not recursive:
        return clone

    # Walk the descendants as a flat event stream so that no recursive
    # function calls are needed. The last entry of open_tags is the
    # copy that currently receives new children.
    open_tags: List[Tag] = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # The Tag that was just closed takes no more children.
            open_tags.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        # Attach the copy to its parent's .contents
        open_tags[-1].append(copied)

        if event is Tag.START_ELEMENT_EVENT:
            # Children of this Tag come next; they get appended to
            # the copy that was just made.
            open_tags.append(cast(Tag, copied))
    return clone

1978

def copy_self(self) -> Self:
    """Build a new Tag with this one's name, namespace, prefix,
    attributes and settings, but with no contents and no parent.

    This is the first step of the deepcopy process. It can also be
    called directly to copy a Tag without copying what is inside it.
    """
    tag_class = type(self)
    duplicate = tag_class(
        None,
        None,
        self.name,
        self.namespace,
        self.prefix,
        self.attrs,
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    # Carry these two flags over explicitly, whatever the constructor
    # did with them.
    duplicate.can_be_empty_element = self.can_be_empty_element
    duplicate.hidden = self.hidden
    return duplicate

2006

2007 @property

2008 def is_empty_element(self) -> bool:

2009 """Is this tag an empty-element tag? (aka a self-closing tag)

2010

2011 A tag that has contents is never an empty-element tag.

2012

2013 A tag that has no contents may or may not be an empty-element

2014 tag. It depends on the `TreeBuilder` used to create the

2015 tag. If the builder has a designated list of empty-element

2016 tags, then only a tag whose name shows up in that list is

2017 considered an empty-element tag. This is usually the case

2018 for HTML documents.

2019

2020 If the builder has no designated list of empty-element, then

2021 any tag with no contents is an empty-element tag. This is usually

2022 the case for XML documents.

2023 """

2024 return len(self.contents) == 0 and self.can_be_empty_element is True

2025

@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    ": :meta private:"
    # Deprecated spelling; the answer is the is_empty_element property.
    answer = self.is_empty_element
    return answer

2030

2031 @property

2032 def string(self) -> Optional[str]:

2033 """Convenience property to get the single string within this

2034 `Tag`, assuming there is just one.

2035

2036 :return: If this `Tag` has a single child that's a

2037 `NavigableString`, the return value is that string. If this

2038 element has one child `Tag`, the return value is that child's

2039 `Tag.string`, recursively. If this `Tag` has no children,

2040 or has more than one child, the return value is ``None``.

2041

2042 If this property is unexpectedly returning ``None`` for you,

2043 it's probably because your `Tag` has more than one thing

2044 inside it.

2045 """

2046 if len(self.contents) != 1:

2047 return None

2048 child = self.contents[0]

2049 if isinstance(child, NavigableString):

2050 return child

2051 elif isinstance(child, Tag):

2052 return child.string

2053 return None

2054

@string.setter
def string(self, string: str) -> None:
    """Replace the `Tag.contents` of this `Tag` with a single string.

    A value that is already a `NavigableString` keeps its own class;
    anything else is wrapped in a plain `NavigableString`.
    """
    self.clear()
    new_class = (
        string.__class__ if isinstance(string, NavigableString) else NavigableString
    )
    self.append(new_class(string))

2064

2065 #: :meta private:

2066 MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}

2067

def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Yield the descendant strings of the requested classes,
    optionally stripped.

    :param strip: If True, each string is stripped before being
        yielded, and strings that become empty are skipped.

    :param types: A single `NavigableString` subclass or a collection
        of them. A string is kept only if its exact type matches
        (subclasses do not count). When left at the default,
        self.interesting_string_types is used; if that is None, only
        NavigableString and CData objects are considered, which
        means no comments, processing instructions, etc. Passing
        None explicitly disables the type filter.
    """
    if types is self.default:
        configured = self.interesting_string_types
        types = self.MAIN_CONTENT_STRING_TYPES if configured is None else configured

    for node in self.descendants:
        if not isinstance(node, NavigableString):
            continue
        node_type = type(node)
        if isinstance(types, type):
            wanted = node_type is types
        else:
            wanted = types is None or node_type in types
        if not wanted:
            # We're not interested in strings of this type.
            continue
        if not strip:
            yield node
            continue
        text = node.strip()
        if len(text) != 0:
            yield text

2108

2109 strings = property(_all_strings)

2110

def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once. The inserted elements end up next to
    each other, in the order they were passed in.

    :param position: The numeric position that should be occupied
       in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    remaining = len(new_children)
    for new_child in new_children:
        just_inserted = self._insert(position, new_child)
        inserted.extend(just_inserted)
        remaining -= 1
        if remaining and just_inserted:
            # Don't assume that exactly one element landed exactly at
            # `position`: a BeautifulSoup argument expands into all of
            # its children, and moving an existing child forward
            # shifts the target index. Continue right after whatever
            # was placed last. This lookup is skipped for the last (or
            # only) argument, so single-element inserts cost nothing
            # extra.
            position = self.index(just_inserted[-1]) + 1
    return inserted

2129

def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element into this tag's contents and rewire
    the sibling and document-order links around it.

    :param position: Index in ``self.contents`` the new child should
        occupy. Values past the end are clamped to the end.
    :param new_child: The element to insert. A plain ``str`` is
        wrapped in a `NavigableString`. A ``BeautifulSoup`` object is
        replaced by its children, which are inserted through
        `Tag.insert`.
    :return: A list holding the inserted element, or, for a
        ``BeautifulSoup`` argument, whatever `Tag.insert` returned
        for its children.
    :raises ValueError: If ``new_child`` is None or is this tag.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the BeautifulSoup's children and
        # return them.
        return self.insert(position, *list(new_child.contents))
    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
            elif current_index == position:
                # We're 'inserting' an element into its current location.
                # This is a no-op.
                return [new_child]
        # The element currently has a parent (this tag or another
        # one): detach it before linking it in here.
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: no previous sibling, and this tag itself is
        # the element that comes just before it.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(
        is_initialized=False, accept_self=True
    )
    # new_childs_last_element can't be None because we passed
    # accept_self=True into _last_descendant. Worst case,
    # new_childs_last_element will be new_child itself. Making
    # this cast removes several mypy complaints later on as we
    # manipulate new_childs_last_element.
    new_childs_last_element = cast(PageElement, new_childs_last_element)

    if position >= len(self.contents):
        # Appending at the end: there is no next sibling.
        new_child.next_sibling = None

        # Walk up through this tag and its ancestors until one of
        # them has a next sibling.
        parent: Optional[Tag] = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = (
            new_childs_last_element
        )
    # The links are updated first; the element is added to the
    # contents list last.
    self.contents.insert(position, new_child)

    return [new_child]

2216

def unwrap(self) -> Self:
    """Remove this element from the tree, leaving its children in
    its place.

    :return: This object, no longer part of the tree.
    :raises ValueError: If this element has no parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = container.index(self)
    self.extract(_self_index=slot)
    # Every child is inserted at the same index, so go through them
    # back to front to keep their original order.
    for child in reversed(list(self.contents)):
        container.insert(slot, child)
    return self

2233

2234 replace_with_children = unwrap

2235

@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    ": :meta private:"
    # Deprecated spelling; the work is done by unwrap().
    unwrapped = self.unwrap()
    return unwrapped

2240

def append(self, tag: _InsertableElement) -> PageElement|List[PageElement]:
    """Add the given `PageElement` at the end of this `Tag`'s contents.

    :param tag: A PageElement. If this is another BeautifulSoup
        object, all of its contents will be inserted into this
        `Tag`, since one BeautifulSoup object can't contain another
        one.

    :return: The object that was just appended, or (if `tag` was a BeautifulSoup
        object) all such objects.
    """
    end = len(self.contents)
    inserted = self.insert(end, tag)
    # TODO: can't reference BeautifulSoup class in this module, so a
    # BeautifulSoup object is recognized by its tag name.
    came_from_document = isinstance(tag, Tag) and tag.name == "[document]"
    return inserted if came_from_document else inserted[0]

2257

def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`.

    :return: The list of PageElements that were appended.
    :raises TypeError: If ``tags`` is not a `Tag`, a `PageElement`,
        a string or an iterable.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        tag_list = list(tags)
    else:
        # Without this branch tag_list would be unbound and the loop
        # below would fail with an UnboundLocalError that says nothing
        # about the actual mistake.
        raise TypeError(
            "Tag.extend expects a Tag or an iterable of elements, not %s."
            % type(tags).__name__
        )

    results: List[PageElement] = []
    for tag in tag_list:
        appended = self.append(tag)
        if isinstance(appended, list):
            # This can happen if you pass in a mixture of Tag and BeautifulSoup objects.
            results.extend(appended)
        else:
            results.append(appended)

    return results

2299

2300 def clear(self, decompose: bool = False) -> None:

2301 """Destroy all children of this `Tag` by calling

2302 `PageElement.extract` on them.

2303

2304 :param decompose: If this is True, `PageElement.decompose` (a

2305 more destructive method) will be called instead of

2306 `PageElement.extract`.

2307 """

2308 for element in self.contents[:]:

2309 if decompose:

2310 element.decompose()

2311 else:

2312 element.extract()

2313

def smooth(self) -> None:
    """Consolidate runs of adjacent strings among this `Tag`'s
    children, and do the same recursively inside every child `Tag`.

    After many tree modifications, calling this can make
    pretty-printed output look more natural.
    """
    # Record the index of the first member of every adjacent pair
    # that has to be merged. This avoids copying self.contents,
    # because in most cases very few strings are affected.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last child has no right-hand neighbor to merge with.
            continue
        following = self.contents[position + 1]
        both_plain_strings = all(
            isinstance(candidate, NavigableString)
            and not isinstance(candidate, PreformattedString)
            for candidate in (current, following)
        )
        if both_plain_strings:
            merge_points.append(position)

    # Merge from the back, so that removing items from .contents
    # leaves the positions still to be processed untouched.
    for position in reversed(merge_points):
        first = cast(NavigableString, self.contents[position])
        second = cast(NavigableString, self.contents[position + 1])
        second.extract()
        first.replace_with(NavigableString(first + second))

2353

2354 def index(self, element: PageElement) -> int:

2355 """Find the index of a child of this `Tag` (by identity, not value).

2356

2357 Doing this by identity avoids issues when a `Tag` contains two

2358 children that have string equality.

2359

2360 :param element: Look for this `PageElement` in this object's contents.

2361 """

2362 for i, child in enumerate(self.contents):

2363 if child is element:

2364 return i

2365 raise ValueError("Tag.index: element not in tag")

2366

2367 def get(

2368 self, key: str, default: Optional[_AttributeValue] = None

2369 ) -> Optional[_AttributeValue]:

2370 """Returns the value of the 'key' attribute for the tag, or

2371 the value given for 'default' if it doesn't have that

2372 attribute.

2373

2374 :param key: The attribute to look for.

2375 :param default: Use this value if the attribute is not present

2376 on this `Tag`.

2377 """

2378 return self.attrs.get(key, default)

2379

2380 def get_attribute_list(

2381 self, key: str, default: Optional[AttributeValueList] = None

2382 ) -> AttributeValueList:

2383 """The same as get(), but always returns a (possibly empty) list.

2384

2385 :param key: The attribute to look for.

2386 :param default: Use this value if the attribute is not present

2387 on this `Tag`.

2388 :return: A list of strings, usually empty or containing only a single

2389 value.

2390 """

2391 list_value: AttributeValueList

2392 value = self.get(key, default)

2393 if value is None:

2394 list_value = self.attribute_value_list_class()

2395 elif isinstance(value, list):

2396 list_value = value

2397 else:

2398 if not isinstance(value, str):

2399 value = cast(str, value)

2400 list_value = self.attribute_value_list_class([value])

2401 return list_value

2402

2403 def has_attr(self, key: str) -> bool:

2404 """Does this `Tag` have an attribute with the given name?"""

2405 return key in self.attrs

2406

2407 def __hash__(self) -> int:

2408 return str(self).__hash__()

2409

2410 def __getitem__(self, key: str) -> _AttributeValue:

2411 """tag[key] returns the value of the 'key' attribute for the Tag,

2412 and throws an exception if it's not there."""

2413 return self.attrs[key]

2414

2415 def __iter__(self) -> Iterator[PageElement]:

2416 "Iterating over a Tag iterates over its contents."

2417 return iter(self.contents)

2418

2419 def __len__(self) -> int:

2420 "The length of a Tag is the length of its list of contents."

2421 return len(self.contents)

2422

2423 def __contains__(self, x: Any) -> bool:

2424 return x in self.contents

2425

2426 def __bool__(self) -> bool:

2427 "A tag is non-None even if it has no contents."

2428 return True

2429

2430 def __setitem__(self, key: str, value: _AttributeValue) -> None:

2431 """Setting tag[key] sets the value of the 'key' attribute for the

2432 tag."""

2433 self.attrs[key] = value

2434

2435 def __delitem__(self, key: str) -> None:

2436 "Deleting tag[key] deletes all 'key' attributes for the tag."

2437 self.attrs.pop(key, None)

2438

# Typing overload: called without a ``string`` filter, the result is
# declared as tags.
@overload
def __call__( # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

# Typing overload: called with only a ``string`` filter, the result
# is declared as strings.
@overload
def __call__(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def __call__(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Calling a Tag like a function is the same as calling its
    find_all() method. Eg. tag('a') returns a list of all the A tags
    found within this tag.

    All arguments are forwarded to find_all(). In each of the three
    branches below the values that reach find_all() are the ones the
    caller supplied (the explicit ``None`` arguments stand in for
    arguments that are None in that branch), so the branches differ
    only in the static type given to the result.
    """
    if string is not None and (name is not None or attrs is not None or kwargs):
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        return cast(ResultSet[Tag], self.find_all(name, attrs, recursive, string, limit, _stacklevel, **kwargs)) #type: ignore

    if string is None:
        # If string is None, we're searching for tags.
        tags:ResultSet[Tag] = self.find_all(
            name, attrs, recursive, None, limit, _stacklevel, **kwargs
        )
        return tags

    # Otherwise, we're searching for strings.
    strings:ResultSet[NavigableString] = self.find_all(
        None, None, recursive, string, limit, _stacklevel, **kwargs
    )
    return strings

2496

def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Treat ``tag.subtag`` as shorthand for ``tag.find("subtag")``.

    :param subtag: The attribute name being looked up.
    :return: The result of calling `find` with that name.
    :raises AttributeError: If ``subtag`` starts with a double
        underscore or is ``"contents"``.
    """
    result: _AtMostOneElement
    if len(subtag) > 3 and subtag.endswith("Tag"):
        # BS3: soup.aTag -> soup.find("a")
        tag_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % dict(name=tag_name),
            DeprecationWarning,
            stacklevel=2,
        )
        result = self.find(tag_name)
    # We special case contents to avoid recursion.
    elif not subtag.startswith("__") and not subtag == "contents":
        result = self.find(subtag)
    else:
        # Report the bare class name, matching the wording Python itself
        # uses for a missing attribute, rather than the repr of the class
        # object.
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__.__name__, subtag)
        )
    return result

2519

def __eq__(self, other: Any) -> bool:
    """Compare this `Tag` to ``other`` by structure.

    Two tags are equal iff they have the same name, the same
    attributes, and the same contents (compared recursively, child by
    child).
    """
    if other is self:
        return True
    if not isinstance(other, Tag):
        return False

    has_tag_shape = (
        hasattr(other, "name")
        and hasattr(other, "attrs")
        and hasattr(other, "contents")
    )
    if not has_tag_shape:
        return False

    if (
        self.name != other.name
        or self.attrs != other.attrs
        or len(self) != len(other)
    ):
        return False

    # Same name, attributes and length: equal unless some child differs
    # from the child at the same position in `other`.
    return not any(
        mine != other.contents[position]
        for position, mine in enumerate(self.contents)
    )

2540

def __ne__(self, other: Any) -> bool:
    """Return the negation of ``self == other``.

    True iff this `Tag` is not identical to ``other`` under the rules
    implemented by `__eq__`.
    """
    if self == other:
        return False
    return True

2545

def __repr__(self) -> str:
    """Return the markup produced by `decode` with default arguments."""
    markup = self.decode()
    return markup

# str() and the legacy __unicode__ hook share the same rendering.
__str__ = __unicode__ = __repr__

2551

def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    The tag is first rendered to a Unicode string by `decode`, and that
    string is then encoded.

    :param encoding: The encoding to use when converting to
        a bytestring. This may also affect the text of the document,
        specifically any encoding declarations within the document.
    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        :py:meth:`str.encode` and its value should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    return self.decode(indent_level, encoding, formatter).encode(encoding, errors)

2581

def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    The rendering is built iteratively from the events yielded by
    `_event_stream`, not by recursive calls.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) None disables pretty-printing entirely.
    :param eventual_encoding: The encoding you intend to use when
        converting the string to a bytestring. decode() is *not*
        responsible for performing that encoding. This information
        is needed so that a real encoding can be substituted in if
        the document contains an encoding declaration (e.g. in a
        <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
        naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :return: The concatenation of the rendered pieces.
    """
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # A literal True is accepted and treated as indentation level zero.
    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            # A closing tag is rendered one level shallower than the
            # contents it closes.
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # We are about to enter string literal mode. Add
            # whitespace before this tag, but not after. We
            # will stay in string literal mode until this tag
            # is closed.
            indent_before = True
            indent_after = False
            string_literal_tag = element
        elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # We are about to exit string literal mode by closing
            # the tag that sent us into that mode. Add whitespace
            # after this tag, but not before.
            indent_before = False
            indent_after = True
            string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if indent_before or indent_after:
                # Strings are stripped before being re-indented; a string
                # that is empty after stripping gets no whitespace at all.
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            # Everything inside a newly opened tag is one level deeper.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)

2690

class _TreeTraversalEvent(object):
    """An internal class representing an event in the process
    of traversing a parse tree.

    The class has no attributes or methods of its own; its instances
    serve purely as distinct marker objects.

    :meta private:
    """

# Stand-ins for the different events yielded by _event_stream.
# Each constant is a separate instance, so consumers can tell the
# events apart by comparing against these constants.
START_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:

2703

def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
        the tree. Defaults to `self_and_descendants`.
    :return: An iterator of ``(event, element)`` pairs, where ``event``
        is one of the ``Tag.*_EVENT`` constants.
    """
    tag_stack: List[Tag] = []

    iterator = iterator or self.self_and_descendants

    for c in iterator:
        # If the parent of the element we're about to yield is not
        # the tag currently on the stack, it means that the tag on
        # the stack closed before this element appeared.
        #
        # This must be an identity test. Comparing with != would run
        # Tag.__eq__, a recursive structural comparison, which is both
        # costly and able to recurse deeply on similar nested tags --
        # exactly what this non-recursive traversal exists to avoid.
        while tag_stack and c.parent is not tag_stack[-1]:
            now_closed_tag = tag_stack.pop()
            yield Tag.END_ELEMENT_EVENT, now_closed_tag

        if isinstance(c, Tag):
            if c.is_empty_element:
                # Void elements never get a matching END event, so
                # they are not pushed onto the stack.
                yield Tag.EMPTY_ELEMENT_EVENT, c
            else:
                yield Tag.START_ELEMENT_EVENT, c
                tag_stack.append(c)
        else:
            yield Tag.STRING_ELEMENT_EVENT, c

    # The iterator is exhausted; close whatever is still open,
    # innermost tag first.
    while tag_stack:
        now_closed_tag = tag_stack.pop()
        yield Tag.END_ELEMENT_EVENT, now_closed_tag

2747

def _indent_string(
    self,
    s: str,
    indent_level: int,
    formatter: Formatter,
    indent_before: bool,
    indent_after: bool,
) -> str:
    """Wrap a string in the whitespace used for pretty-printing.

    :param s: The string to amend with whitespace.
    :param indent_level: The indentation level; the formatter's indent
        unit is repeated this many times in front of the string.
    :param formatter: The object whose ``indent`` attribute supplies
        one level's worth of indentation.
    :param indent_before: Whether or not to add whitespace
        before the string.
    :param indent_after: Whether or not to add whitespace
        (a newline) after the string.
    """
    # Level zero (or no leading indentation requested) means no prefix.
    leading = formatter.indent * indent_level if indent_before and indent_level else ""
    trailing = "\n" if indent_after else ""
    return leading + s + trailing

2775

def _format_tag(
    self, eventual_encoding: str, formatter: Formatter, opening: bool
) -> str:
    """Render this tag's opening or closing markup, without its contents.

    :param eventual_encoding: The encoding the document will end up in;
        substituted into attribute values that support it.
    :param formatter: Supplies the attribute list, the attribute value
        escaping and quoting, and the void-element closing prefix.
    :param opening: True to render the opening tag (with attributes),
        False to render the closing tag.
    :return: The markup, or the empty string for a hidden tag.
    """
    # A hidden tag is invisible, although its contents are visible.
    if self.hidden:
        return ""

    # Pieces, in output order: "<", an optional "/" for a closing tag,
    # an optional namespace prefix, the name, the attributes, an
    # optional void-element slash, ">".
    slash = "" if opening else "/"
    namespace = self.prefix + ":" if self.prefix else ""

    # Attributes are only rendered on an opening tag.
    rendered = []
    if opening:
        for key, val in formatter.attributes(self):
            if val is None:
                # A valueless attribute is rendered as its bare name.
                rendered.append(key)
                continue
            if isinstance(val, (list, tuple)):
                val = " ".join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.substitute_encoding(eventual_encoding)

            text = formatter.attribute_value(val)
            rendered.append(str(key) + "=" + formatter.quoted_attribute_value(text))
    attribute_string = " " + " ".join(rendered) if rendered else ""

    # An optional closing slash (for a void element in an XML document).
    void_slash = ""
    if self.is_empty_element:
        void_slash = formatter.void_element_close_prefix or ""

    return "<" + slash + namespace + self.name + attribute_string + void_slash + ">"

2837

def _should_pretty_print(self, indent_level: int = 1) -> bool:
    """Decide whether this tag may be pretty-printed.

    Most tags may, but a tag whose name is listed in
    ``preserve_whitespace_tags`` (such as <pre> in HTML documents)
    may not, and nothing is pretty-printed when ``indent_level`` is None.
    """
    if indent_level is None:
        return False
    preserved = self.preserve_whitespace_tags
    if not preserved:
        return True
    return self.name not in preserved

2848

@overload
def prettify(
    self,
    encoding: None = None,
    formatter: _FormatterOrName = "minimal",
) -> str:
    ...

@overload
def prettify(
    self,
    encoding: _Encoding,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    ...

def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Pretty-print this `Tag` as a string or bytestring.

    Pretty-printing always starts at indentation level zero.

    :param encoding: The encoding of the bytestring, or None if you want Unicode.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string (if no ``encoding`` is provided) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)

2882

def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render what is inside this tag, but not the tag itself, as a
    Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is *not*
        responsible for performing that encoding. This information
        is needed so that a real encoding can be substituted in if
        the document contains an encoding declaration (e.g. in a
        <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    """
    # Handing decode() the descendants iterator makes it skip this tag.
    inner_elements = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=inner_elements
    )

2910

def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    errors: str = "strict",
) -> bytes:
    """Renders the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy passed along into
        :py:meth:`str.encode`, e.g. 'xmlcharrefreplace'. The default,
        'strict', keeps the historical behavior of raising
        `UnicodeEncodeError` for characters ``encoding`` cannot
        represent.
    :raises UnicodeEncodeError: Under the default 'strict' strategy, if
        the rendered contents cannot be represented in ``encoding``.
    """
    contents = self.decode_contents(indent_level, encoding, formatter)
    return contents.encode(encoding, errors)

2930

@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated BS3-compatible wrapper around `encode_contents`.

    ``indentLevel`` is only honored when ``prettyPrint`` is true;
    otherwise the contents are rendered without pretty-printing.

    :meta private:
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_indent, encoding=encoding)

2945

2946 # Soup methods

2947

@overload
def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None=None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

@overload
def find(
    self,
    name: None=None,
    attrs: None=None,
    recursive: bool = True,
    string: _StrainableString="",
) -> _AtMostOneNavigableString:
    ...

def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
    """Return the first `PageElement` below this one that matches the
    given criteria, or None if nothing matches.

    This is `find_all` with a limit of one. All find_* methods take a
    common set of arguments. See the online documentation for detailed
    explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    """
    if string is None:
        # No string filter: a search for tags.
        first_tags = self.find_all(name, attrs, recursive, None, 1, _stacklevel=3, **kwargs)
        return cast(Tag, first_tags[0]) if first_tags else None

    if name is None and attrs is None and not kwargs:
        # Only a string filter: a search for strings.
        first_strings = self.find_all(None, None, recursive, string, 1, _stacklevel=3, **kwargs)
        return cast(NavigableString, first_strings[0]) if first_strings else None

    # A string filter combined with tag criteria.
    # TODO: Using the @overload decorator to express the three ways you
    # could get into this path is way too much code for a rarely(?) used
    # feature.
    first_mixed = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs)  # type:ignore
    return cast(Tag, first_mixed[0]) if first_mixed else None

3007

# Old name for find(), kept as a deprecated alias. The helper receives the
# old name, the current name, and a version string (presumably the release
# in which the old name was deprecated -- confirm against the helper).
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")

3009

@overload
def find_all( # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def find_all(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def find_all(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Collect every `PageElement` below this one that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.descendants if recursive else self.children
    # Account for this frame before handing off to _find_all().
    inner_stacklevel = _stacklevel + 1

    if string is None:
        # No string filter: a search for tags.
        return cast(ResultSet[Tag], self._find_all(
            name, attrs, None, limit, candidates, _stacklevel=inner_stacklevel, **kwargs
        ))

    if name is None and attrs is None and not kwargs:
        # Only a string filter: a search for strings.
        return cast(ResultSet[NavigableString], self._find_all(
            None, None, string, limit, candidates, _stacklevel=inner_stacklevel, **kwargs
        ))

    # A string filter combined with tag criteria.
    # TODO: Using the @overload decorator to express the three ways you
    # could get into this path is way too much code for a rarely(?) used
    # feature.
    return cast(ResultSet[Tag], self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=inner_stacklevel, **kwargs
    ))

3085

# Old names for find_all(), kept as deprecated aliases. Each call passes
# the old name, the current name, and a version string (presumably the
# release in which the old name was deprecated -- confirm against the helper).
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")

3088

3089 # Generator methods

@property
def children(self) -> Iterator[PageElement]:
    """Iterate over the elements in ``contents``, i.e. the direct
    children of this `PageElement`, in order."""
    return (child for child in self.contents)

3094

@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` and then everything beneath it, in
    document order (depth-first: each tag is followed by its own
    descendants before its next sibling).

    `_event_stream` depends on this ordering to work out when a tag
    has closed.
    """
    return self._self_and(self.descendants)

3101

@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`, in document order
    (depth-first: each tag is followed by its own descendants before
    its next sibling).

    The walk starts at the first child and follows ``next_element``
    links until it reaches the element that comes after this tag's
    last descendant.
    """
    if len(self.contents) == 0:
        return
    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, the result is this tag itself.
    final_node = cast(PageElement, self._last_descendant(accept_self=True))
    boundary = final_node.next_element
    node: _AtMostOneElement = self.contents[0]
    while node is not None and node is not boundary:
        # Look up the successor before yielding, so the walk continues
        # from the link as it was before the consumer ran.
        upcoming = node.next_element
        yield node
        node = upcoming

3119

3120 # CSS selector code

def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Run a CSS selector against the current element and return what
    ``self.css.select_one`` returns for it.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)

3137

def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Run a CSS selector against the current element and return what
    ``self.css.select`` returns for it.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)

3162

@property
def css(self) -> CSS:
    """Return a `CSS` object wrapping this element, the entry point to
    the CSS selector API. A new object is built on every access."""
    selector_api = CSS(self)
    return selector_api

3167

3168 # Old names for backwards compatibility

@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of the `children` property.

    :meta private:
    """
    direct_children = self.children
    return direct_children

3176

@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of the `descendants` property.

    :meta private:
    """
    all_descendants = self.descendants
    return all_descendants

3184

@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated spelling of `has_attr`.

    The old name was kind of misleading, because has_key() looked at
    attributes while the ``in`` operator (``__contains__``) looks at
    contents. has_key() is gone in Python 3, anyway.

    :meta private:
    """
    attribute_present = self.has_attr(key)
    return attribute_present

3195

3196

# Type variable for the element type held by a ResultSet; restricted to
# PageElement and its subclasses.
_PageElementT = TypeVar("_PageElementT", bound=PageElement)

3198

class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects gathered as the result of matching
    an :py:class:`ElementFilter` against a parse tree. Basically, a list
    of search results that also remembers the filter that produced it.
    """

    # The filter that produced these results, if any.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Build the list from ``result`` and remember ``source``.

        :param source: The filter that produced these results, or None.
        :param result: The matching elements, in order.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        :raises AttributeError: Always.
        """
        message = (
            f'ResultSet object has no attribute "{key}". '
            "You're probably treating a list of elements like a single element. "
            "Did you call find_all() when you meant to call find()?"
        )
        raise AttributeError(message)

3218

3219# Now that all the classes used by SoupStrainer have been defined,

3220# import SoupStrainer itself into this module to preserve the

3221# backwards compatibility of anyone who imports

3222# bs4.element.SoupStrainer.

3223from bs4.filter import SoupStrainer # noqa: E402