Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 32%

1668 """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_."""

1669

1670 PREFIX: str = "<!--"

1671 SUFFIX: str = "-->"

1672

1673

class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_."""

    # Opening and closing delimiters for this node type. Presumably the
    # PreformattedString base class wraps the string in them on output --
    # confirm against that class, which is defined elsewhere.
    PREFIX: str = "<?"
    SUFFIX: str = "?>"

1679

1680

class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_.

    Use `Doctype.for_name_and_ids` to build one from a root element name
    and optional public/system identifiers.
    """

    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Build the `Doctype` matching a root element name and its IDs.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
          e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
          e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        :return: A new `Doctype` wrapping the generated declaration text.
        """
        declaration = cls._string_for_name_and_ids(name, pub_id, system_id)
        return Doctype(declaration)

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Produce the text a `Doctype` object is built from.

        Kept apart from for_name_and_ids() because the lxml TreeBuilder
        needs to call it directly.

        :return: ``name``, followed by a ``PUBLIC`` clause when a public
          ID is given, or a ``SYSTEM`` clause when only a system ID is.
        """
        declaration = name or ""
        if pub_id is None:
            # Without a public ID, a system ID must be introduced by the
            # SYSTEM keyword.
            if system_id is not None:
                declaration += f' SYSTEM "{system_id}"'
            return declaration

        declaration += f' PUBLIC "{pub_id}"'
        if system_id is not None:
            # After a PUBLIC clause the system ID follows bare.
            declaration += f' "{system_id}"'
        return declaration

1719

1720

class Stylesheet(NavigableString):
    """A `NavigableString` representing the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    Used to distinguish embedded stylesheets from textual content.
    """

    # Intentionally empty: this subclass adds no behavior of its own and
    # exists only as a distinct type.

1728

1729

class Script(NavigableString):
    """A `NavigableString` representing the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    Used to distinguish executable code from textual content.
    """

    # Intentionally empty: this subclass adds no behavior of its own and
    # exists only as a distinct type.

1738

1739

class TemplateString(NavigableString):
    """A `NavigableString` representing a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    """

    # Intentionally empty: this subclass adds no behavior of its own and
    # exists only as a distinct type.

1747

1748

class RubyTextString(NavigableString):
    """A NavigableString representing the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    Can be used to distinguish such strings from the strings they're
    annotating.
    """

    # Intentionally empty: this subclass adds no behavior of its own and
    # exists only as a distinct type.

1756

1757

class RubyParenthesisString(NavigableString):
    """A NavigableString representing the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.
    """

    # Intentionally empty: this subclass adds no behavior of its own and
    # exists only as a distinct type.

1762

1763

1764class Tag(PageElement):

1765 """An HTML or XML tag that is part of a parse tree, along with its

1766 attributes, contents, and relationships to other parts of the tree.

1767

1768 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will

1769 create a `Tag` object representing the ``<b>`` tag. You can

1770 instantiate `Tag` objects directly, but it's not necessary unless

1771 you're adding entirely new markup to a parsed document. Most of

1772 the constructor arguments are intended for use by the `TreeBuilder`

1773 that's parsing a document.

1774

1775 :param parser: A `BeautifulSoup` object representing the parse tree this

1776 `Tag` will be part of.

1777 :param builder: The `TreeBuilder` being used to build the tree.

1778 :param name: The name of the tag.

1779 :param namespace: The URI of this tag's XML namespace, if any.

1780 :param prefix: The prefix for this tag's XML namespace, if any.

1781 :param attrs: A dictionary of attribute values.

1782 :param parent: The `Tag` to use as the parent of this `Tag`. May be

1783 the `BeautifulSoup` object itself.

1784 :param previous: The `PageElement` that was parsed immediately before

1785 parsing this tag.

1786 :param is_xml: If True, this is an XML tag. Otherwise, this is an

1787 HTML tag.

1788 :param sourceline: The line number where this tag was found in its

1789 source document.

1790 :param sourcepos: The character position within ``sourceline`` where this

1791 tag was found.

1792 :param can_be_empty_element: If True, this tag should be

1793 represented as <tag/>. If False, this tag should be represented

1794 as <tag></tag>.

1795 :param cdata_list_attributes: A dictionary of attributes whose values should

1796 be parsed as lists of strings if they ever show up on this tag.

1797 :param preserve_whitespace_tags: Names of tags whose contents

1798 should have their whitespace preserved if they are encountered inside

1799 this tag.

1800 :param interesting_string_types: When iterating over this tag's

1801 string contents in methods like `Tag.strings` or

1802 `PageElement.get_text`, these are the types of strings that are

1803 interesting enough to be considered. By default,

1804 `NavigableString` (normal strings) and `CData` (CDATA

1805 sections) are the only interesting string subtypes.

1806 :param namespaces: A dictionary mapping currently active

1807 namespace prefixes to URIs, as of the point in the parsing process when

1808 this tag was encountered. This can be used later to

1809 construct CSS selectors.

1810

1811 """

1812

def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Set up this tag's name, namespace data, attributes and tree links.

    The parameters are described in the class docstring. When a
    ``builder`` is given, the builder supplies the attribute container
    classes, the XML flag and the per-tag settings; otherwise the
    corresponding arguments are used as passed.

    :raise ValueError: If ``name`` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix
    # NOTE(review): both branches below assign exactly the same values,
    # so the condition currently has no effect on the result.
    if (not builder or builder.store_line_numbers) and (
        sourceline is not None or sourcepos is not None
    ):
        self.sourceline = sourceline
        self.sourcepos = sourcepos
    else:
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    # Pick the container classes for attributes: the builder's own when
    # there is one, otherwise defaults chosen by the is_xml flag.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    else:
        if builder is not None and builder.cdata_list_attributes:
            # The builder declares list-valued attributes, so it gets to
            # build the attribute mapping for this tag.
            self.attrs = builder._replace_cdata_list_attribute_values(
                self.name, attrs
            )
        else:
            self.attrs = attr_dict_class()
            # Make sure that the values of any multi-valued
            # attributes (e.g. when a Tag is copied) are stored in
            # new lists.
            for k, v in attrs.items():
                if isinstance(v, list):
                    v = v.__class__(v)
                self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        # NOTE(review): this re-assigns the same value already stored in
        # self.attribute_value_list_class above.
        self.attribute_value_list_class = builder.attribute_value_list_class
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES

1933

# Declarations of the instance attributes that __init__ assigns. These
# lines only carry type information; they create no class attributes.
parser_class: Optional[type[BeautifulSoup]]
name: str
namespace: Optional[str]
prefix: Optional[str]
attrs: _AttributeValues
sourceline: Optional[int]
sourcepos: Optional[int]
known_xml: Optional[bool]
contents: List[PageElement]
hidden: bool
interesting_string_types: Optional[Set[Type[NavigableString]]]

# Taken from the TreeBuilder when one is supplied to __init__, otherwise
# from the matching constructor arguments.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]

# Presumably a deprecated camelCase spelling of parser_class -- confirm
# against _deprecated_alias, which is defined elsewhere.
#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")

1952

def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """Return a copy of this Tag that is detached from any parse tree.

    :param memo: The memo dictionary of the deepcopy protocol; it is
      passed through to the copies made of each descendant.
    :param recursive: When False, only the tag itself is copied and the
      copy has no contents.
    """
    duplicate = self.copy_self()
    if not recursive:
        return duplicate

    # Walk the descendants as a flat event stream, so the copy is built
    # without any recursive function calls. The top of the stack is the
    # copied Tag currently receiving children.
    open_tags: List[Tag] = [duplicate]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # This Tag is closed; later elements belong to its parent.
            open_tags.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        open_tags[-1].append(copied)
        if event is Tag.START_ELEMENT_EVENT:
            # A Tag was opened: its children get appended to its copy.
            open_tags.append(cast(Tag, copied))
    return duplicate

1978

def copy_self(self) -> Self:
    """Build an empty, unattached twin of this Tag.

    The new object has the same class, name, namespace data, attributes
    and settings as this one, but no contents and no place in a parse
    tree. Deep copying starts with this step; it is also useful on its
    own for copying a Tag without its contents.
    """
    tag_class = type(self)
    duplicate = tag_class(
        None,
        None,
        self.name,
        self.namespace,
        self.prefix,
        self.attrs,
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    # Carry these two over explicitly, whatever the constructor chose.
    duplicate.can_be_empty_element = self.can_be_empty_element
    duplicate.hidden = self.hidden
    return duplicate

2006

@property
def is_empty_element(self) -> bool:
    """Whether this tag is an empty-element (self-closing) tag.

    A tag with contents is never an empty-element tag.

    For a tag without contents the answer depends on the `TreeBuilder`
    that created it. A builder with a designated list of empty-element
    tags (the usual case for HTML) only accepts tags named in that list.
    A builder without such a list (the usual case for XML) treats every
    tag with no contents as an empty-element tag.
    """
    if len(self.contents) != 0:
        return False
    # Only an explicit True counts; None and other values do not.
    return self.can_be_empty_element is True

2025

@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    ": :meta private:"
    # Deprecated spelling: the answer comes from is_empty_element.
    answer = self.is_empty_element
    return answer

2030

@property
def string(self) -> Optional[str]:
    """The single string inside this `Tag`, when there is exactly one.

    :return: The only child, if it is a `NavigableString`. If the only
       child is a `Tag`, that child's own `Tag.string`, recursively.
       ``None`` when this `Tag` has no children or more than one.

    An unexpected ``None`` usually means the `Tag` contains more than
    one thing.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    if isinstance(only_child, NavigableString):
        return only_child
    if isinstance(only_child, Tag):
        return only_child.string
    return None

@string.setter
def string(self, string: str) -> None:
    """Replace the `Tag.contents` of this `Tag` with a single string."""
    self.clear()
    # A NavigableString subclass instance keeps its own class; any other
    # string is wrapped in a plain NavigableString.
    string_class = (
        string.__class__ if isinstance(string, NavigableString) else NavigableString
    )
    self.append(string_class(string))

2064

2065 #: :meta private:

2066 MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}

2067

def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Yield the strings beneath this tag that belong to certain classes.

    :param strip: If True, every string is stripped before it is
       yielded, and strings that strip down to nothing are skipped.

    :param types: The NavigableString subclasses to accept, either a
       single class or a collection of classes. Strings whose exact
       type is not accepted are ignored. By default the classes in
       self.interesting_string_types are used; if that is None, only
       NavigableString and CData objects are considered. That means no
       comments, processing instructions, etc.
    """
    if types is self.default:
        types = self.interesting_string_types
        if types is None:
            types = self.MAIN_CONTENT_STRING_TYPES

    # `types` no longer changes, so classify it once for the whole walk.
    single_type = isinstance(types, type)

    for node in self.descendants:
        if not isinstance(node, NavigableString):
            continue
        node_type = type(node)
        if single_type:
            if node_type is not types:
                # Not the one string class we were asked for.
                continue
        elif types is not None and node_type not in types:
            # Not one of the string classes we were asked for.
            continue

        if not strip:
            yield node
            continue
        text = node.strip()
        if text:
            yield text

strings = property(_all_strings)

2110

def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as children of this `Tag`.

    Similar to :py:meth:`list.insert`, but several elements can be
    inserted in one call.

    :param position: The numeric position that the first new
        `PageElement` should occupy in this Tag's `Tag.children`. Each
        further element is requested one position later.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    added: List[PageElement] = []
    for offset, child in enumerate(new_children):
        added.extend(self._insert(position + offset, child))
    return added

2129

def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element into this tag's contents at ``position``.

    Rewires the sibling links (``previous_sibling``/``next_sibling``)
    and the document-order links (``previous_element``/``next_element``)
    around the new child before adding it to ``self.contents``.

    :param position: Requested index; capped at ``len(self.contents)``.
    :param new_child: The element to insert. A plain ``str`` is wrapped
        in a `NavigableString`; a `BeautifulSoup` object is replaced by
        its children.
    :return: A list of the inserted elements: normally ``[new_child]``,
        or whatever `insert` returns for a `BeautifulSoup` object's
        children.
    :raise ValueError: If ``new_child`` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    # NOTE(review): imported inside the function, presumably to avoid a
    # circular import with the bs4 package -- confirm.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the BeautifulSoup's children and
        # return them.
        return self.insert(position, *list(new_child.contents))
    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
            elif current_index == position:
                # We're 'inserting' an element into its current location.
                # This is a no-op.
                return [new_child]
        # Detach the element from wherever it currently lives.
        new_child.extract()

    # Link the new child to whatever precedes it.
    new_child.parent = self
    previous_child = None
    if position == 0:
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(
        is_initialized=False, accept_self=True
    )
    # new_childs_last_element can't be None because we passed
    # accept_self=True into _last_descendant. Worst case,
    # new_childs_last_element will be new_child itself. Making
    # this cast removes several mypy complaints later on as we
    # manipulate new_childs_last_element.
    new_childs_last_element = cast(PageElement, new_childs_last_element)

    # Link the new child (and its last descendant) to whatever follows.
    if position >= len(self.contents):
        new_child.next_sibling = None

        # Walk up through the ancestors until one of them has a next
        # sibling; that sibling is the next element in document order.
        parent: Optional[Tag] = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = (
            new_childs_last_element
        )
    self.contents.insert(position, new_child)

    return [new_child]

2216

def unwrap(self) -> Self:
    """Replace this `PageElement` with its contents.

    The element is extracted from its parent, and its children are
    inserted at the position it occupied.

    :return: This object, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = container.index(self)
    self.extract(_self_index=slot)
    # Insert back to front at one fixed index: each child lands ahead of
    # the previously inserted one, so the original order is restored.
    for child in self.contents[::-1]:
        container.insert(slot, child)
    return self

replace_with_children = unwrap

2235

@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    ": :meta private:"
    # Deprecated spelling: the work is done by unwrap().
    unwrapped = self.unwrap()
    return unwrapped

2240

def append(self, tag: _InsertableElement) -> PageElement:
    """Add the given `PageElement` at the end of this `Tag`'s contents.

    :param tag: A PageElement.

    :return: The newly appended PageElement.
    """
    end = len(self.contents)
    inserted = self.insert(end, tag)
    return inserted[0]

2250

def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`.

    :return: The list of PageElements that were appended.
    :raise TypeError: If ``tags`` is neither a `Tag`, a single
        `PageElement` or string, nor an iterable.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        # Copy the list: appending moves the children out of `tags`.
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        tag_list = list(tags)
    else:
        # Previously this case fell through with tag_list unassigned and
        # failed below with an UnboundLocalError.
        raise TypeError(
            "Tag.extend expects a Tag or an iterable of elements, not %s."
            % type(tags).__name__
        )

    return [self.append(tag) for tag in tag_list]

2287

def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    :param decompose: If False (the default), `PageElement.extract` is
        called on each child. If True, `PageElement.decompose` (a more
        destructive method) is called instead.
    """
    removal_method = "decompose" if decompose else "extract"
    # Iterate over a copy: removing a child changes self.contents.
    for child in list(self.contents):
        getattr(child, removal_method)()

2301

def smooth(self) -> None:
    """Consolidate runs of consecutive strings among this `Tag`'s
    children, recursing into child tags.

    After many tree-modifying operations, calling this can make
    pretty-printed output look more natural.
    """
    # Record the first index of every adjacent pair that must be merged.
    # This avoids copying self.contents, since in most cases very few
    # strings are affected.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Smooth the child's own contents first.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last child has no right-hand neighbour to merge with.
            continue
        following = self.contents[position + 1]
        mergeable = all(
            isinstance(node, NavigableString)
            and not isinstance(node, PreformattedString)
            for node in (current, following)
        )
        if mergeable:
            merge_points.append(position)

    # Work from the end so that removing items from .contents leaves the
    # remaining recorded positions valid.
    for position in reversed(merge_points):
        first = cast(NavigableString, self.contents[position])
        second = cast(NavigableString, self.contents[position + 1])
        second.extract()
        first.replace_with(NavigableString(first + second))

2341

def index(self, element: PageElement) -> int:
    """Find the index of a child of this `Tag` (by identity, not value).

    Comparing by identity avoids trouble when a `Tag` contains two
    children that are equal as strings.

    :param element: Look for this `PageElement` in this object's contents.
    :raise ValueError: If ``element`` is not one of this tag's children.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None,
    )
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position

2354

def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Look up the 'key' attribute of this tag, falling back to
    'default' when the tag does not have that attribute.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    """
    attributes = self.attrs
    return attributes.get(key, default)

2367

def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Like get(), but the result is always a (possibly empty) list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: A list of strings, usually empty or containing only a single
        value.
    """
    found = self.get(key, default)
    if found is None:
        return self.attribute_value_list_class()
    if isinstance(found, list):
        # Already a list: hand it back as it is.
        return found
    if not isinstance(found, str):
        found = cast(str, found)
    return self.attribute_value_list_class([found])

2390

def has_attr(self, key: str) -> bool:
    """Report whether this `Tag` has an attribute named ``key``."""
    attributes = self.attrs
    return key in attributes

2394

def __hash__(self) -> int:
    """Hash this tag by its string form."""
    return hash(str(self))

2397

def __getitem__(self, key: str) -> _AttributeValue:
    """tag[key] gives the value of the 'key' attribute of this Tag; the
    lookup raises an exception when the attribute is missing."""
    attributes = self.attrs
    return attributes[key]

2402

def __iter__(self) -> Iterator[PageElement]:
    "Iterating over a Tag walks through its direct contents."
    children = self.contents
    return iter(children)

2406

def __len__(self) -> int:
    "A Tag is as long as its list of contents."
    children = self.contents
    return len(children)

2410

def __contains__(self, x: Any) -> bool:
    "``x in tag`` checks this Tag's direct contents."
    children = self.contents
    return x in children

2413

def __bool__(self) -> bool:
    "A tag is non-None even if it has no contents."
    # Always truthy. __len__ reports the number of children, so without
    # this override a tag with no contents would evaluate as False.
    return True

2417

def __setitem__(self, key: str, value: _AttributeValue) -> None:
    """Assigning to tag[key] stores ``value`` as the 'key' attribute of
    the tag."""
    attributes = self.attrs
    attributes[key] = value

2422

def __delitem__(self, key: str) -> None:
    "Deleting tag[key] removes the 'key' attribute; a missing key is ignored."
    attributes = self.attrs
    attributes.pop(key, None)

2426

@overload
def __call__(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def __call__(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def __call__(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Calling a Tag like a function delegates to its find_all()
    method. Eg. tag('a') returns a list of all the A tags found within
    this tag."""
    if string is None:
        # No string criterion: this is a search for tags.
        found_tags: ResultSet[Tag] = self.find_all(
            name, attrs, recursive, None, limit, _stacklevel, **kwargs
        )
        return found_tags

    if name is not None or attrs is not None or kwargs:
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        return cast(ResultSet[Tag], self.find_all(name, attrs, recursive, string, limit, _stacklevel, **kwargs))  # type: ignore

    # Only a string criterion: this is a search for strings.
    found_strings: ResultSet[NavigableString] = self.find_all(
        None, None, recursive, string, limit, _stacklevel, **kwargs
    )
    return found_strings

2484

def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Accessing tag.subtag is shorthand for tag.find(name="subtag").

    :raise AttributeError: For names starting with a double underscore
        and for ``contents``.
    """
    if len(subtag) > 3 and subtag.endswith("Tag"):
        # BS3: soup.aTag -> "soup.find("a")
        tag_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % dict(name=tag_name),
            DeprecationWarning,
            stacklevel=2,
        )
        return self.find(tag_name)

    # "contents" is excluded as well, to avoid recursion.
    if subtag.startswith("__") or subtag == "contents":
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, subtag)
        )
    return self.find(subtag)

2507

def __eq__(self, other: Any) -> bool:
    """Two Tags are equal iff they have the same name, the same
    attributes, and the same contents (recursively)."""
    if self is other:
        return True
    if not isinstance(other, Tag):
        return False
    if any(not hasattr(other, field) for field in ("name", "attrs", "contents")):
        return False
    if (
        self.name != other.name
        or self.attrs != other.attrs
        or len(self) != len(other)
    ):
        return False
    # Same number of children: compare them pairwise, in order.
    return not any(
        mine != other.contents[position]
        for position, mine in enumerate(self.contents)
    )

2528

2529 def __ne__(self, other: Any) -> bool:

2530 """Returns true iff this Tag is not identical to `other`,

2531 as defined in __eq__."""

2532 return not self == other

2533

2534 def __repr__(self) -> str:

2535 """Renders this `Tag` as a string."""

2536 return self.decode()

2537

2538 __str__ = __unicode__ = __repr__

2539

def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    :param encoding: The encoding used to turn the rendered text into
        bytes. It is also handed to `decode`, so it may affect encoding
        declarations inside the document.
    :param indent_level: Each line of the rendering will be indented
        this many levels. (The ``formatter`` decides what a 'level'
        means, in terms of spaces or other characters output.) Used
        internally while pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming
        one of the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. Passed straight to :py:meth:`str.encode`;
        it should be one of the `error handling constants defined by
        Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    # Build the Unicode rendering first, then encode that text.
    return self.decode(indent_level, encoding, formatter).encode(encoding, errors)

2569

def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    :param indent_level: Each line of the rendering will be indented
        this many levels. (The ``formatter`` decides what a 'level'
        means, in terms of spaces or other characters output.) ``None``
        turns off the pretty-printing whitespace; ``True`` is treated
        as level 0.
    :param eventual_encoding: The encoding you intend to use when
        converting the string to a bytestring. decode() is *not*
        responsible for performing that encoding. This information is
        needed so that a real encoding can be substituted in if the
        document contains an encoding declaration (e.g. in a <meta>
        tag).
    :param formatter: Either a `Formatter` object, or a string naming
        one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and you
        probably won't need to use it.
    """
    if indent_level is True:
        indent_level = 0

    # Resolve a formatter name once, up front, so the lookup is not
    # repeated for every element.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    rendered = []

    # The tag that switched on "string literal mode", if any. While
    # it is open, nothing inside it is pretty-printed. The mode starts
    # right after that tag opens and ends right before it closes, so
    # whitespace may still surround the tag itself.
    literal_owner = None

    for event, element in self._event_stream(iterator):
        if event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            if indent_level is not None:
                indent_level -= 1
        elif event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Default: whitespace on both sides outside string literal
        # mode, none at all inside it. Tags such as <pre> and <script>
        # can't be prettified, since adding whitespace would change
        # the meaning of their content.
        indent_before = indent_after = not literal_owner

        if event is Tag.START_ELEMENT_EVENT:
            if not literal_owner and not cast(Tag, element)._should_pretty_print():
                # Entering string literal mode: whitespace before
                # this tag but not after it. The mode lasts until
                # this same tag is closed.
                indent_before = True
                indent_after = False
                literal_owner = element
        elif event is Tag.END_ELEMENT_EVENT and element is literal_owner:
            # Leaving string literal mode: whitespace after the
            # closing tag but not before it.
            indent_before = False
            indent_after = True
            literal_owner = None

        if indent_level is not None:
            if indent_before or indent_after:
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1

        rendered.append(piece)

    return "".join(rendered)

2678

class _TreeTraversalEvent:
    """Marker type for the events produced while walking a parse tree.

    Instances carry no data of their own; each one simply stands for a
    kind of event.

    :meta private:
    """

# One sentinel per kind of event yielded by _event_stream.
START_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:

2691

def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
        the tree. Defaults to ``self.self_and_descendants``.
    :return: ``(event, element)`` pairs, where ``event`` is one of
        the ``Tag.*_EVENT`` sentinels. Every START event is matched
        by a later END event for the same tag; EMPTY and STRING
        events stand alone.
    """
    # Tags that have been opened but not yet closed, innermost last.
    tag_stack: List[Tag] = []

    iterator = iterator or self.self_and_descendants

    for c in iterator:
        # If the parent of the element we're about to yield is not
        # the tag currently on the stack, it means that the tag on
        # the stack closed before this element appeared.
        #
        # This is an identity check on purpose: the question is
        # whether the parent is that very object. Using != here
        # would run Tag.__eq__, a recursive structural comparison
        # of names, attributes and contents, which is slower and
        # undermines the goal of avoiding recursive calls.
        while tag_stack and c.parent is not tag_stack[-1]:
            now_closed_tag = tag_stack.pop()
            yield Tag.END_ELEMENT_EVENT, now_closed_tag

        if not isinstance(c, Tag):
            yield Tag.STRING_ELEMENT_EVENT, c
        elif c.is_empty_element:
            yield Tag.EMPTY_ELEMENT_EVENT, c
        else:
            yield Tag.START_ELEMENT_EVENT, c
            tag_stack.append(c)

    # The traversal is over; close whatever is still open.
    while tag_stack:
        now_closed_tag = tag_stack.pop()
        yield Tag.END_ELEMENT_EVENT, now_closed_tag

2735

2736 def _indent_string(

2737 self,

2738 s: str,

2739 indent_level: int,

2740 formatter: Formatter,

2741 indent_before: bool,

2742 indent_after: bool,

2743 ) -> str:

2744 """Add indentation whitespace before and/or after a string.

2745

2746 :param s: The string to amend with whitespace.

2747 :param indent_level: The indentation level; affects how much

2748 whitespace goes before the string.

2749 :param indent_before: Whether or not to add whitespace

2750 before the string.

2751 :param indent_after: Whether or not to add whitespace

2752 (a newline) after the string.

2753 """

2754 space_before = ""

2755 if indent_before and indent_level:

2756 space_before = formatter.indent * indent_level

2757

2758 space_after = ""

2759 if indent_after:

2760 space_after = "\n"

2761

2762 return space_before + s + space_after

2763

2764 def _format_tag(

2765 self, eventual_encoding: str, formatter: Formatter, opening: bool

2766 ) -> str:

2767 if self.hidden:

2768 # A hidden tag is invisible, although its contents

2769 # are visible.

2770 return ""

2771

2772 # A tag starts with the < character (see below).

2773

2774 # Then the / character, if this is a closing tag.

2775 closing_slash = ""

2776 if not opening:

2777 closing_slash = "/"

2778

2779 # Then an optional namespace prefix.

2780 prefix = ""

2781 if self.prefix:

2782 prefix = self.prefix + ":"

2783

2784 # Then a list of attribute values, if this is an opening tag.

2785 attribute_string = ""

2786 if opening:

2787 attributes = formatter.attributes(self)

2788 attrs = []

2789 for key, val in attributes:

2790 if val is None:

2791 decoded = key

2792 else:

2793 if isinstance(val, list) or isinstance(val, tuple):

2794 val = " ".join(val)

2795 elif not isinstance(val, str):

2796 val = str(val)

2797 elif (

2798 isinstance(val, AttributeValueWithCharsetSubstitution)

2799 and eventual_encoding is not None

2800 ):

2801 val = val.substitute_encoding(eventual_encoding)

2802

2803 text = formatter.attribute_value(val)

2804 decoded = str(key) + "=" + formatter.quoted_attribute_value(text)

2805 attrs.append(decoded)

2806 if attrs:

2807 attribute_string = " " + " ".join(attrs)

2808

2809 # Then an optional closing slash (for a void element in an

2810 # XML document).

2811 void_element_closing_slash = ""

2812 if self.is_empty_element:

2813 void_element_closing_slash = formatter.void_element_close_prefix or ""

2814

2815 # Put it all together.

2816 return (

2817 "<"

2818 + closing_slash

2819 + prefix

2820 + self.name

2821 + attribute_string

2822 + void_element_closing_slash

2823 + ">"

2824 )

2825

2826 def _should_pretty_print(self, indent_level: int = 1) -> bool:

2827 """Should this tag be pretty-printed?

2828

2829 Most of them should, but some (such as <pre> in HTML

2830 documents) should not.

2831 """

2832 return indent_level is not None and (

2833 not self.preserve_whitespace_tags

2834 or self.name not in self.preserve_whitespace_tags

2835 )

2836

@overload
def prettify(
    self,
    encoding: None = None,
    formatter: _FormatterOrName = "minimal",
) -> str:
    ...

@overload
def prettify(
    self,
    encoding: _Encoding,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    ...

def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Pretty-print this `Tag` as a string or bytestring.

    Rendering starts at indent level 0.

    :param encoding: The encoding of the bytestring, or None if you
        want Unicode.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string (if no ``encoding`` is provided) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)

2870

def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render the contents of this tag, but not the tag itself, as a
    Unicode string.

    :param indent_level: Each line of the rendering will be indented
        this many levels. (The formatter decides what a 'level' means
        in terms of spaces or other characters output.) Used
        internally while pretty-printing.

    :param eventual_encoding: The tag is destined to be encoded into
        this encoding. decode_contents() is *not* responsible for
        performing that encoding. This information is needed so that
        a real encoding can be substituted in if the document
        contains an encoding declaration (e.g. in a <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    """
    # Walking only the descendants leaves this tag's own markup out.
    contents_only = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=contents_only
    )

2898

def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be indented
        this many levels. (The ``formatter`` decides what a 'level'
        means, in terms of spaces or other characters output.) Used
        internally while pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: Either a `Formatter` object, or a string naming
        one of the standard formatters.
    """
    # Unlike encode(), no error handler is passed to str.encode()
    # here, so its default handling applies.
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)

2918

@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated method for BS3 compatibility; forwards to
    `encode_contents`.

    ``indentLevel`` is only honored when ``prettyPrint`` is true.

    :meta private:
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_indent, encoding=encoding)

2933

2934 # Soup methods

2935

@overload
def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

@overload
def find(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
) -> _AtMostOneNavigableString:
    ...

def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    :return: The first match, or None if nothing matches.
    """
    # Every branch delegates to find_all() with a limit of 1 and
    # returns the first hit, if any.
    mixed_query = string is not None and (
        name is not None or attrs is not None or kwargs
    )
    if mixed_query:
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        matches = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs)  # type:ignore
        return cast(Tag, matches[0]) if matches else None

    if string is None:
        # Searching for tags.
        matches = self.find_all(name, attrs, recursive, None, 1, _stacklevel=3, **kwargs)
        return cast(Tag, matches[0]) if matches else None

    # Searching for strings.
    matches = self.find_all(None, None, recursive, string, 1, _stacklevel=3, **kwargs)
    return cast(NavigableString, matches[0]) if matches else None

2995

# Deprecated spelling of find(); the last argument is presumably the
# version in which the alias was deprecated.
findChild = _deprecated_function_alias(
    "findChild", "find", "3.0.0"
)

2997

@overload
def find_all(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def find_all(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def find_all(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Look in the children of this `PageElement` and find all
    `PageElement` objects that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on strings. When given on its own, the
        search is for strings rather than tags.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    if recursive:
        generator = self.descendants
    else:
        generator = self.children

    # One more frame (this method) now sits between the caller and
    # whatever ends up issuing a warning.
    _stacklevel += 1

    wants_strings = string is not None
    has_tag_criteria = name is not None or attrs is not None or bool(kwargs)

    if wants_strings and has_tag_criteria:
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        return cast(
            ResultSet[Tag],
            self._find_all(
                name, attrs, string, limit, generator,
                _stacklevel=_stacklevel, **kwargs
            ),
        )

    if not wants_strings:
        # No string filter: we're searching for tags.
        return cast(
            ResultSet[Tag],
            self._find_all(
                name, attrs, None, limit, generator,
                _stacklevel=_stacklevel, **kwargs
            ),
        )

    # Only a string filter: we're searching for strings.
    return cast(
        ResultSet[NavigableString],
        self._find_all(
            None, None, string, limit, generator,
            _stacklevel=_stacklevel, **kwargs
        ),
    )

3073

# Deprecated spellings of find_all(); the last argument is presumably
# the version in which each alias was deprecated.
findAll = _deprecated_function_alias(
    "findAll", "find_all", "4.0.0"
)
findChildren = _deprecated_function_alias(
    "findChildren", "find_all", "3.0.0"
)

3076

3077 # Generator methods

@property
def children(self) -> Iterator[PageElement]:
    """Iterate over the direct children of this `PageElement`, i.e.
    the items of ``self.contents``, in order."""
    return (child for child in self.contents)

3082

@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` together with everything beneath it.

    The result is whatever ``_self_and`` builds from the
    `descendants` iterator.
    """
    below = self.descendants
    return self._self_and(below)

3089

@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`, in document order.

    The walk starts at the first child and follows the
    ``next_element`` chain until it moves past this tag's last
    descendant. This is not a breadth-first walk: `_event_stream`
    relies on a tag's children arriving before the tag's next sibling.

    Yields nothing for a tag with no contents.
    """
    if not self.contents:
        return
    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, last_descendant will end up
    # as self.
    last_descendant = cast(PageElement, self._last_descendant(accept_self=True))
    # The first element that is no longer part of this subtree.
    stop_node = last_descendant.next_element
    current: _AtMostOneElement = self.contents[0]
    while current is not stop_node and current is not None:
        # The successor is looked up before yielding; presumably so
        # the walk continues correctly even if the consumer modifies
        # `current` while this generator is suspended.
        successor = current.next_element
        yield current
        current = successor

3107

3108 # CSS selector code

def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Perform a CSS selection operation on the current element.

    This is a shortcut for ``self.css.select_one(...)``.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
        soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)

3125

def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library; it is a shortcut for
    ``self.css.select(...)``.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)

3150

@property
def css(self) -> CSS:
    """Return an interface to the CSS selector API.

    A new `CSS` object wrapping this element is created on every
    access.
    """
    selector_api = CSS(self)
    return selector_api

3155

3156 # Old names for backwards compatibility

@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns ``self.children``.

    :meta private:
    """
    direct_children = self.children
    return direct_children

3164

@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns ``self.descendants``.

    :meta private:
    """
    everything_below = self.descendants
    return everything_below

3172

@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated method; forwards to `has_attr`.

    The old name was kind of misleading: has_key() looked at
    attributes, while the ``in`` operator looked at contents.
    has_key() is gone in Python 3, anyway.

    :param key: The attribute name to look for.

    :meta private:
    """
    present = self.has_attr(key)
    return present

3183

3184

3185_PageElementT = TypeVar("_PageElementT", bound=PageElement)

3186

class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects gathered as the result of
    matching an :py:class:`ElementFilter` against a parse tree.
    Basically, a list of search results that remembers the filter
    that produced it.
    """

    # The filter that was used to gather these results, if any.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Build the list from ``result`` and record ``source``.

        :param source: The filter that produced the results.
        :param result: The matching elements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        :raises AttributeError: Always.
        """
        message = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(message)

3206

3207# Now that all the classes used by SoupStrainer have been defined,

3208# import SoupStrainer itself into this module to preserve the

3209# backwards compatibility of anyone who imports

3210# bs4.element.SoupStrainer.

3211from bs4.filter import SoupStrainer # noqa: E402