Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 41%

1739 """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_."""

1740

1741 PREFIX: str = "<!--"

1742 SUFFIX: str = "-->"

1743

1744

class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_."""

    # Opening and closing delimiters associated with this string type.
    PREFIX: str = "<?"
    SUFFIX: str = "?>"

1750

1751

class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_."""

    # Opening and closing delimiters associated with this string type.
    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Build a `Doctype` from a root element name and optional IDs.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        :return: A new `Doctype` wrapping the generated string.
        """
        doctype_text = cls._string_for_name_and_ids(name, pub_id, system_id)
        return Doctype(doctype_text)

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Produce the text that a Doctype object is built around.

        Kept apart from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :return: ``name`` (or the empty string), followed by a
            ``PUBLIC`` clause when ``pub_id`` is given, or a ``SYSTEM``
            clause when only ``system_id`` is given.
        """
        doctype_text = name or ""
        if pub_id is None:
            # No public ID: a system ID, if any, stands alone.
            if system_id is not None:
                doctype_text = doctype_text + ' SYSTEM "%s"' % system_id
            return doctype_text

        doctype_text = doctype_text + ' PUBLIC "%s"' % pub_id
        if system_id is not None:
            # The system ID follows the public ID without its own keyword.
            doctype_text = doctype_text + ' "%s"' % system_id
        return doctype_text

1790

1791

class Stylesheet(NavigableString):
    """A `NavigableString` holding the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    Having a dedicated class lets embedded stylesheets be told apart
    from textual content.
    """

1799

1800

class Script(NavigableString):
    """A `NavigableString` holding the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    Having a dedicated class lets executable code be told apart from
    textual content.
    """

1809

1810

class TemplateString(NavigableString):
    """A `NavigableString` for a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    Having a dedicated class lets such strings be told apart from the
    main body of the document.
    """

1818

1819

class RubyTextString(NavigableString):
    """A NavigableString holding the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    Having a dedicated class makes it possible to tell such strings
    apart from the strings they're annotating.
    """

1827

1828

class RubyParenthesisString(NavigableString):
    """A NavigableString holding the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.
    """

1833

1834

1835class Tag(PageElement):

1836 """An HTML or XML tag that is part of a parse tree, along with its

1837 attributes, contents, and relationships to other parts of the tree.

1838

1839 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will

1840 create a `Tag` object representing the ``<b>`` tag. You can

1841 instantiate `Tag` objects directly, but it's not necessary unless

1842 you're adding entirely new markup to a parsed document. Most of

1843 the constructor arguments are intended for use by the `TreeBuilder`

1844 that's parsing a document.

1845

1846 :param parser: A `BeautifulSoup` object representing the parse tree this

1847 `Tag` will be part of.

1848 :param builder: The `TreeBuilder` being used to build the tree.

1849 :param name: The name of the tag.

1850 :param namespace: The URI of this tag's XML namespace, if any.

1851 :param prefix: The prefix for this tag's XML namespace, if any.

1852 :param attrs: A dictionary of attribute values.

1853 :param parent: The `Tag` to use as the parent of this `Tag`. May be

1854 the `BeautifulSoup` object itself.

1855 :param previous: The `PageElement` that was parsed immediately before

1856 parsing this tag.

1857 :param is_xml: If True, this is an XML tag. Otherwise, this is an

1858 HTML tag.

1859 :param sourceline: The line number where this tag was found in its

1860 source document.

1861 :param sourcepos: The character position within ``sourceline`` where this

1862 tag was found.

1863 :param can_be_empty_element: If True, this tag should be

1864 represented as <tag/>. If False, this tag should be represented

1865 as <tag></tag>.

1866 :param cdata_list_attributes: A dictionary of attributes whose values should

1867 be parsed as lists of strings if they ever show up on this tag.

1868 :param preserve_whitespace_tags: Names of tags whose contents

1869 should have their whitespace preserved if they are encountered inside

1870 this tag.

1871 :param interesting_string_types: When iterating over this tag's

1872 string contents in methods like `Tag.strings` or

1873 `PageElement.get_text`, these are the types of strings that are

1874 interesting enough to be considered. By default,

1875 `NavigableString` (normal strings) and `CData` (CDATA

1876 sections) are the only interesting string subtypes.

1877 :param namespaces: A dictionary mapping currently active

1878 namespace prefixes to URIs, as of the point in the parsing process when

1879 this tag was encountered. This can be used later to

1880 construct CSS selectors.

1881

1882 """

1883

def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Set up a new tag; see the class docstring for the parameters.

    :raise ValueError: If no ``name`` is provided.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # The values passed in are stored as-is. This used to sit behind a
    # check of builder.store_line_numbers, but both branches of that
    # check performed the same two assignments, so the check had no
    # effect on the result.
    self.sourceline = sourceline
    self.sourcepos = sourcepos

    # Pick the container classes for attributes: defaults when there
    # is no builder, otherwise whatever the builder prescribes.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    else:
        if builder is not None and builder.cdata_list_attributes:
            self.attrs = builder._replace_cdata_list_attribute_values(
                self.name, attrs
            )
        else:
            self.attrs = attr_dict_class()
            # Make sure that the values of any multi-valued
            # attributes (e.g. when a Tag is copied) are stored in
            # new lists.
            for k, v in attrs.items():
                if isinstance(v, list):
                    v = v.__class__(v)
                self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        self.attribute_value_list_class = builder.attribute_value_list_class
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES

2004

# Type declarations for the instance attributes assigned in __init__.
parser_class: Optional[type[BeautifulSoup]]
name: str
namespace: Optional[str]
prefix: Optional[str]
attrs: _AttributeValues
sourceline: Optional[int]
sourcepos: Optional[int]
known_xml: Optional[bool]
contents: List[PageElement]
hidden: bool
interesting_string_types: Optional[Set[Type[NavigableString]]]

# These three come from the TreeBuilder when one is supplied to
# __init__, and from the constructor arguments otherwise.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]

# Old camelCase spelling of parser_class, kept as a deprecated alias.
#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")

2023

def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """Copy this Tag into a new Tag that is unconnected to the parse tree.

    :param memo: Passed through to each descendant's ``__deepcopy__``.
    :param recursive: If False, only this tag is copied (via
        `copy_self`) and its contents are left out.
    :return: The copy.
    """
    root_copy = self.copy_self()
    if not recursive:
        return root_copy

    # Walk the descendants as a flat stream of events, so that the
    # whole subtree is cloned without recursive function calls. The
    # top of the stack is the clone currently receiving children.
    open_tags: List[Tag] = [root_copy]
    for kind, node in self._event_stream(self.descendants):
        if kind is Tag.END_ELEMENT_EVENT:
            # The tag on top of the stack was just closed; stop
            # appending to it.
            open_tags.pop()
            continue

        node_copy = node.__deepcopy__(memo, recursive=False)
        # Attach the copy to the .contents of its parent's clone.
        open_tags[-1].append(node_copy)

        if kind is Tag.START_ELEMENT_EVENT:
            # This tag may have children; they will be appended to
            # its clone.
            open_tags.append(cast(Tag, node_copy))
    return root_copy

2049

def copy_self(self) -> Self:
    """Make a contentless copy of this Tag, attached to no parse tree.

    The deepcopy process starts with this method, but it can also be
    called on its own to copy a Tag without copying its contents.

    :return: A new object of the same class as this one.
    """
    duplicate = type(self)(
        None,  # parser
        None,  # builder
        self.name,
        self.namespace,
        self.prefix,
        self.attrs,
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    # Carry these two over explicitly, whatever the constructor did
    # with them.
    duplicate.can_be_empty_element = self.can_be_empty_element
    duplicate.hidden = self.hidden
    return duplicate

2077

@property
def is_empty_element(self) -> bool:
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag with contents is never an empty-element tag.

    For a tag without contents, the answer depends on the
    `TreeBuilder` used to create the tag. If the builder has a
    designated list of empty-element tags, only a tag whose name
    appears in that list counts. This is usually the case for HTML
    documents.

    If the builder has no such list, any tag without contents is an
    empty-element tag. This is usually the case for XML documents.
    """
    has_no_children = len(self.contents) == 0
    return has_no_children and self.can_be_empty_element is True

2096

@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    ": :meta private:"
    # Deprecated spelling; defers to the is_empty_element property.
    answer = self.is_empty_element
    return answer

2101

@property
def string(self) -> Optional[str]:
    """Convenience property for the single string within this
    `Tag`, assuming there is exactly one.

    :return: If the only child of this `Tag` is a `NavigableString`,
       that string. If the only child is another `Tag`, that child's
       `Tag.string`, recursively. If this `Tag` has no children, or
       more than one child, ``None``.

     If this property is unexpectedly returning ``None`` for you,
     it's probably because your `Tag` has more than one thing
     inside it.
    """
    if len(self.contents) != 1:
        return None
    only_child = self.contents[0]
    if isinstance(only_child, NavigableString):
        return only_child
    if isinstance(only_child, Tag):
        # Delegate to the lone child tag.
        return only_child.string
    return None

@string.setter
def string(self, string: str) -> None:
    """Replace the `Tag.contents` of this `Tag` with a single string."""
    self.clear()
    # A NavigableString (or subclass) keeps its own class; a plain
    # str is wrapped in NavigableString.
    wrapper = string.__class__ if isinstance(string, NavigableString) else NavigableString
    self.append(wrapper(string))

2135

2136 #: :meta private:

2137 MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}

2138

def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Yield every descendant string of certain classes, optionally stripped.

    :param strip: If True, each string is stripped before being
       yielded, and strings that are empty after stripping are skipped.

    :param types: A tuple of NavigableString subclasses. Strings of
       any other subclass are ignored. By default, the subclasses
       considered are those in self.interesting_string_types. If
       that's not specified, only NavigableString and CData objects
       are considered. That means no comments, processing
       instructions, etc.
    """
    if types is self.default:
        types = self.interesting_string_types
        if types is None:
            types = self.MAIN_CONTENT_STRING_TYPES

    # `types` may be a single class rather than a collection.
    wants_single_class = isinstance(types, type)

    for node in self.descendants:
        if not isinstance(node, NavigableString):
            continue
        node_class = type(node)
        if wants_single_class:
            if node_class is not types:
                # Not a string type the caller asked for.
                continue
        elif types is not None and node_class not in types:
            # Not a string type the caller asked for.
            continue

        if not strip:
            yield node
            continue
        text = node.strip()
        if len(text) == 0:
            continue
        yield text

strings = property(_all_strings)

2181

def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once.

    :param position: The numeric position that should be occupied
       in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    for new_child in new_children:
        just_inserted = self._insert(position, new_child)
        inserted.extend(just_inserted)
        # One argument can expand into several elements (a
        # BeautifulSoup object contributes all of its children), so
        # step past everything that was actually inserted. Advancing
        # by a fixed 1 would put the next argument in the middle of
        # that run.
        position += len(just_inserted)
    return inserted

2200

def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element into this tag's contents at ``position``.

    Rewires the parent, sibling and next/previous-element links of
    the new child and its neighbours, then adds the child to
    ``self.contents``.

    :param position: Target index in ``self.contents``; values past
        the end are clamped to the end.
    :param new_child: The element to insert. A plain ``str`` is
        wrapped in a `NavigableString`; a `BeautifulSoup` object is
        replaced by its children.
    :return: A list of the inserted element(s).
    :raise ValueError: If ``new_child`` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    # Imported inside the method rather than at module level.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the BeautifulSoup's children and
        # return them.
        return self.insert(position, *list(new_child.contents))
    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
            elif current_index == position:
                # We're 'inserting' an element into its current location.
                # This is a no-op.
                return [new_child]
        # Detach the element from wherever it currently lives.
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: no previous sibling, and the element just
        # before it in document order is this tag.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(
        is_initialized=False, accept_self=True
    )
    # new_childs_last_element can't be None because we passed
    # accept_self=True into _last_descendant. Worst case,
    # new_childs_last_element will be new_child itself. Making
    # this cast removes several mypy complaints later on as we
    # manipulate new_childs_last_element.
    new_childs_last_element = cast(PageElement, new_childs_last_element)

    if position >= len(self.contents):
        # Appending at the end: there is no next sibling.
        new_child.next_sibling = None

        # Walk up the ancestors until one of them has a next sibling.
        parent: Optional[Tag] = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # Inserting before an existing child.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = (
            new_childs_last_element
        )
    self.contents.insert(position, new_child)

    return [new_child]

2287

def unwrap(self) -> Self:
    """Replace this `PageElement` with its contents.

    :return: This object, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = container.index(self)
    self.extract(_self_index=slot)
    # Insert the children back to front at the same index, so they
    # end up in their original order. The slice is a copy of
    # .contents, taken once before any child is moved.
    for child in self.contents[::-1]:
        container.insert(slot, child)
    return self

replace_with_children = unwrap

2306

@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    ": :meta private:"
    # Deprecated spelling; defers to unwrap().
    unwrapped = self.unwrap()
    return unwrapped

2311

def append(self, tag: _InsertableElement) -> PageElement|List[PageElement]:
    """Add the given `PageElement` to the end of this `Tag`'s contents.

    :param tag: A PageElement. If this is another BeautifulSoup
        object, all of its contents will be inserted into this
        `Tag`, since one BeautifulSoup object can't contain another
        one.

    :return: The object that was just appended, or (if `tag` was a BeautifulSoup
        object) all such objects.
    """
    appended = self.insert(len(self.contents), tag)
    # TODO: can't reference BeautifulSoup class in this module, so a
    # BeautifulSoup object is recognized by its tag name instead.
    came_from_document = isinstance(tag, Tag) and tag.name == "[document]"
    if came_from_document:
        return appended
    return appended[0]

2328

def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`.

    :return: The list of PageElements that were appended.
    :raise TypeError: If ``tags`` is not a `Tag`, a `PageElement`, a
        string, or an iterable.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        # Copy the list: appending moves each child out of tags.contents.
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        tag_list = list(tags)
    else:
        # Without this branch tag_list would be left unbound and the
        # loop below would fail with a confusing UnboundLocalError.
        raise TypeError(
            "Tag.extend expects a Tag or an iterable of elements, not %s."
            % type(tags).__name__
        )

    results: List[PageElement] = []
    for tag in tag_list:
        appended = self.append(tag)
        if isinstance(appended, list):
            # This can happen if you pass in a mixture of Tag and BeautifulSoup objects.
            results.extend(appended)
        else:
            results.append(appended)

    return results

2370

def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag` by calling
       `PageElement.extract` on each one.

    :param decompose: If this is True, `PageElement.decompose` (a
        more destructive method) will be called instead of
        `PageElement.extract`.
    """
    # Work on a snapshot: removing a child may mutate self.contents.
    snapshot = self.contents[:]
    if decompose:
        for child in snapshot:
            child.decompose()
    else:
        for child in snapshot:
            child.extract()

2384

def smooth(self) -> None:
    """Consolidate runs of consecutive strings among this `Tag`'s
    children, recursing into child tags.

    If you perform a lot of operations that modify the tree,
    calling this method afterwards can make pretty-printed output
    look more natural.
    """
    # Record the index of the first member of every adjacent pair
    # that needs merging, instead of copying self.contents: in most
    # cases very few strings will be affected.
    merge_points = []
    for idx, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if idx == len(self.contents) - 1:
            # Last item in .contents: there is nothing after it to
            # merge with.
            continue
        following = self.contents[idx + 1]
        both_mergeable = all(
            isinstance(item, NavigableString)
            and not isinstance(item, PreformattedString)
            for item in (current, following)
        )
        if both_mergeable:
            merge_points.append(idx)

    # Process the recorded positions back to front, so that removing
    # items from .contents doesn't shift the positions still to come.
    for idx in reversed(merge_points):
        first = cast(NavigableString, self.contents[idx])
        second = cast(NavigableString, self.contents[idx + 1])
        second.extract()
        merged = NavigableString(first + second)
        first.replace_with(merged)

2424

def index(self, element: PageElement) -> int:
    """Return the position of a child of this `Tag`, matched by
    identity rather than by value.

    Matching by identity avoids issues when a `Tag` contains two
    children that have string equality.

    :param element: Look for this `PageElement` in this object's contents.
    :raise ValueError: If ``element`` is not one of the children.
    """
    found_at = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None,
    )
    if found_at is None:
        raise ValueError("Tag.index: element not in tag")
    return found_at

2437

def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Look up the value of the 'key' attribute on this tag, falling
    back to 'default' when the tag doesn't have that attribute.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    """
    attributes = self.attrs
    return attributes.get(key, default)

2450

def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Like get(), but the result is always a (possibly empty) list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: A list of strings, usually empty or containing only a single
        value.
    """
    found = self.get(key, default)
    if isinstance(found, list):
        # Already a list: hand it back unchanged.
        return found

    make_list = self.attribute_value_list_class
    if found is None:
        return make_list()
    if not isinstance(found, str):
        found = cast(str, found)
    # A single value becomes a one-element list.
    return make_list([found])

2473

def has_attr(self, key: str) -> bool:
    """Report whether this `Tag` has an attribute with the given name."""
    present = key in self.attrs
    return present

2477

def __hash__(self) -> int:
    """Hash this tag by the hash of its string rendering."""
    rendered = str(self)
    return hash(rendered)

2480

def __getitem__(self, key: str) -> _AttributeValue:
    """tag[key] gives the value of the 'key' attribute for the Tag,
    and throws an exception if it's not there."""
    attributes = self.attrs
    return attributes[key]

2485

def __iter__(self) -> Iterator[PageElement]:
    "Iterating over a Tag walks through its contents."
    children = self.contents
    return iter(children)

2489

def __len__(self) -> int:
    "A Tag is as long as its list of contents."
    children = self.contents
    return len(children)

2493

def __contains__(self, x: Any) -> bool:
    "``x in tag`` checks this tag's direct contents."
    children = self.contents
    return x in children

2496

def __bool__(self) -> bool:
    "A tag is always truthy, even when it has no contents."
    # Without this, an empty tag would be falsy because of __len__.
    return True

2500

def __setitem__(self, key: str, value: _AttributeValue) -> None:
    """Assigning to tag[key] sets the value of the 'key' attribute
    for the tag."""
    attributes = self.attrs
    attributes[key] = value

2505

def __delitem__(self, key: str) -> None:
    "Deleting tag[key] removes the 'key' attribute; a missing key is ignored."
    attributes = self.attrs
    attributes.pop(key, None)

2509

2510 # Since Tag.__call__ is effectively the same as PageElement.find_all, see find_all for notes

2511 # on these overloads.

2512

@overload
def __call__(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    *,
    string: _StrainableString,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

@overload
def __call__(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def __call__(
    self,
    name: None,
    attrs: _StrainableAttributes,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

@overload
def __call__(
    self,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

def __call__(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Calling a Tag like a function is shorthand for its
    find_all() method.

    Eg. tag('a') returns a list of all the A tags found within this tag.
    """
    # `recursive` selects the generator that supplies the candidate
    # elements to search.
    candidates = self._generator_for_recursive(recursive)
    return self._find_all(name, attrs, string, limit, candidates, **kwargs)

2577

def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Accessing tag.subtag is shorthand for tag.find(name="subtag")

    :raise AttributeError: For names starting with ``__`` and for
        ``contents``.
    """
    if len(subtag) > 3 and subtag.endswith("Tag"):
        # BS3: soup.aTag -> "soup.find("a")
        tag_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % {"name": tag_name},
            DeprecationWarning,
            stacklevel=2,
        )
        return self.find(tag_name)

    # We special case contents to avoid recursion.
    if subtag.startswith("__") or subtag == "contents":
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, subtag)
        )
    return self.find(subtag)

2600

2601 def __eq__(self, other: Any) -> bool:

2602 """Returns true iff this Tag has the same name, the same attributes,

2603 and the same contents (recursively) as `other`."""

2604 if self is other:

2605 return True

2606 if not isinstance(other, Tag):

2607 return False

2608 if (

2609 not hasattr(other, "name")

2610 or not hasattr(other, "attrs")

2611 or not hasattr(other, "contents")

2612 or self.name != other.name

2613 or self.attrs != other.attrs

2614 or len(self) != len(other)

2615 ):

2616 return False

2617 for i, my_child in enumerate(self.contents):

2618 if my_child != other.contents[i]:

2619 return False

2620 return True

2621

2622 def __ne__(self, other: Any) -> bool:

2623 """Returns true iff this Tag is not identical to `other`,

2624 as defined in __eq__."""

2625 return not self == other

2626

2627 def __repr__(self) -> str:

2628 """Renders this `Tag` as a string."""

2629 return self.decode()

2630

2631 __str__ = __unicode__ = __repr__

2632

def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    The tag is first rendered to a Unicode string by `decode`, and
    that string is then encoded.

    :param encoding: The encoding to use when converting to
        a bytestring. It is also passed to `decode`, so it may affect
        the text of the document, specifically any encoding
        declarations within the document.
    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming
        one of the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        :py:meth:`str.encode` and its value should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    return self.decode(indent_level, encoding, formatter).encode(encoding, errors)

2662

def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    The tree is walked iteratively via `_event_stream`; each event is
    turned into a piece of text, and the pieces are joined at the end.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing. None disables pretty-printing.
    :param eventual_encoding: The encoding you intend to use when
        converting the string to a bytestring. decode() is *not*
        responsible for performing that encoding. This information
        is needed so that a real encoding can be substituted in if
        the document contains an encoding declaration (e.g. in a
        <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
        naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :return: The rendered markup.
    """
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # A literal True is accepted and treated as indent level 0.
    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        # Step 1: render this event as text, with no extra whitespace.
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            # A closing tag is indented one level less than the
            # content it encloses.
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # We are about to enter string literal mode. Add
            # whitespace before this tag, but not after. We
            # will stay in string literal mode until this tag
            # is closed.
            indent_before = True
            indent_after = False
            string_literal_tag = element
        elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # We are about to exit string literal mode by closing
            # the tag that sent us into that mode. Add whitespace
            # after this tag, but not before.
            indent_before = False
            indent_after = True
            string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if indent_before or indent_after:
                # Strings are stripped before being indented; a string
                # that is empty after stripping gets no indentation.
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            # Everything after an opening tag is one level deeper.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)

2771

class _TreeTraversalEvent:
    """Marker type for the events produced while traversing a parse
    tree.

    Instances carry no data.

    :meta private:
    """

# One unique instance per kind of event yielded by _event_stream.
START_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:

2784

def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
        the tree. When None, `self_and_descendants` is used. Each
        element must come after its parent and before anything that
        follows its parent.
    :return: ``(event, element)`` pairs, where ``event`` is one of
        the ``Tag.*_EVENT`` sentinels.
    """
    # Tags that have been opened but not yet closed, innermost last.
    open_tags: List[Tag] = []

    # Only a missing iterator triggers the default; a supplied but
    # falsy iterable (e.g. an empty list) is used as given.
    if iterator is None:
        iterator = self.self_and_descendants

    for node in iterator:
        # If the parent of the element we're about to yield is not
        # the tag currently on the stack, it means that the tag on
        # the stack closed before this element appeared.
        #
        # This is an identity check on purpose: Tag.__ne__ performs a
        # recursive structural comparison, which is both slower and
        # not the question being asked here.
        while open_tags and node.parent is not open_tags[-1]:
            yield Tag.END_ELEMENT_EVENT, open_tags.pop()

        if not isinstance(node, Tag):
            yield Tag.STRING_ELEMENT_EVENT, node
        elif node.is_empty_element:
            # Empty elements never get a matching END event, so they
            # are not pushed onto the stack.
            yield Tag.EMPTY_ELEMENT_EVENT, node
        else:
            yield Tag.START_ELEMENT_EVENT, node
            open_tags.append(node)

    # The input is exhausted: close whatever is still open.
    while open_tags:
        yield Tag.END_ELEMENT_EVENT, open_tags.pop()

2828

2829 def _indent_string(

2830 self,

2831 s: str,

2832 indent_level: int,

2833 formatter: Formatter,

2834 indent_before: bool,

2835 indent_after: bool,

2836 ) -> str:

2837 """Add indentation whitespace before and/or after a string.

2838

2839 :param s: The string to amend with whitespace.

2840 :param indent_level: The indentation level; affects how much

2841 whitespace goes before the string.

2842 :param indent_before: Whether or not to add whitespace

2843 before the string.

2844 :param indent_after: Whether or not to add whitespace

2845 (a newline) after the string.

2846 """

2847 space_before = ""

2848 if indent_before and indent_level:

2849 space_before = formatter.indent * indent_level

2850

2851 space_after = ""

2852 if indent_after:

2853 space_after = "\n"

2854

2855 return space_before + s + space_after

2856

2857 def _format_tag(

2858 self, eventual_encoding: str, formatter: Formatter, opening: bool

2859 ) -> str:

2860 if self.hidden:

2861 # A hidden tag is invisible, although its contents

2862 # are visible.

2863 return ""

2864

2865 # A tag starts with the < character (see below).

2866

2867 # Then the / character, if this is a closing tag.

2868 closing_slash = ""

2869 if not opening:

2870 closing_slash = "/"

2871

2872 # Then an optional namespace prefix.

2873 prefix = ""

2874 if self.prefix:

2875 prefix = self.prefix + ":"

2876

2877 # Then a list of attribute values, if this is an opening tag.

2878 attribute_string = ""

2879 if opening:

2880 attributes = formatter.attributes(self)

2881 attrs = []

2882 for key, val in attributes:

2883 if val is None:

2884 decoded = key

2885 else:

2886 if isinstance(val, list) or isinstance(val, tuple):

2887 val = " ".join(val)

2888 elif not isinstance(val, str):

2889 val = str(val)

2890 elif (

2891 isinstance(val, AttributeValueWithCharsetSubstitution)

2892 and eventual_encoding is not None

2893 ):

2894 val = val.substitute_encoding(eventual_encoding)

2895

2896 text = formatter.attribute_value(val)

2897 decoded = str(key) + "=" + formatter.quoted_attribute_value(text)

2898 attrs.append(decoded)

2899 if attrs:

2900 attribute_string = " " + " ".join(attrs)

2901

2902 # Then an optional closing slash (for a void element in an

2903 # XML document).

2904 void_element_closing_slash = ""

2905 if self.is_empty_element:

2906 void_element_closing_slash = formatter.void_element_close_prefix or ""

2907

2908 # Put it all together.

2909 return (

2910 "<"

2911 + closing_slash

2912 + prefix

2913 + self.name

2914 + attribute_string

2915 + void_element_closing_slash

2916 + ">"

2917 )

2918

2919 def _should_pretty_print(self, indent_level: int = 1) -> bool:

2920 """Should this tag be pretty-printed?

2921

2922 Most of them should, but some (such as <pre> in HTML

2923 documents) should not.

2924 """

2925 return indent_level is not None and (

2926 not self.preserve_whitespace_tags

2927 or self.name not in self.preserve_whitespace_tags

2928 )

2929

@overload
def prettify(
    self,
    encoding: None = None,
    formatter: _FormatterOrName = "minimal",
) -> str:
    ...

@overload
def prettify(
    self,
    encoding: _Encoding,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    ...

def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Pretty-print this `Tag` as a string or bytestring.

    Rendering always starts at indent level 0.

    :param encoding: The encoding of the bytestring, or None if you
        want Unicode.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string (if no ``encoding`` is provided) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)

2963

def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render the contents of this tag, without the tag itself, as a
    Unicode string.

    This is `decode` run over `descendants` instead of the default
    traversal.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is *not*
        responsible for performing that encoding. This information
        is needed so that a real encoding can be substituted in if
        the document contains an encoding declaration (e.g. in a
        <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    """
    contents_only = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=contents_only
    )

2991

def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render the contents of this tag, without the tag itself, as a
    bytestring.

    The contents are rendered by `decode_contents` and the result is
    passed to :py:meth:`str.encode` with no explicit error handler.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: Either a `Formatter` object, or a string naming
        one of the standard formatters.
    """
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)

3011

@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated method for BS3 compatibility.

    Forwards to `encode_contents`; ``indentLevel`` is only honoured
    when ``prettyPrint`` is true.

    :meta private:
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_indent, encoding=encoding)

3026

3027 # Soup methods

3028 #

3029

3030 # People who call these methods in a type-safe environment

3031 # basically want to know whether the call is going to return

3032 # NavigableStrings or Tags. It's always one or the other, never

3033 # both, but spelling it out requires a number of overloads for

3034 # each method.

3035 #

3036 # If I had it to do over again I'd design this API differently (it

3037 # would look more like ElementFilter), but that's life.

3038 #

3039 # The overloads all look for a clue in the input which restricts

3040 # the method to returning either only strings or only tags. Only

3041 # the most common cases are covered.

3042

3043 # e.g. find(string="foo")

3044 # -> string information but no tag information

3045 # -> string

@overload
def find(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    *,
    string: _StrainableString,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
    ...

# e.g. find() -> default behavior -> tag
#      find(attr="value") -> only tags have attrs -> tag
@overload
def find(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

# e.g. find(attrs=dict(attr="value"))
#      -> only tags have attrs
#      -> tag
@overload
def find(
    self,
    name: None,
    attrs: _StrainableAttributes,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

# e.g. find(name="a") -> only tags have names -> tag
#
# The confusing and controversial case of find(name="a", string="foo")
# also hits this overload.
@overload
def find(
    self,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

# Some lesser-used cases are not covered by the overloads. Those
# cases will hit this method directly and return a very general
# type which will need to be cast after the call.
def find(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    :return: The first match, or None if nothing matches.
    """
    # A limit of 1 stops the search at the first match.
    matches = self._find_all(
        name, attrs, string, 1, self._generator_for_recursive(recursive), **kwargs
    )
    return matches[0] if matches else None

3129

# Legacy camelCase name for find(). The alias object is built by
# _deprecated_function_alias (defined elsewhere); presumably it
# forwards to find() and emits a deprecation warning -- confirm there.
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")

3131

3132 # e.g. find_all(string="foo")

3133 # -> string information but no tag information

3134 # -> strings

3135 #

3136 # Also covers unlikely cases like find_all(name=None, string="foo")

3137 #

3138 # "To mark parameters as keyword-only, indicating the parameters

3139 # must be passed by keyword argument, place an * in the arguments

3140 # list just before the first keyword-only parameter."

3141 #

3142 # --https://peps.python.org/pep-0570/#keyword-only-arguments

@overload
def find_all(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    *,
    string: _StrainableString,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

# e.g. find_all() -> default behavior -> tags
#      find_all(attr="value") -> only tags have attrs -> tags
@overload
def find_all(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

# e.g. find_all(attrs=dict(attr="value"))
#      -> only tags have attrs
#      -> tags
@overload
def find_all(
    self,
    name: None,
    attrs: _StrainableAttributes,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

# e.g. find_all(name="a") -> only tags have names -> tags
#
# The confusing and controversial case of find_all(name="a", string="foo")
# also hits this overload.
@overload
def find_all(
    self,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

# Without the clues above, we don't know whether the method will
# return strings or tags. However every common case will trigger one
# of the overloads and give us the clue we need.
def find_all(
    self,
    name: _OptionalFindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings]:
    """Look in the children of this `PageElement` and find all
    `PageElement` objects that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :param limit: Stop looking after finding this many results.
    :kwargs: Additional filters on attribute values.
    """
    search_space = self._generator_for_recursive(recursive)

    if string is None:
        # No string filter: we're searching for tags.
        return cast(ResultSet[Tag], self._find_all(
            name, attrs, None, limit, search_space, **kwargs
        ))

    has_tag_criteria = name is not None or attrs is not None or bool(kwargs)
    if has_tag_criteria:
        # A string filter combined with tag criteria still yields tags.
        #
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        return cast(ResultSet[Tag], self._find_all(
            name, attrs, string, limit, search_space, **kwargs
        ))

    # Only a string filter: we're searching for strings.
    return cast(ResultSet[NavigableString], self._find_all(
        None, None, string, limit, search_space, **kwargs
    ))

3247

# Legacy camelCase names for find_all(). Both are built by
# _deprecated_function_alias (defined elsewhere); presumably each
# forwards to find_all() and emits a deprecation warning -- confirm there.
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")

findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")

3250

3251 # Generator methods

@property
def children(self) -> Iterator[PageElement]:
    """Iterate over all direct children of this `PageElement`.

    :return: A fresh generator over ``self.contents``, in order.
    """
    return (child for child in self.contents)

3256

@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` followed by everything yielded by
    `descendants`.

    The elements come in document order (depth-first), not
    breadth-first: `_event_stream` consumes this iterator and relies
    on each element appearing after its parent and before anything
    that follows its parent.
    """
    return self._self_and(self.descendants)

3263

@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`.

    The walk starts at the first child and follows ``next_element``
    links until it moves past this tag's last descendant. That is
    document order (depth-first), not breadth-first.

    :return: A generator that yields nothing if this tag has no
        contents.
    """
    if not self.contents:
        return
    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, last_descendant will end up
    # as self.
    last_descendant = cast(PageElement, self._last_descendant(accept_self=True))
    # The first element that is no longer part of this subtree.
    stopNode = last_descendant.next_element
    current: _AtMostOneElement = self.contents[0]
    while current is not stopNode and current is not None:
        # Read the successor before yielding, so that the walk
        # continues from the original link whatever the consumer does
        # with `current`.
        successor = current.next_element
        yield current
        current = successor

3281

3282 def _generator_for_recursive(self, recursive:bool) -> Iterator[PageElement]:

3283 """Helper method to process the boolean `recursive` argument

3284 for find* methods.

3285

3286 :return: the appropriate generator

3287 """

3288 if recursive:

3289 return self.descendants

3290 return self.children

3291

3292 # CSS selector code

def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Perform a CSS selection operation on the current element.

    This is a shortcut for ``self.css.select_one(...)``.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
        soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)

3309

def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library, and is a shortcut for
    ``self.css.select(...)``.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)

3334

@property
def css(self) -> CSS:
    """Return an interface to the CSS selector API.

    A new `CSS` object wrapping this element is created on every
    access.
    """
    selector_api = CSS(self)
    return selector_api

3339

3340 # Old names for backwards compatibility

@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns the `children` iterator.

    :meta private:
    """
    direct_children = self.children
    return direct_children

3348

@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns the `descendants` iterator.

    :meta private:
    """
    everything_below = self.descendants
    return everything_below

3356

@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated method; forwards to `has_attr`.

    This was kind of misleading because has_key() looked at
    attributes while the ``in`` operator looks at contents.

    has_key() is gone in Python 3, anyway.

    :meta private:
    """
    attribute_present = self.has_attr(key)
    return attribute_present

3367

3368

# Type variable for the element type held by a ResultSet; restricted to
# PageElement and its subclasses.
_PageElementT = TypeVar("_PageElementT", bound=PageElement)

3370

class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A ResultSet is a list of `PageElement` objects, gathered as the result
    of matching an :py:class:`ElementFilter` against a parse tree. Basically, a list of
    search results.
    """

    # The filter that produced these results, if any.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Build the list from `result` and remember `source`.

        :param source: The filter that produced the results, or None.
        :param result: The matching elements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        Reached only for attributes a list does not have.

        :raises AttributeError: Always.
        """
        explanation = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(explanation)

3390

3391# Now that all the classes used by SoupStrainer have been defined,

3392# import SoupStrainer itself into this module to preserve the

3393# backwards compatibility of anyone who imports

3394# bs4.element.SoupStrainer.

3395from bs4.filter import SoupStrainer # noqa: E402