Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 32%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1060 statements  

1from __future__ import annotations 

2 

3# Use of this source code is governed by the MIT license. 

4__license__ = "MIT" 

5 

6import re 

7import warnings 

8 

9from bs4.css import CSS 

10from bs4._deprecation import ( 

11 _deprecated, 

12 _deprecated_alias, 

13 _deprecated_function_alias, 

14) 

15from bs4.formatter import ( 

16 Formatter, 

17 HTMLFormatter, 

18 XMLFormatter, 

19) 

20from bs4._warnings import AttributeResemblesVariableWarning 

21 

22from typing import ( 

23 Any, 

24 Callable, 

25 Dict, 

26 Generic, 

27 Iterable, 

28 Iterator, 

29 List, 

30 Mapping, 

31 MutableSequence, 

32 Optional, 

33 Pattern, 

34 Set, 

35 TYPE_CHECKING, 

36 Tuple, 

37 Type, 

38 TypeVar, 

39 Union, 

40 cast, 

41 overload, 

42) 

43from typing_extensions import ( 

44 Self, 

45 TypeAlias, 

46) 

47 

48if TYPE_CHECKING: 

49 from bs4 import BeautifulSoup 

50 from bs4.builder import TreeBuilder 

51 from bs4.filter import ElementFilter 

52 from bs4.formatter import ( 

53 _EntitySubstitutionFunction, 

54 _FormatterOrName, 

55 ) 

56 from bs4._typing import ( 

57 _AtMostOneElement, 

58 _AtMostOneTag, 

59 _AtMostOneNavigableString, 

60 _AttributeValue, 

61 _AttributeValues, 

62 _Encoding, 

63 _InsertableElement, 

64 _OneElement, 

65 _QueryResults, 

66 _RawOrProcessedAttributeValues, 

67 _StrainableElement, 

68 _StrainableAttribute, 

69 _StrainableAttributes, 

70 _StrainableString, 

71 _SomeNavigableStrings, 

72 _SomeTags, 

73 ) 

74 

# Either a single NavigableString subclass or an iterable of such classes.
_OneOrMoreStringTypes: TypeAlias = Union[
    Type["NavigableString"],
    Iterable[Type["NavigableString"]],
]

# Type of the ``name`` argument of the find_* methods: a strainable
# element, an ElementFilter, or nothing at all.
_FindMethodName: TypeAlias = Union["_StrainableElement", "ElementFilter", None]

80 

# Deprecated module-level attributes, looked up on demand by the
# module-level ``__getattr__`` function.
# See https://peps.python.org/pep-0562/
#
# Maps each deprecated attribute name to the warning template shown when
# it is accessed; the value itself lives in ``_deprecated_<name>``.
_deprecated_names = {
    "whitespace_re": "The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy.",
}
#: :meta private:
_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+")

88 

89 

def __getattr__(name: str) -> Any:
    """Resolve deprecated module-level attributes on demand (PEP 562).

    :param name: The attribute being looked up on this module.
    :return: The module global called ``_deprecated_<name>``, after a
        `DeprecationWarning` built from ``_deprecated_names[name]`` has
        been issued.
    :raises AttributeError: If ``name`` is not a known deprecated name.
    """
    if name not in _deprecated_names:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    template = _deprecated_names[name]
    # stacklevel=2 attributes the warning to the code that accessed the
    # attribute rather than to this function.
    warnings.warn(template.format(name=name), DeprecationWarning, stacklevel=2)
    return globals()[f"_deprecated_{name}"]

97 

98 

#: Unless you specify otherwise, documents output by Beautiful Soup
#: will be encoded with this encoding.
DEFAULT_OUTPUT_ENCODING: str = "utf-8"

#: A regular expression matching one run of non-whitespace characters;
#: it can be used to split a string on whitespace.
nonwhitespace_re: Pattern[str] = re.compile(r"\S+")

105 

#: Encodings that Python recognizes (so `Tag.encode` could
#: theoretically support them) but that XML and HTML do not recognize
#: (so they should never be named as the encoding of an XML or HTML
#: document).
#:
#: If an XML document is encoded in one of these encodings, no encoding
#: will be mentioned in the XML declaration. If an HTML document is
#: encoded in one of these encodings, and the HTML document has a
#: <meta> tag that mentions an encoding, the encoding will be given as
#: the empty string.
#:
#: Source:
#: Python documentation, `Python Specific Encodings <https://docs.python.org/3/library/codecs.html#python-specific-encodings>`_
PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}

135 

136 

class NamespacedAttribute(str):
    """An attribute name that remembers how it was namespaced.

    For the ``xml:lang`` in ``xml:lang="en"``, the string value is
    ``"xml:lang"`` while `prefix` holds ``"xml"`` and `name` holds
    ``"lang"``. The namespace passed to the constructor is kept in
    `namespace`.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        """Build the string ``prefix:name``, or whichever part is present.

        :param prefix: The namespace prefix, if any.
        :param name: The local name; any falsy value is stored as None.
        :param namespace: Stored on the object unchanged.
        """
        # A missing/empty name denotes the default namespace. Its name
        # "has no value" per https://www.w3.org/TR/xml-names/#defaulting
        name = name or None

        if name is None:
            text = prefix
        elif prefix:
            text = prefix + ":" + name
        else:
            # Not really namespaced.
            text = name

        obj = str.__new__(cls, text)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj

169 

170 

class AttributeValueWithCharsetSubstitution(str):
    """An abstract stand-in for a character encoding named inside an
    HTML ``<meta>`` tag.

    There is one subclass for each place such an encoding can appear:
    the ``charset`` attribute (`CharsetMetaAttributeValue`) and the
    ``content`` attribute (`ContentMetaAttributeValue`).

    This lets Beautiful Soup replace that part of the HTML file with a
    different encoding when outputting a tree as a string.
    """

    # The original, un-encoded value of the attribute.
    #: :meta private:
    original_value: str

    def substitute_encoding(self, eventual_encoding: str) -> str:
        """Return this attribute value rewritten to mention
        ``eventual_encoding``.

        Subclasses do whatever is necessary for their particular portion
        of an HTML document; this base implementation always raises.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError()

193 

194 

class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a ``<meta>`` tag's ``charset``
    attribute.

    When Beautiful Soup parses the markup ``<meta charset="utf8">``, the
    value of the ``charset`` attribute becomes one of these objects.

    If the document is later encoded to an encoding other than UTF-8,
    its ``<meta>`` tag will mention the new encoding instead of ``utf8``.
    """

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering ``original_value``."""
        instance = str.__new__(cls, original_value)
        # Nothing here uses the original value, but it might be useful
        # for the user to know.
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the name to show as the ``charset`` when the document
        is encoded as ``eventual_encoding``.

        :return: ``eventual_encoding`` itself, or the empty string when
            it is one of `PYTHON_SPECIFIC_ENCODINGS`.
        """
        return "" if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS else eventual_encoding

221 

222 

class AttributeValueList(List[str]):
    """The list type used to hold the values of multi-valued attributes
    (such as HTML's 'class').

    It is just a regular list, but you can subclass it and pass your
    subclass to the TreeBuilder constructor as
    attribute_value_list_class to have it instantiated instead.
    """

230 

231 

class AttributeDict(Dict[Any, Any]):
    """Superclass for the dictionaries that hold a tag's attributes.

    You can use this class directly, but it is just a regular dict with
    no special logic.
    """

237 

238 

class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the XML spec.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply with
        the XML spec.

        This just means converting common non-string values to
        strings: XML attributes may have "any literal string as a
        value."

        None is stored as the empty string, ints and floats are stored
        as their ``str()`` form, and everything else (booleans
        included) is stored unchanged.
        """
        if value is None:
            value = ""
        elif isinstance(value, (int, float)) and not isinstance(value, bool):
            # XML does not define any rules for boolean attributes, so
            # bools are excluded here: the old Beautiful Soup behavior
            # (a bool that gets converted to a string on output) is
            # preserved rather than guessing what the value should be.
            #
            # It's dangerous to convert _every_ attribute value into a
            # plain string, since an attribute value may be a more
            # sophisticated string-like object
            # (e.g. CharsetMetaAttributeValue). But numeric values,
            # which are the most common, can safely be converted.
            value = str(value)

        super().__setitem__(key, value)

269 

270 

class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply
        with the HTML spec.

        :param key: The attribute name.
        :param value: The new value. ``False`` and ``None`` remove the
            attribute; ``True`` stores the attribute's own
            (unqualified) name; ints and floats are stored as their
            ``str()`` form; anything else is stored unchanged.
        """
        # Identity checks are deliberate. A membership test such as
        # ``value in (False, None)`` compares with ``==``, and since
        # ``0 == False`` it would treat the numbers 0 and 0.0 as
        # "remove this attribute" instead of storing "0" / "0.0".
        if value is False or value is None:
            # 'The values "true" and "false" are not allowed on
            # boolean attributes. To represent a false value, the
            # attribute has to be omitted altogether.'
            if key in self:
                del self[key]
            return

        if isinstance(value, bool):
            # Only True can reach this branch.
            #
            # 'If the [boolean] attribute is present, its value must
            # either be the empty string or a value that is an ASCII
            # case-insensitive match for the attribute's canonical
            # name, with no leading or trailing whitespace.'
            #
            # [fixme] It's not clear to me whether "canonical name"
            # means fully-qualified name, unqualified name, or
            # (probably not) name with namespace prefix. For now I'm
            # going with unqualified name.
            if isinstance(key, NamespacedAttribute):
                value = key.name
            else:
                value = key
        elif isinstance(value, (int, float)):
            # Only numbers are stringified: other values may be
            # sophisticated string-like objects that must be kept
            # as they are.
            value = str(value)
        super().__setitem__(key, value)

312 

313 

class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a ``<meta>`` tag's ``content``
    attribute.

    When Beautiful Soup parses the markup:
     ``<meta http-equiv="content-type" content="text/html; charset=utf8">``

    The value of the ``content`` attribute becomes one of these objects.

    If the document is later encoded to an encoding other than UTF-8,
    its ``<meta>`` tag will mention the new encoding instead of ``utf8``.
    """

    #: Match the 'charset' argument inside the 'content' attribute
    #: of a <meta> tag. Group 1 is everything up to and including
    #: ``charset=``; group 3 is the encoding name.
    #: :meta private:
    CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering ``original_value``."""
        # The match result is not used; the call is kept so that
        # construction behaves exactly as it always has.
        cls.CHARSET_RE.search(original_value)
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the original ``content`` value rewritten for a
        document that is being encoded as ``eventual_encoding``.

        :return: The original value with the encoding after
            ``charset=`` replaced by ``eventual_encoding``. If that
            encoding is one of `PYTHON_SPECIFIC_ENCODINGS`, the whole
            ``charset=...`` portion is removed instead.
        """
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            return self.CHARSET_RE.sub("", self.original_value)

        # A callable replacement keeps the encoding name from being
        # interpreted as a regex replacement template.
        return self.CHARSET_RE.sub(
            lambda match: match.group(1) + eventual_encoding,
            self.original_value,
        )

350 

351 

352class PageElement(object): 

353 """An abstract class representing a single element in the parse tree. 

354 

355 `NavigableString`, `Tag`, etc. are all subclasses of 

356 `PageElement`. For this reason you'll see a lot of methods that 

357 return `PageElement`, but you'll never see an actual `PageElement` 

358 object. For the most part you can think of `PageElement` as 

359 meaning "a `Tag` or a `NavigableString`." 

360 """ 

361 

362 #: In general, we can't tell just by looking at an element whether 

363 #: it's contained in an XML document or an HTML document. But for 

364 #: `Tag` objects (q.v.) we can store this information at parse time. 

365 #: :meta private: 

366 known_xml: Optional[bool] = None 

367 

368 #: Whether or not this element has been decomposed from the tree 

369 #: it was created in. 

370 _decomposed: bool 

371 

372 parent: Optional[Tag] 

373 next_element: _AtMostOneElement 

374 previous_element: _AtMostOneElement 

375 next_sibling: _AtMostOneElement 

376 previous_sibling: _AtMostOneElement 

377 

378 #: Whether or not this element is hidden from generated output. 

379 #: Only the `BeautifulSoup` object itself is hidden. 

380 hidden: bool = False 

381 

382 def setup( 

383 self, 

384 parent: Optional[Tag] = None, 

385 previous_element: _AtMostOneElement = None, 

386 next_element: _AtMostOneElement = None, 

387 previous_sibling: _AtMostOneElement = None, 

388 next_sibling: _AtMostOneElement = None, 

389 ) -> None: 

390 """Sets up the initial relations between this element and 

391 other elements. 

392 

393 :param parent: The parent of this element. 

394 

395 :param previous_element: The element parsed immediately before 

396 this one. 

397 

398 :param next_element: The element parsed immediately after 

399 this one. 

400 

401 :param previous_sibling: The most recently encountered element 

402 on the same level of the parse tree as this one. 

403 

404 :param previous_sibling: The next element to be encountered 

405 on the same level of the parse tree as this one. 

406 """ 

407 self.parent = parent 

408 

409 self.previous_element = previous_element 

410 if self.previous_element is not None: 

411 self.previous_element.next_element = self 

412 

413 self.next_element = next_element 

414 if self.next_element is not None: 

415 self.next_element.previous_element = self 

416 

417 self.next_sibling = next_sibling 

418 if self.next_sibling is not None: 

419 self.next_sibling.previous_sibling = self 

420 

421 if ( 

422 previous_sibling is None 

423 and self.parent is not None 

424 and self.parent.contents 

425 ): 

426 previous_sibling = self.parent.contents[-1] 

427 

428 self.previous_sibling = previous_sibling 

429 if self.previous_sibling is not None: 

430 self.previous_sibling.next_sibling = self 

431 

def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str:
    """Format the given string using the given formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of the
        standard formatters. If None, ``s`` is returned unchanged.
    """
    if formatter is None:
        return s
    if isinstance(formatter, Formatter):
        resolved = formatter
    else:
        resolved = self.formatter_for_name(formatter)
    return resolved.substitute(s)

444 

def formatter_for_name(
    self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction]
) -> Formatter:
    """Look up or create a Formatter for the given identifier,
    if necessary.

    :param formatter_name: Can be a `Formatter` object (used as-is), a
        function (used as the entity substitution hook for an
        `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter`), or a string (used to look
        up an `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter` in the appropriate registry).
    """
    if isinstance(formatter_name, Formatter):
        return formatter_name

    # XML trees use the XML formatter family; everything else uses HTML.
    formatter_class: type[Formatter] = XMLFormatter if self._is_xml else HTMLFormatter
    if callable(formatter_name):
        return formatter_class(entity_substitution=formatter_name)
    return formatter_class.REGISTRY[formatter_name]

472 

@property
def _is_xml(self) -> bool:
    """Is this element part of an XML tree or an HTML tree?

    This is used in formatter_for_name, when deciding whether an
    XMLFormatter or HTMLFormatter is more appropriate. It can be
    inefficient, but it should be called very rarely.
    """
    known = self.known_xml
    if known is not None:
        # Most of the time this was determined when the document was
        # parsed.
        return known

    # Otherwise, it's likely that this element was created by direct
    # invocation of the constructor from within the user's Python code.
    parent = self.parent
    if parent is not None:
        return parent._is_xml

    # This is the top-level object. It should have .known_xml set from
    # tree creation. If not, take a guess--BS is usually used on HTML
    # markup.
    return getattr(self, "is_xml", False)

495 

# camelCase spellings of `next_sibling` and `previous_sibling`, created
# through `_deprecated_alias`. NOTE(review): the last argument looks
# like the version in which each alias was deprecated -- confirm in
# bs4._deprecation.
nextSibling = _deprecated_alias(
    "nextSibling",
    "next_sibling",
    "4.0.0",
)
previousSibling = _deprecated_alias(
    "previousSibling",
    "previous_sibling",
    "4.0.0",
)

498 

def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
    """Placeholder for deep-copying this element.

    :raises NotImplementedError: Always, in this implementation.
    """
    raise NotImplementedError()

501 

def __copy__(self) -> Self:
    """Return a deep copy of this element.

    A copy of a PageElement can only be a deep copy, because only one
    PageElement can occupy a given place in a parse tree.
    """
    memo: Dict[Any, Any] = {}
    return self.__deepcopy__(memo)

507 

# Empty tuple used as the default value of the ``types`` parameter of
# `_all_strings` and `get_text`.
default: Iterable[type[NavigableString]] = ()  #: :meta private:

509 

def _all_strings(
    self,
    strip: bool = False,
    types: Iterable[type[NavigableString]] = default,
) -> Iterator[str]:
    """Yield all strings of certain classes, possibly stripping them.

    This is implemented differently in `Tag` and `NavigableString`;
    this version only raises.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError()

518 

@property
def stripped_strings(self) -> Iterator[str]:
    """Yield all interesting strings in this PageElement, stripping them
    first.

    See `Tag` for information on which strings are considered
    interesting in a given context.
    """
    # strip=True; the set of string types is left at its default.
    yield from self._all_strings(True)

529 

def get_text(
    self,
    separator: str = "",
    strip: bool = False,
    types: Iterable[Type[NavigableString]] = default,
) -> str:
    """Get all child strings of this PageElement, concatenated using the
    given separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)


# Other ways to reach get_text: a camelCase method name and a
# read-only property.
getText = get_text
text = property(get_text)

557 

def replace_with(self, *args: _InsertableElement) -> Self:
    """Replace this `PageElement` with one or more other elements,
    objects, keeping the rest of the tree the same.

    :param args: The replacements, inserted in order at the position
        this element occupied.
    :return: This `PageElement`, no longer part of the tree.
    :raises ValueError: If this element has no parent, or if one of the
        replacements is this element's parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree."
        )
    if len(args) == 1 and args[0] is self:
        # Replacing an element with itself is a no-op.
        return self
    for candidate in args:
        if candidate is parent:
            raise ValueError("Cannot replace a Tag with its parent.")

    # `parent` was captured above because extract() clears self.parent.
    position = parent.index(self)
    self.extract(_self_index=position)
    for offset, replacement in enumerate(args):
        parent.insert(position + offset, replacement)
    return self

580 

# camelCase spelling of `replace_with`, created through
# `_deprecated_function_alias`.
replaceWith = _deprecated_function_alias(
    "replaceWith",
    "replace_with",
    "4.0.0",
)

582 

def wrap(self, wrap_inside: Tag) -> Tag:
    """Wrap this `PageElement` inside a `Tag`.

    :param wrap_inside: The tag that replaces this element in the tree
        and then receives this element as its newest child.
    :return: ``wrap_inside``, occupying the position in the tree that used
        to be occupied by this object, and with this object now inside it.
    """
    # replace_with() returns this element, now detached from the tree.
    wrap_inside.append(self.replace_with(wrap_inside))
    return wrap_inside

592 

def extract(self, _self_index: Optional[int] = None) -> Self:
    """Destructively rips this element out of the tree.

    :param _self_index: The location of this element in its parent's
       .contents, if known. Passing this in allows for a performance
       optimization.

    :return: this `PageElement`, no longer part of the tree.
    """
    parent = self.parent
    if parent is not None:
        if _self_index is None:
            _self_index = parent.index(self)
        del parent.contents[_self_index]

    # Find the two elements that would be next to each other if this
    # element (and any children) hadn't been parsed, and connect them.
    #
    # _last_descendant() is called with its defaults, so the worst
    # case is that it returns this element itself. The cast exists for
    # the type checker only.
    tail = cast("PageElement", self._last_descendant())
    following = tail.next_element
    preceding = self.previous_element

    if preceding is not following:
        if preceding is not None:
            preceding.next_element = following
        if following is not None:
            following.previous_element = preceding
    self.previous_element = None
    tail.next_element = None

    self.parent = None

    # Do the same for the sibling chain.
    before = self.previous_sibling
    after = self.next_sibling
    if before is not after:
        if before is not None:
            before.next_sibling = after
        if after is not None:
            after.previous_sibling = before
    self.previous_sibling = self.next_sibling = None
    return self

640 

def decompose(self) -> None:
    """Recursively destroys this `PageElement` and its children.

    The element will be removed from the tree and wiped out; so
    will everything beneath it.

    The behavior of a decomposed `PageElement` is undefined and you
    should never use one for anything, but if you need to *check*
    whether an element has been decomposed, you can use the
    `PageElement.decomposed` property.
    """
    self.extract()

    # Walk the next_element chain starting at this element, wiping
    # each element in turn.
    node: _AtMostOneElement = self
    while node is not None:
        # Remember the successor first: clearing __dict__ erases the link.
        successor: _AtMostOneElement = node.next_element
        node.__dict__.clear()
        if isinstance(node, Tag):
            node.name = ""
            node.contents = []
        node._decomposed = True
        node = successor

663 

def _last_descendant(
    self, is_initialized: bool = True, accept_self: bool = True
) -> _AtMostOneElement:
    """Finds the last element beneath this object to be parsed.

    Special note to help you figure things out if your type
    checking is tripped up by the fact that this method returns
    _AtMostOneElement instead of PageElement: the only time
    this method returns None is if `accept_self` is False and the
    `PageElement` has no children--either it's a NavigableString
    or an empty Tag.

    :param is_initialized: Has `PageElement.setup` been called on
        this `PageElement` yet?

    :param accept_self: Is ``self`` an acceptable answer to the
        question?
    """
    if is_initialized and self.next_sibling is not None:
        # Shortcut: use the element linked just before the next sibling.
        candidate = self.next_sibling.previous_element
    else:
        # Otherwise keep descending into the last child.
        candidate = self
        while isinstance(candidate, Tag) and candidate.contents:
            candidate = candidate.contents[-1]

    if candidate is self and not accept_self:
        return None
    return candidate

691 

# camelCase spelling of `_last_descendant`, created through
# `_deprecated_alias`.
_lastRecursiveChild = _deprecated_alias(
    "_lastRecursiveChild",
    "_last_descendant",
    "4.0.0",
)

695 

def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate predecessor of this one.

    All the elements will have the same `PageElement.parent` as
    this one, and the given elements will occur immediately before
    this one.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.

    :raises ValueError: If this element has no parent, or if it is one
        of ``args``.
    """
    parent = self.parent
    if parent is None:
        raise ValueError("Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")

    inserted: List[PageElement] = []
    for predecessor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        # This element's index is looked up again on every pass.
        inserted.extend(parent.insert(parent.index(self), predecessor))

    return inserted

722 

def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate successor of this one.

    The elements will have the same `PageElement.parent` as this
    one, and the given elements will occur immediately after this
    one.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.

    :raises ValueError: If this element has no parent, or if it is one
        of ``args``.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError("Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    inserted: List[PageElement] = []
    # `offset` counts the arguments already handled, so each new one
    # is placed one slot further along.
    for offset, successor in enumerate(args):
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        position = parent.index(self) + 1 + offset
        inserted.extend(parent.insert(position, successor))

    return inserted

753 

# For the suppression of this pyright warning, see discussion here:
# https://github.com/microsoft/pyright/issues/10929
@overload
def find_next(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag: ...


@overload
def find_next(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString: ...


def find_next(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Find the first PageElement that matches the given criteria and
    appears later in the document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    The search is delegated to ``_find_one`` with `find_all_next` as the
    underlying multi-result method.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param kwargs: Additional filters on attribute values.
    """
    return self._find_one(
        self.find_all_next,
        name,
        attrs,
        string,
        **kwargs,
    )

795 

# camelCase spelling of `find_next`, created through
# `_deprecated_function_alias`.
findNext = _deprecated_function_alias(
    "findNext",
    "find_next",
    "4.0.0",
)

797 

@overload
def find_all_next(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags: ...


@overload
def find_all_next(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings: ...


def find_all_next(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Find all `PageElement` objects that match the given criteria and
    appear later in the document than this `PageElement`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    The search is delegated to ``_find_all`` over ``self.next_elements``.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :param kwargs: Additional filters on attribute values.
    """
    # One is added to the stack level to account for this extra frame.
    deeper_stacklevel = _stacklevel + 1
    return self._find_all(
        name,
        attrs,
        string,
        limit,
        self.next_elements,
        _stacklevel=deeper_stacklevel,
        **kwargs,
    )

853 

# camelCase spelling of `find_all_next`, created through
# `_deprecated_function_alias`.
findAllNext = _deprecated_function_alias(
    "findAllNext",
    "find_all_next",
    "4.0.0",
)

855 

@overload
def find_next_sibling(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag: ...


@overload
def find_next_sibling(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString: ...


def find_next_sibling(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Find the closest sibling to this PageElement that matches the
    given criteria and appears later in the document.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    The search is delegated to ``_find_one`` with `find_next_siblings`
    as the underlying multi-result method.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param kwargs: Additional filters on attribute values.
    """
    return self._find_one(
        self.find_next_siblings,
        name,
        attrs,
        string,
        **kwargs,
    )

895 

# camelCase spelling of `find_next_sibling`, created through
# `_deprecated_function_alias`.
findNextSibling = _deprecated_function_alias(
    "findNextSibling",
    "find_next_sibling",
    "4.0.0",
)

899 

@overload
def find_next_siblings(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags: ...

@overload
def find_next_siblings(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings: ...

def find_next_siblings(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Return every later sibling of this `PageElement` that matches
    the given criteria.

    The candidates come from `next_siblings` and are filtered by
    `_find_all`. All find_* methods take a common set of arguments;
    see the online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.next_siblings
    # One more frame now sits between the user's call and the warning.
    warning_depth = _stacklevel + 1
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=warning_depth, **kwargs
    )

955 

# Legacy spellings of find_next_siblings, registered through
# _deprecated_function_alias with versions "4.0.0" and "3.0.0".
findNextSiblings = _deprecated_function_alias(
    "findNextSiblings",
    "find_next_siblings",
    "4.0.0",
)
fetchNextSiblings = _deprecated_function_alias(
    "fetchNextSiblings",
    "find_next_siblings",
    "3.0.0",
)

962 

@overload
def find_previous(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag: ...

@overload
def find_previous(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString: ...

def find_previous(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Search backwards through the document from this `PageElement`
    and return the first `PageElement` that matches the given criteria.

    The search is delegated to `_find_one`, which drives
    `find_all_previous`. All find_* methods take a common set of
    arguments; see the online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    plural_finder = self.find_all_previous
    return self._find_one(plural_finder, name, attrs, string, **kwargs)

1002 

# Legacy camelCase spelling of find_previous, registered through
# _deprecated_function_alias with version "3.0.0".
findPrevious = _deprecated_function_alias(
    "findPrevious",
    "find_previous",
    "3.0.0",
)

1004 

@overload
def find_all_previous(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags: ...

@overload
def find_all_previous(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings: ...

def find_all_previous(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Search backwards through the document from this `PageElement`
    and return every `PageElement` that matches the given criteria.

    The candidates come from `previous_elements` and are filtered by
    `_find_all`. All find_* methods take a common set of arguments;
    see the online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.previous_elements
    # One more frame now sits between the user's call and the warning.
    warning_depth = _stacklevel + 1
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=warning_depth, **kwargs
    )

1060 

# Legacy spellings of find_all_previous, registered through
# _deprecated_function_alias with versions "4.0.0" and "3.0.0".
findAllPrevious = _deprecated_function_alias(
    "findAllPrevious",
    "find_all_previous",
    "4.0.0",
)
fetchAllPrevious = _deprecated_function_alias(
    "fetchAllPrevious",
    "find_all_previous",
    "3.0.0",
)

1067 

@overload
def find_previous_sibling(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag: ...

@overload
def find_previous_sibling(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString: ...

def find_previous_sibling(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Return the closest earlier sibling of this `PageElement` that
    matches the given criteria.

    The search is delegated to `_find_one`, which drives
    `find_previous_siblings`. All find_* methods take a common set of
    arguments; see the online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    plural_finder = self.find_previous_siblings
    return self._find_one(plural_finder, name, attrs, string, **kwargs)

1109 

# Legacy camelCase spelling of find_previous_sibling, registered through
# _deprecated_function_alias with version "4.0.0".
findPreviousSibling = _deprecated_function_alias(
    "findPreviousSibling",
    "find_previous_sibling",
    "4.0.0",
)

1113 

@overload
def find_previous_siblings(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags: ...

@overload
def find_previous_siblings(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings: ...

def find_previous_siblings(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Return every earlier sibling of this `PageElement` that matches
    the given criteria.

    The candidates come from `previous_siblings` and are filtered by
    `_find_all`. All find_* methods take a common set of arguments;
    see the online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.previous_siblings
    # One more frame now sits between the user's call and the warning.
    warning_depth = _stacklevel + 1
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=warning_depth, **kwargs
    )

1169 

# Legacy spellings of find_previous_siblings, registered through
# _deprecated_function_alias with versions "4.0.0" and "3.0.0".
findPreviousSiblings = _deprecated_function_alias(
    "findPreviousSiblings",
    "find_previous_siblings",
    "4.0.0",
)
fetchPreviousSiblings = _deprecated_function_alias(
    "fetchPreviousSiblings",
    "find_previous_siblings",
    "3.0.0",
)

1176 

def find_parent(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    """Return the closest parent of this `PageElement` that matches
    the given criteria, or None if no parent matches.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.
    """
    # NOTE: _find_one can't be used here; it always forwards a `string`
    # argument, and find_parents does not accept one.
    matches = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None

1204 

# Legacy camelCase spelling of find_parent, registered through
# _deprecated_function_alias with version "4.0.0".
findParent = _deprecated_function_alias(
    "findParent",
    "find_parent",
    "4.0.0",
)

1206 

def find_parents(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    """Return every parent of this `PageElement` that matches the
    given criteria.

    The candidates come from `parents` and are filtered by `_find_all`
    with no string filter. All find_* methods take a common set of
    arguments; see the online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    matches = self._find_all(
        name,
        attrs,
        None,
        limit,
        self.parents,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
    # Only Tags can have children, so every ancestor found is a Tag;
    # the cast narrows the type for checkers without changing the value.
    return cast(ResultSet[Tag], matches)

1232 

# Legacy spellings of find_parents, registered through
# _deprecated_function_alias with versions "4.0.0" and "3.0.0".
findParents = _deprecated_function_alias(
    "findParents",
    "find_parents",
    "4.0.0",
)
fetchParents = _deprecated_function_alias(
    "fetchParents",
    "find_parents",
    "3.0.0",
)

1235 

@property
def next(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just after this one.

    This is a read-only view of `next_element`.
    """
    following = self.next_element
    return following

1240 

@property
def previous(self) -> _AtMostOneElement:
    """The `PageElement`, if any, that was parsed just before this one.

    This is a read-only view of `previous_element`.
    """
    preceding = self.previous_element
    return preceding

1245 

1246 # These methods do the real heavy lifting. 

1247 

1248 def _find_one( 

1249 self, 

1250 # TODO-TYPING: "There is no syntax to indicate optional or 

1251 # keyword arguments; such function types are rarely used 

1252 # as callback types." - So, not sure how to get more 

1253 # specific here. 

1254 method: Callable, 

1255 name: _FindMethodName, 

1256 attrs: Optional[_StrainableAttributes], 

1257 string: Optional[_StrainableString], 

1258 **kwargs: _StrainableAttribute, 

1259 ) -> _AtMostOneElement: 

1260 r: _AtMostOneElement = None 

1261 results: _QueryResults = method(name, attrs, string, 1, _stacklevel=4, **kwargs) 

1262 if results: 

1263 r = results[0] 

1264 return r 

1265 

def _find_all(
    self,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes],
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Walk `generator` and collect the elements that match.

    :param name: A filter on tag name, or an `ElementFilter` to use
        directly as the matcher.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The stream of candidate elements to examine.
    :param _stacklevel: The `stacklevel` passed to `warnings.warn` so
        warnings point at the user's call.
    :kwargs: Additional filters on attribute values. The deprecated
        ``text`` keyword is accepted in place of `string`.
    """
    # Honor the deprecated 'text' keyword, but only if 'string' is unset.
    if string is None and "text" in kwargs:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    # '_class' is most likely a misspelling of 'class_'; warn about it.
    if "_class" in kwargs:
        substitutions = {"original": "_class", "autocorrect": "class_"}
        warnings.warn(
            AttributeResemblesVariableWarning.MESSAGE % substitutions,
            AttributeResemblesVariableWarning,
            stacklevel=_stacklevel,
        )

    from bs4.filter import ElementFilter

    matcher = (
        name
        if isinstance(name, ElementFilter)
        else SoupStrainer(name, attrs, string, **kwargs)
    )

    # The shortcuts below apply only when the tag name is the sole filter.
    name_is_only_filter = (
        string is None and not limit and not attrs and not kwargs
    )
    if not name_is_only_filter:
        return matcher.find_all(generator, limit)

    result: MutableSequence[_OneElement]
    if name is True or name is None:
        # Optimization to find all tags.
        result = [candidate for candidate in generator if isinstance(candidate, Tag)]
        return ResultSet(matcher, result)

    if isinstance(name, str):
        # Optimization to find all tags with a given name.
        if name.count(":") == 1:
            # This is a name with a prefix. If this is a namespace-aware
            # document, we need to match the local name against tag.name.
            # If not, we need to match the fully-qualified name against
            # tag.name.
            prefix, _, local_name = name.partition(":")
        else:
            prefix, local_name = None, name
        result = [
            candidate
            for candidate in generator
            if isinstance(candidate, Tag)
            and (
                candidate.name == name
                or (
                    candidate.name == local_name
                    and (prefix is None or candidate.prefix == prefix)
                )
            )
        ]
        return ResultSet(matcher, result)

    return matcher.find_all(generator, limit)

1331 

1332 # These generators can be used to navigate starting from both 

1333 # NavigableStrings and Tags. 

@property
def next_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed after this one.

    :yield: Each element reached by following `next_element` links.
    """
    node = self.next_element
    while True:
        if node is None:
            return
        # Read the link before yielding, so the walk continues from the
        # link as it was even if the consumer changes the yielded node.
        upcoming = node.next_element
        yield node
        node = upcoming

1342 

@property
def self_and_next_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all PageElements that were parsed after it.

    Built by passing `next_elements` through `_self_and`.
    """
    later_elements = self.next_elements
    return self._self_and(later_elements)

1347 

@property
def next_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    later.

    :yield: Each element reached by following `next_sibling` links.
    """
    sibling = self.next_sibling
    while True:
        if sibling is None:
            return
        # Read the link before yielding, so the walk continues from the
        # link as it was even if the consumer changes the yielded node.
        upcoming = sibling.next_sibling
        yield sibling
        sibling = upcoming

1358 

@property
def self_and_next_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its later siblings.

    Built by passing `next_siblings` through `_self_and`.
    """
    later_siblings = self.next_siblings
    return self._self_and(later_siblings)

1363 

@property
def previous_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed before this one.

    :yield: Each element reached by following `previous_element` links.
    """
    node = self.previous_element
    while True:
        if node is None:
            return
        # Read the link before yielding, so the walk continues from the
        # link as it was even if the consumer changes the yielded node.
        earlier = node.previous_element
        yield node
        node = earlier

1375 

@property
def self_and_previous_elements(self) -> Iterator[PageElement]:
    """This PageElement, then all elements that were parsed earlier.

    Built by passing `previous_elements` through `_self_and`.
    """
    earlier_elements = self.previous_elements
    return self._self_and(earlier_elements)

1381 

@property
def previous_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    earlier.

    :yield: Each element reached by following `previous_sibling` links.
    """
    sibling = self.previous_sibling
    while True:
        if sibling is None:
            return
        # Read the link before yielding, so the walk continues from the
        # link as it was even if the consumer changes the yielded node.
        earlier = sibling.previous_sibling
        yield sibling
        sibling = earlier

1394 

@property
def self_and_previous_siblings(self) -> Iterator[PageElement]:
    """This PageElement, then all of its siblings that were parsed
    earlier.

    Built by passing `previous_siblings` through `_self_and`.
    """
    earlier_siblings = self.previous_siblings
    return self._self_and(earlier_siblings)

1400 

@property
def parents(self) -> Iterator[Tag]:
    """All elements that are parents of this PageElement.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    ancestor = self.parent
    while True:
        if ancestor is None:
            return
        # Read the link before yielding, so the walk continues from the
        # link as it was even if the consumer changes the yielded node.
        higher = ancestor.parent
        yield ancestor
        ancestor = higher

1412 

@property
def self_and_parents(self) -> Iterator[PageElement]:
    """This element, then all of its parents.

    Built by passing `parents` through `_self_and`.

    :yield: A sequence of PageElements, ending with a BeautifulSoup object.
    """
    ancestors = self.parents
    return self._self_and(ancestors)

1420 

1421 def _self_and(self, other_generator:Iterator[PageElement]) -> Iterator[PageElement]: 

1422 """Modify a generator by yielding this element, then everything 

1423 yielded by the other generator. 

1424 """ 

1425 if not self.hidden: 

1426 yield self 

1427 for i in other_generator: 

1428 yield i 

1429 

@property
def decomposed(self) -> bool:
    """Check whether a PageElement has been decomposed.

    Reads the `_decomposed` attribute; a missing or falsy value is
    reported as False.
    """
    flag = getattr(self, "_decomposed", False)
    return flag if flag else False

1434 

@_deprecated("next_elements", "4.0.0")
def nextGenerator(self) -> Iterator[PageElement]:
    """:meta private:"""
    # Deprecated method form of the next_elements property.
    replacement = self.next_elements
    return replacement

1439 

@_deprecated("next_siblings", "4.0.0")
def nextSiblingGenerator(self) -> Iterator[PageElement]:
    """:meta private:"""
    # Deprecated method form of the next_siblings property.
    replacement = self.next_siblings
    return replacement

1444 

@_deprecated("previous_elements", "4.0.0")
def previousGenerator(self) -> Iterator[PageElement]:
    """:meta private:"""
    # Deprecated method form of the previous_elements property.
    replacement = self.previous_elements
    return replacement

1449 

@_deprecated("previous_siblings", "4.0.0")
def previousSiblingGenerator(self) -> Iterator[PageElement]:
    """:meta private:"""
    # Deprecated method form of the previous_siblings property.
    replacement = self.previous_siblings
    return replacement

1454 

@_deprecated("parents", "4.0.0")
def parentGenerator(self) -> Iterator[PageElement]:
    """:meta private:"""
    # Deprecated method form of the parents property.
    replacement = self.parents
    return replacement

1459 

1460 

class NavigableString(str, PageElement):
    """A Python string that is part of a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    #: A string prepended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '<!--'
    #: in an HTML comment.
    PREFIX: str = ""

    #: A string appended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '-->'
    #: in an HTML comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.

        :param value: The text, either as a `str` or as bytes encoded
            in DEFAULT_OUTPUT_ENCODING.
        """
        # Only a non-str value needs an encoding to be decoded with.
        decode_args = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *decode_args)
        instance.hidden = False
        instance.setup()
        return instance

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.

        :param memo: Unused.
        :param recursive: This parameter is ignored; it's only defined
           so that NavigableString.__deepcopy__ implements the same
           signature as Tag.__deepcopy__.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Return the constructor argument: this string as a plain `str`."""
        plain_text = str(self)
        return (plain_text,)

    # TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
    # is introduced in 3.8. This can be changed once 3.7 support is dropped.
    def __getitem__(self, key: Union[int|slice]) -> str:  # type:ignore
        """Index or slice the string as `str` does.

        :raise TypeError: If `key` is a string. The message asks whether
            the caller is treating a NavigableString like a Tag.
        """
        if not isinstance(key, str):
            return super().__getitem__(key)
        raise TypeError("string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(key.__class__.__name__))

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: This property always returns the `NavigableString` it was
           called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Run the string through the provided formatter, making it
        ready for output as part of an HTML or XML document.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        :return: The formatted string wrapped in PREFIX and SUFFIX.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self) -> None:
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Prevent NavigableString.name from ever being set.

        :raise AttributeError: Always.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This makes it easy for NavigableString to implement methods
        like get_text() as conveniences, creating a consistent
        text-extraction API across all PageElements.

        :param strip: If True, all strings will be stripped before being
            yielded.

        :param types: A single NavigableString subclass or a collection
            of them. If this NavigableString isn't exactly one of those
            classes, the sequence will be empty. When left at the
            default, `Tag.MAIN_CONTENT_STRING_TYPES` is used.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # This is kept in Tag because it's full of subclasses of
            # this class, which aren't defined until later in the file.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        # Exact classes are compared instead of using isinstance(),
        # because every string class subclasses NavigableString. A
        # caller asking for NavigableString probably does not want
        # comments and the like.
        if types is not None:
            own_type = type(self)
            if isinstance(types, type):
                wanted = own_type is types
            else:
                wanted = own_type in types
            if not wanted:
                return

        text = self.strip() if strip else self
        # An empty string (possibly empty only after stripping) is skipped.
        if len(text) > 0:
            yield text

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        This is defined the way it is for compatibility with
        `Tag.strings`. See `Tag` for information on which strings are
        interesting in a given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()

1617 

1618 

class PreformattedString(NavigableString):
    """A `NavigableString` not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (`Comment`) and CDATA blocks (`CData`).
    """

    PREFIX: str = ""
    SUFFIX: str = ""

    def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
        """Make this string ready for output by adding any subclass-specific
        prefix or suffix.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters. The string will be passed into the
            `Formatter`, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        wants_formatting = formatter is not None
        if wants_formatting:
            # Called for its side effects only; the result is discarded.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX

1644 

1645 

class CData(PreformattedString):
    """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_.

    Output is wrapped in ``<![CDATA[`` and ``]]>``.
    """

    PREFIX: str = "<![CDATA["
    SUFFIX: str = "]]>"

1651 

1652 

class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction.

    Output is wrapped in ``<?`` and ``>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = ">"

1658 

1659 

class XMLProcessingInstruction(ProcessingInstruction):
    """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_.

    Output is wrapped in ``<?`` and ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"

1665 

1666 

class Comment(PreformattedString):
    """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_.

    Output is wrapped in ``<!--`` and ``-->``.
    """

    PREFIX: str = "<!--"
    SUFFIX: str = "-->"

1672 

1673 

class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_.

    Output is wrapped in ``<?`` and ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"

1679 

1680 

class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_.

    Output is wrapped in ``<!DOCTYPE `` and ``>`` followed by a newline.
    """

    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        """
        # NOTE(review): this always builds a plain Doctype, even when
        # called on a subclass; `cls` is used only to reach the helper.
        declaration_text = cls._string_for_name_and_ids(name, pub_id, system_id)
        return Doctype(declaration_text)

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Generate a string to be used as the basis of a Doctype object.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :return: `name` (or "" if it is falsy), followed by a PUBLIC
            clause when `pub_id` is given, or a SYSTEM clause when only
            `system_id` is given.
        """
        declaration = name or ""
        if pub_id is None:
            # No public ID: the system ID, if any, stands alone.
            if system_id is not None:
                declaration = declaration + ' SYSTEM "%s"' % system_id
            return declaration

        declaration = declaration + ' PUBLIC "%s"' % pub_id
        if system_id is not None:
            # After a public ID, the system ID follows without a keyword.
            declaration = declaration + ' "%s"' % system_id
        return declaration

1719 

1720 

class Stylesheet(NavigableString):
    """A `NavigableString` holding the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    This class exists so embedded stylesheets can be told apart from
    textual content.
    """

1728 

1729 

class Script(NavigableString):
    """A `NavigableString` holding the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    This class exists so executable code can be told apart from
    textual content.
    """

1738 

1739 

class TemplateString(NavigableString):
    """A `NavigableString` holding a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    This class exists so such strings can be told apart from the main
    body of the document.
    """

1747 

1748 

class RubyTextString(NavigableString):
    """A `NavigableString` holding the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    This class can be used to tell such strings apart from the strings
    they're annotating.
    """

1756 

1757 

class RubyParenthesisString(NavigableString):
    """A `NavigableString` holding the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.
    """

1762 

1763 

1764class Tag(PageElement): 

1765 """An HTML or XML tag that is part of a parse tree, along with its 

1766 attributes, contents, and relationships to other parts of the tree. 

1767 

1768 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will 

1769 create a `Tag` object representing the ``<b>`` tag. You can 

1770 instantiate `Tag` objects directly, but it's not necessary unless 

1771 you're adding entirely new markup to a parsed document. Most of 

1772 the constructor arguments are intended for use by the `TreeBuilder` 

1773 that's parsing a document. 

1774 

1775 :param parser: A `BeautifulSoup` object representing the parse tree this 

1776 `Tag` will be part of. 

1777 :param builder: The `TreeBuilder` being used to build the tree. 

1778 :param name: The name of the tag. 

1779 :param namespace: The URI of this tag's XML namespace, if any. 

1780 :param prefix: The prefix for this tag's XML namespace, if any. 

1781 :param attrs: A dictionary of attribute values. 

1782 :param parent: The `Tag` to use as the parent of this `Tag`. May be 

1783 the `BeautifulSoup` object itself. 

1784 :param previous: The `PageElement` that was parsed immediately before 

1785 parsing this tag. 

1786 :param is_xml: If True, this is an XML tag. Otherwise, this is an 

1787 HTML tag. 

1788 :param sourceline: The line number where this tag was found in its 

1789 source document. 

1790 :param sourcepos: The character position within ``sourceline`` where this 

1791 tag was found. 

1792 :param can_be_empty_element: If True, this tag should be 

1793 represented as <tag/>. If False, this tag should be represented 

1794 as <tag></tag>. 

1795 :param cdata_list_attributes: A dictionary of attributes whose values should 

1796 be parsed as lists of strings if they ever show up on this tag. 

1797 :param preserve_whitespace_tags: Names of tags whose contents 

1798 should have their whitespace preserved if they are encountered inside 

1799 this tag. 

1800 :param interesting_string_types: When iterating over this tag's 

1801 string contents in methods like `Tag.strings` or 

1802 `PageElement.get_text`, these are the types of strings that are 

1803 interesting enough to be considered. By default, 

1804 `NavigableString` (normal strings) and `CData` (CDATA 

1805 sections) are the only interesting string subtypes. 

1806 :param namespaces: A dictionary mapping currently active 

1807 namespace prefixes to URIs, as of the point in the parsing process when 

1808 this tag was encountered. This can be used later to 

1809 construct CSS selectors. 

1810 

1811 """ 

1812 

def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Initialize the tag's name, namespace data, attributes and
    rendering-related settings, then link it to ``parent`` and
    ``previous`` through ``self.setup``.

    When a ``builder`` is given, it supplies the attribute container
    classes and the per-tag settings; otherwise the values passed as
    arguments are stored as-is.

    :raises ValueError: If ``name`` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    # A falsy ``namespaces`` (None or an empty dict) is replaced with a
    # fresh empty dict.
    self._namespaces = namespaces or {}
    self.prefix = prefix
    # NOTE(review): both branches below perform the same assignments,
    # so the condition currently has no effect on the stored values.
    # Both attributes are always set, whichever branch runs.
    if (not builder or builder.store_line_numbers) and (
        sourceline is not None or sourcepos is not None
    ):
        self.sourceline = sourceline
        self.sourcepos = sourcepos
    else:
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    # Pick the container classes used for the attribute dictionary and
    # for multi-valued attribute lists.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    else:
        if builder is not None and builder.cdata_list_attributes:
            # The builder produces the attribute container itself.
            self.attrs = builder._replace_cdata_list_attribute_values(
                self.name, attrs
            )
        else:
            self.attrs = attr_dict_class()
            # Make sure that the values of any multi-valued
            # attributes (e.g. when a Tag is copied) are stored in
            # new lists.
            for k, v in attrs.items():
                if isinstance(v, list):
                    v = v.__class__(v)
                self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        # NOTE(review): attribute_value_list_class was already assigned
        # from the builder above; this re-assignment looks redundant.
        self.attribute_value_list_class = builder.attribute_value_list_class
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES

1933 

# Declarations of the instance attributes assigned in __init__.

# Class of the parser passed to the constructor, or None if there was none.
parser_class: Optional[type[BeautifulSoup]]
# Tag name; the constructor refuses None.
name: str
# Namespace URI and namespace prefix, exactly as passed to the constructor.
namespace: Optional[str]
prefix: Optional[str]
# Attribute container built by the constructor.
attrs: _AttributeValues
# Source position values, exactly as passed to the constructor.
sourceline: Optional[int]
sourcepos: Optional[int]
# Taken from the builder's is_xml when a builder is given, otherwise
# from the is_xml constructor argument.
known_xml: Optional[bool]
# Child elements; starts out as an empty list.
contents: List[PageElement]
# Initialized to False by the constructor.
hidden: bool
# String types Tag._all_strings yields by default; None means fall
# back to MAIN_CONTENT_STRING_TYPES.
interesting_string_types: Optional[Set[Type[NavigableString]]]

# Per-tag settings: copied from the builder when one is given, otherwise
# taken from the constructor arguments.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]

#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")

1952 

1953 def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self: 

1954 """A deepcopy of a Tag is a new Tag, unconnected to the parse tree. 

1955 Its contents are a copy of the old Tag's contents. 

1956 """ 

1957 clone = self.copy_self() 

1958 

1959 if recursive: 

1960 # Clone this tag's descendants recursively, but without 

1961 # making any recursive function calls. 

1962 tag_stack: List[Tag] = [clone] 

1963 for event, element in self._event_stream(self.descendants): 

1964 if event is Tag.END_ELEMENT_EVENT: 

1965 # Stop appending incoming Tags to the Tag that was 

1966 # just closed. 

1967 tag_stack.pop() 

1968 else: 

1969 descendant_clone = element.__deepcopy__(memo, recursive=False) 

1970 # Add to its parent's .contents 

1971 tag_stack[-1].append(descendant_clone) 

1972 

1973 if event is Tag.START_ELEMENT_EVENT: 

1974 # Add the Tag itself to the stack so that its 

1975 # children will be .appended to it. 

1976 tag_stack.append(cast(Tag, descendant_clone)) 

1977 return clone 

1978 

def copy_self(self) -> Self:
    """Build an empty, unattached Tag with the same settings as this one.

    `Tag.__deepcopy__` starts from this method, but it is also usable
    directly when the contents should not be copied.
    """
    settings = dict(
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    # No parser and no builder: the copy is built purely from the
    # values stored on this object.
    duplicate = type(self)(
        None, None, self.name, self.namespace, self.prefix, self.attrs, **settings
    )
    # Carry these two over explicitly, after construction.
    duplicate.can_be_empty_element = self.can_be_empty_element
    duplicate.hidden = self.hidden
    return duplicate

2006 

@property
def is_empty_element(self) -> bool:
    """Whether this tag is an empty-element (self-closing) tag.

    Two conditions must both hold: the tag has no contents, and its
    ``can_be_empty_element`` setting is exactly ``True``. That setting
    comes from the `TreeBuilder` that created the tag, when there was
    one, which is how HTML and XML documents end up with different
    rules.
    """
    if len(self.contents) != 0:
        # Anything with contents is never an empty-element tag.
        return False
    return self.can_be_empty_element is True

2025 

@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    ": :meta private:"
    # Deprecated spelling; the answer comes from the property.
    self_closing = self.is_empty_element
    return self_closing

2030 

@property
def string(self) -> Optional[str]:
    """The single string inside this `Tag`, if there is exactly one.

    :return: ``None`` unless this `Tag` has exactly one child. If
        that child is a `NavigableString`, it is returned. If it is
        a `Tag`, that child's own `Tag.string` is returned, so the
        lookup continues downwards.

    A ``None`` result usually means the `Tag` holds more than one
    thing.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    if isinstance(only_child, NavigableString):
        return only_child
    if isinstance(only_child, Tag):
        # Delegate to the lone child tag.
        return only_child.string
    return None

@string.setter
def string(self, string: str) -> None:
    """Replace the `Tag.contents` of this `Tag` with a single string."""
    self.clear()
    # An existing NavigableString keeps its own (sub)class; a plain
    # str becomes a NavigableString.
    string_class = (
        string.__class__ if isinstance(string, NavigableString) else NavigableString
    )
    self.append(string_class(string))

2064 

2065 #: :meta private: 

2066 MAIN_CONTENT_STRING_TYPES = {NavigableString, CData} 

2067 

def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Generate the strings found among this tag's descendants.

    :param strip: If True, each string is stripped first, and strings
        that end up empty are left out.

    :param types: Which NavigableString classes to include: a single
        class, a collection of classes, or None for no filtering.
        The comparison is on the exact type, so subclasses do not
        match. When not given, ``self.interesting_string_types`` is
        used, and if that is None,
        ``self.MAIN_CONTENT_STRING_TYPES``.
    """
    if types is self.default:
        configured = self.interesting_string_types
        types = self.MAIN_CONTENT_STRING_TYPES if configured is None else configured

    for node in self.descendants:
        if not isinstance(node, NavigableString):
            continue

        node_type = type(node)
        if isinstance(types, type):
            # A single class was given.
            wanted = node_type is types
        else:
            # A collection of classes, or None meaning "everything".
            wanted = types is None or node_type in types
        if not wanted:
            # We're not interested in strings of this type.
            continue

        if not strip:
            yield node
            continue
        text = node.strip()
        if len(text) != 0:
            yield text

strings = property(_all_strings)

2110 

def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once.

    :param position: The numeric position that should be occupied
       in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    for new_child in new_children:
        newly_inserted = self._insert(position, new_child)
        inserted.extend(newly_inserted)
        # One argument can expand into several elements (a
        # BeautifulSoup object contributes all of its children), so
        # advance by the number actually inserted. Advancing by one
        # would put the next argument in between those elements.
        position += len(newly_inserted)
    return inserted

2129 

def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element into ``self.contents`` and rewire the
    sibling and element links around it.

    :param position: Index the new child should occupy; values past
        the end are clamped to ``len(self.contents)``.
    :param new_child: The element to insert. A plain ``str`` is
        wrapped in a `NavigableString`; a `BeautifulSoup` object is
        replaced by its children.
    :return: A list of the elements that ended up inserted.
    :raises ValueError: If ``new_child`` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    # Imported at call time rather than at module level.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the BeautifulSoup's children and
        # return them.
        return self.insert(position, *list(new_child.contents))
    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
            elif current_index == position:
                # We're 'inserting' an element into its current location.
                # This is a no-op.
                return [new_child]
        # Detach the element from wherever it currently lives.
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: no previous sibling, and the element parsed
        # just before it is this tag.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(
        is_initialized=False, accept_self=True
    )
    # new_childs_last_element can't be None because we passed
    # accept_self=True into _last_descendant. Worst case,
    # new_childs_last_element will be new_child itself. Making
    # this cast removes several mypy complaints later on as we
    # manipulate new_childs_last_element.
    new_childs_last_element = cast(PageElement, new_childs_last_element)

    if position >= len(self.contents):
        # The new child goes at the end of this tag.
        new_child.next_sibling = None

        # Climb through the ancestors until one has a next sibling.
        parent: Optional[Tag] = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # The new child goes directly before an existing child.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = (
            new_childs_last_element
        )
    self.contents.insert(position, new_child)

    return [new_child]

2216 

def unwrap(self) -> Self:
    """Put this element's contents where the element itself used to be.

    :return: This object, which is no longer in the tree.
    :raises ValueError: If this element has no parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = container.index(self)
    self.extract(_self_index=slot)
    # Insert back-to-front at the same index so the children keep
    # their original order. The slice is a snapshot of the contents.
    for moved in self.contents[::-1]:
        container.insert(slot, moved)
    return self

replace_with_children = unwrap

2235 

@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    ": :meta private:"
    # Deprecated spelling; all the work happens in unwrap().
    unwrapped = self.unwrap()
    return unwrapped

2240 

def append(self, tag: _InsertableElement) -> PageElement:
    """
    Add the given `PageElement` after the current last child of this `Tag`.

    :param tag: A PageElement.

    :return: The first element reported as inserted by `Tag.insert`.
    """
    end_position = len(self.contents)
    appended = self.insert(end_position, tag)
    return appended[0]

2250 

def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`.

    :return: The list of PageElements that were appended.
    :raises TypeError: If ``tags`` is neither a `Tag`, a single
        `PageElement` or string, nor an iterable.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        # Snapshot the contents: appending moves the children out of
        # ``tags`` while we iterate.
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        tag_list = list(tags)
    else:
        # Without this branch ``tag_list`` would be unbound and the
        # loop below would fail with a confusing UnboundLocalError.
        raise TypeError(
            "Tag.extend expects a Tag, a PageElement, a string or an "
            "iterable of those, not %s." % type(tags).__name__
        )

    return [self.append(tag) for tag in tag_list]

2287 

def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    :param decompose: If True, each child is removed with
       `PageElement.decompose` (a more destructive method);
       otherwise with `PageElement.extract`.
    """
    # Work on a snapshot: the children being removed are the items
    # of the list we would otherwise be iterating.
    snapshot = self.contents[:]
    for child in snapshot:
        remove = child.decompose if decompose else child.extract
        remove()

2301 

def smooth(self) -> None:
    """Merge runs of adjacent strings among this `Tag`'s children.

    Child tags are smoothed recursively. `PreformattedString`
    objects are never merged with their neighbours.

    Calling this after many tree modifications can make
    pretty-printed output look more natural.
    """
    # First pass: record the index of the first member of every
    # adjacent pair that should be merged. Only indexes are stored,
    # so no copy of self.contents is needed.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Child tags take care of their own contents.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last child has no successor to merge with.
            continue
        following = self.contents[position + 1]
        both_strings = isinstance(current, NavigableString) and isinstance(
            following, NavigableString
        )
        if both_strings and not (
            isinstance(current, PreformattedString)
            or isinstance(following, PreformattedString)
        ):
            merge_points.append(position)

    # Second pass, back to front, so that removing an item never
    # shifts a position that is still waiting to be processed.
    for position in merge_points[::-1]:
        first = cast(NavigableString, self.contents[position])
        second = cast(NavigableString, self.contents[position + 1])
        second.extract()
        first.replace_with(NavigableString(first + second))

2341 

def index(self, element: PageElement) -> int:
    """Return the position of ``element`` among this `Tag`'s children.

    Children are compared by identity rather than by value, so two
    children that compare equal are still told apart.

    :param element: Look for this `PageElement` in this object's contents.
    :raises ValueError: If ``element`` is not one of the children.
    """
    found_at = next(
        (
            position
            for position, child in enumerate(self.contents)
            if child is element
        ),
        None,
    )
    if found_at is None:
        raise ValueError("Tag.index: element not in tag")
    return found_at

2354 

def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Look up the attribute named ``key`` on this tag.

    :param key: The attribute to look for.
    :param default: Returned when this `Tag` has no such attribute.
    """
    attributes = self.attrs
    return attributes.get(key, default)

2367 

def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Like `Tag.get`, but the result is always a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: The value itself if it is already a list, an empty list
        if the lookup gave ``None``, and otherwise a one-item list
        holding the value.
    """
    found = self.get(key, default)
    if found is None:
        return self.attribute_value_list_class()
    if isinstance(found, list):
        # Already list-shaped: hand back the very same object.
        return found
    # Any other value gets wrapped; the cast is for type checkers only.
    return self.attribute_value_list_class([cast(str, found)])

2390 

def has_attr(self, key: str) -> bool:
    """Report whether ``key`` is one of this `Tag`'s attributes."""
    present = key in self.attrs
    return present

2394 

2395 def __hash__(self) -> int: 

2396 return str(self).__hash__() 

2397 

2398 def __getitem__(self, key: str) -> _AttributeValue: 

2399 """tag[key] returns the value of the 'key' attribute for the Tag, 

2400 and throws an exception if it's not there.""" 

2401 return self.attrs[key] 

2402 

2403 def __iter__(self) -> Iterator[PageElement]: 

2404 "Iterating over a Tag iterates over its contents." 

2405 return iter(self.contents) 

2406 

2407 def __len__(self) -> int: 

2408 "The length of a Tag is the length of its list of contents." 

2409 return len(self.contents) 

2410 

2411 def __contains__(self, x: Any) -> bool: 

2412 return x in self.contents 

2413 

2414 def __bool__(self) -> bool: 

2415 "A tag is non-None even if it has no contents." 

2416 return True 

2417 

2418 def __setitem__(self, key: str, value: _AttributeValue) -> None: 

2419 """Setting tag[key] sets the value of the 'key' attribute for the 

2420 tag.""" 

2421 self.attrs[key] = value 

2422 

2423 def __delitem__(self, key: str) -> None: 

2424 "Deleting tag[key] deletes all 'key' attributes for the tag." 

2425 self.attrs.pop(key, None) 

2426 

2427 @overload 

2428 def __call__( # pyright: ignore [reportOverlappingOverload] 

2429 self, 

2430 name: _FindMethodName = None, 

2431 attrs: Optional[_StrainableAttributes] = None, 

2432 recursive: bool = True, 

2433 string: None = None, 

2434 limit: Optional[int] = None, 

2435 _stacklevel: int = 2, 

2436 **kwargs: _StrainableAttribute, 

2437 ) -> _SomeTags: 

2438 ... 

2439 

2440 @overload 

2441 def __call__( 

2442 self, 

2443 name: None = None, 

2444 attrs: None = None, 

2445 recursive: bool = True, 

2446 string: _StrainableString = "", 

2447 limit: Optional[int] = None, 

2448 _stacklevel: int = 2, 

2449 **kwargs: _StrainableAttribute, 

2450 ) -> _SomeNavigableStrings: 

2451 ... 

2452 

2453 def __call__( 

2454 self, 

2455 name: _FindMethodName = None, 

2456 attrs: Optional[_StrainableAttributes] = None, 

2457 recursive: bool = True, 

2458 string: Optional[_StrainableString] = None, 

2459 limit: Optional[int] = None, 

2460 _stacklevel: int = 2, 

2461 **kwargs: _StrainableAttribute, 

2462 ) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]: 

2463 """Calling a Tag like a function is the same as calling its 

2464 find_all() method. Eg. tag('a') returns a list of all the A tags 

2465 found within this tag.""" 

2466 if string is not None and (name is not None or attrs is not None or kwargs): 

2467 # TODO: Using the @overload decorator to express the three ways you 

2468 # could get into this path is way too much code for a rarely(?) used 

2469 # feature. 

2470 return cast(ResultSet[Tag], self.find_all(name, attrs, recursive, string, limit, _stacklevel, **kwargs)) #type: ignore 

2471 

2472 if string is None: 

2473 # If string is None, we're searching for tags. 

2474 tags:ResultSet[Tag] = self.find_all( 

2475 name, attrs, recursive, None, limit, _stacklevel, **kwargs 

2476 ) 

2477 return tags 

2478 

2479 # Otherwise, we're searching for strings. 

2480 strings:ResultSet[NavigableString] = self.find_all( 

2481 None, None, recursive, string, limit, _stacklevel, **kwargs 

2482 ) 

2483 return strings 

2484 

2485 def __getattr__(self, subtag: str) -> Optional[Tag]: 

2486 """Calling tag.subtag is the same as calling tag.find(name="subtag")""" 

2487 # print("Getattr %s.%s" % (self.__class__, tag)) 

2488 result: _AtMostOneElement 

2489 if len(subtag) > 3 and subtag.endswith("Tag"): 

2490 # BS3: soup.aTag -> "soup.find("a") 

2491 tag_name = subtag[:-3] 

2492 warnings.warn( 

2493 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' 

2494 % dict(name=tag_name), 

2495 DeprecationWarning, 

2496 stacklevel=2, 

2497 ) 

2498 result = self.find(tag_name) 

2499 # We special case contents to avoid recursion. 

2500 elif not subtag.startswith("__") and not subtag == "contents": 

2501 result = self.find(subtag) 

2502 else: 

2503 raise AttributeError( 

2504 "'%s' object has no attribute '%s'" % (self.__class__, subtag) 

2505 ) 

2506 return result 

2507 

2508 def __eq__(self, other: Any) -> bool: 

2509 """Returns true iff this Tag has the same name, the same attributes, 

2510 and the same contents (recursively) as `other`.""" 

2511 if self is other: 

2512 return True 

2513 if not isinstance(other, Tag): 

2514 return False 

2515 if ( 

2516 not hasattr(other, "name") 

2517 or not hasattr(other, "attrs") 

2518 or not hasattr(other, "contents") 

2519 or self.name != other.name 

2520 or self.attrs != other.attrs 

2521 or len(self) != len(other) 

2522 ): 

2523 return False 

2524 for i, my_child in enumerate(self.contents): 

2525 if my_child != other.contents[i]: 

2526 return False 

2527 return True 

2528 

2529 def __ne__(self, other: Any) -> bool: 

2530 """Returns true iff this Tag is not identical to `other`, 

2531 as defined in __eq__.""" 

2532 return not self == other 

2533 

2534 def __repr__(self) -> str: 

2535 """Renders this `Tag` as a string.""" 

2536 return self.decode() 

2537 

2538 __str__ = __unicode__ = __repr__ 

2539 

def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    The tag is first rendered to a Unicode string with `Tag.decode`,
    and that string is then encoded.

    :param encoding: The encoding to use when converting to
       a bytestring. This may also affect the text of the document,
       specifically any encoding declarations within the document.
    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        :py:meth:`str.encode` and its value should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    # decode() takes the indent level first and the encoding second.
    return self.decode(indent_level, encoding, formatter).encode(encoding, errors)

2569 

def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    The tree is walked iteratively through `_event_stream`; each event
    is rendered to a piece of text and the pieces are joined at the end.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) When this is ``None`` no pretty-printing whitespace
        is added at all.
    :param eventual_encoding: The encoding you intend to use when
        converting the string to a bytestring. decode() is *not*
        responsible for performing that encoding. This information
        is needed so that a real encoding can be substituted in if
        the document contains an encoding declaration (e.g. in a
        <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
        naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :return: The concatenation of the rendered pieces.
    """
    # Rendered fragments, in output order; joined once at the end.
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # A literal True is accepted and treated as indentation level 0.
    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        # Step 1: render the element itself, without any indentation.
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            # A closing tag is indented at its opening tag's level,
            # so step back out before the indentation is applied.
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # We are about to enter string literal mode. Add
            # whitespace before this tag, but not after. We
            # will stay in string literal mode until this tag
            # is closed.
            indent_before = True
            indent_after = False
            string_literal_tag = element
        elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # We are about to exit string literal mode by closing
            # the tag that sent us into that mode. Add whitespace
            # after this tag, but not before.
            indent_before = False
            indent_after = True
            string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if indent_before or indent_after:
                # Strings lose their own surrounding whitespace when
                # pretty-printing; a string that becomes empty is
                # appended as-is (i.e. as an empty piece).
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            # Everything after an opening tag is nested one level deeper.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)

2678 

class _TreeTraversalEvent:
    """Marker type for the events produced while walking a parse tree.

    Instances carry no data of their own; each one is a distinct
    sentinel object.

    :meta private:
    """


# One sentinel per kind of event that _event_stream can yield.
START_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:

2691 

def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Turn a flat walk over this element into (event, element) pairs.

    The pairs carry enough information to rebuild the nesting of the
    tree (for instance when serializing it) without recursion. The
    idea resembles SAX, but the event set is smaller and the payloads
    are Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
        the tree. When omitted (or falsy), ``self.self_and_descendants``
        is used.
    :return: An iterator of ``(event, element)`` tuples, where event is
        one of the ``Tag.*_EVENT`` sentinels.
    """
    # Tags that have been opened but not yet closed, innermost last.
    open_tags: List[Tag] = []
    elements = iterator or self.self_and_descendants

    for node in elements:
        # Any open tag that is not this node's parent must have ended
        # before the node appeared, so emit its end event first.
        # NOTE(review): `!=` uses equality rather than identity here;
        # identity looks like the intent -- confirm before changing.
        while open_tags and node.parent != open_tags[-1]:
            yield Tag.END_ELEMENT_EVENT, open_tags.pop()

        if not isinstance(node, Tag):
            yield Tag.STRING_ELEMENT_EVENT, node
        elif node.is_empty_element:
            # Empty elements are never pushed, so they get no end event.
            yield Tag.EMPTY_ELEMENT_EVENT, node
        else:
            yield Tag.START_ELEMENT_EVENT, node
            open_tags.append(node)

    # The walk is over; close whatever is still open, innermost first.
    while open_tags:
        yield Tag.END_ELEMENT_EVENT, open_tags.pop()

2735 

def _indent_string(
    self,
    s: str,
    indent_level: int,
    formatter: Formatter,
    indent_before: bool,
    indent_after: bool,
) -> str:
    """Wrap a string in pretty-printing whitespace.

    :param s: The string to amend with whitespace.
    :param indent_level: How many copies of ``formatter.indent`` to
        put in front of the string.
    :param formatter: Supplies the ``indent`` unit.
    :param indent_before: Whether or not to add the indentation
        before the string.
    :param indent_after: Whether or not to add a newline after the
        string.
    :return: ``s`` with the requested whitespace attached.
    """
    # Level 0 (or None) means there is nothing to prepend.
    leading = formatter.indent * indent_level if indent_before and indent_level else ""
    trailing = "\n" if indent_after else ""
    return leading + s + trailing

2763 

def _format_tag(
    self, eventual_encoding: str, formatter: Formatter, opening: bool
) -> str:
    """Render this tag's opening or closing markup (not its contents).

    :param eventual_encoding: Passed to ``substitute_encoding`` for
        attribute values that are `AttributeValueWithCharsetSubstitution`.
    :param formatter: Supplies the attribute list, the value escaping
        and quoting, and the void-element closing prefix.
    :param opening: True for the opening tag (attributes included),
        False for the closing tag.
    :return: The markup, or an empty string if the tag is hidden.
    """
    # A hidden tag is invisible, although its contents are visible.
    if self.hidden:
        return ""

    # Closing tags carry a slash right after the "<".
    slash = "" if opening else "/"

    # Optional namespace prefix.
    namespace = self.prefix + ":" if self.prefix else ""

    # Attributes only appear on the opening tag.
    rendered_attrs = ""
    if opening:
        parts = []
        for key, val in formatter.attributes(self):
            if val is None:
                # Valueless attribute: emit the bare name.
                parts.append(key)
                continue
            if isinstance(val, (list, tuple)):
                val = " ".join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.substitute_encoding(eventual_encoding)

            text = formatter.attribute_value(val)
            parts.append(str(key) + "=" + formatter.quoted_attribute_value(text))
        if parts:
            rendered_attrs = " " + " ".join(parts)

    # Optional slash before the ">" for a void element (e.g. in an
    # XML document); the formatter decides what it looks like.
    void_slash = ""
    if self.is_empty_element:
        void_slash = formatter.void_element_close_prefix or ""

    head = "<" + slash + namespace + self.name
    return head + rendered_attrs + void_slash + ">"

2825 

def _should_pretty_print(self, indent_level: int = 1) -> bool:
    """Decide whether this tag may be pretty-printed.

    Most tags may, but a tag whose name is listed in
    ``preserve_whitespace_tags`` (such as <pre> in HTML documents)
    may not.

    :param indent_level: Passing ``None`` always yields False.
    :return: True if whitespace may be added around this tag's contents.
    """
    if indent_level is None:
        return False
    if not self.preserve_whitespace_tags:
        # Nothing is protected, so everything can be prettified.
        return True
    return self.name not in self.preserve_whitespace_tags

2836 

@overload
def prettify(
    self,
    encoding: None = None,
    formatter: _FormatterOrName = "minimal",
) -> str:
    ...


@overload
def prettify(
    self,
    encoding: _Encoding,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    ...


def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Pretty-print this `Tag` as a string or bytestring.

    Rendering always starts at indentation level 0.

    :param encoding: The encoding of the bytestring, or None if you
        want Unicode.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string (if no ``encoding`` is provided) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)

2870 

def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render what is inside this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.)

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is *not*
        responsible for performing that encoding. This information
        is needed so that a real encoding can be substituted in if
        the document contains an encoding declaration (e.g. in a
        <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    :return: Whatever `decode` produces for the given iterator.
    """
    # Handing decode() the descendants (rather than letting it fall
    # back to self_and_descendants) leaves this tag itself out.
    contents_only = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=contents_only
    )

2898 

def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render what is inside this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.)
    :param encoding: The bytestring will be in this encoding. It is
        also passed to `decode_contents` as the eventual encoding.
    :param formatter: Either a `Formatter` object, or a string naming
        one of the standard formatters.
    :return: The rendered contents, encoded with ``encoding``.
    """
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)

2918 

@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated BS3-era spelling of `encode_contents`.

    ``indentLevel`` only takes effect when ``prettyPrint`` is true;
    otherwise no indentation level is passed on.

    :meta private:
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_indent, encoding=encoding)

2933 

2934 # Soup methods 

2935 

@overload
def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...


@overload
def find(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
) -> _AtMostOneNavigableString:
    ...


def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Return the first match among this PageElement's children.

    This is `find_all` with a limit of 1, unwrapped: the single
    result is returned directly, or None when nothing matched.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    """
    if string is None:
        # No string filter: this is a plain search for tags.
        tags = self.find_all(name, attrs, recursive, None, 1, _stacklevel=3, **kwargs)
        return cast(Tag, tags[0]) if tags else None

    if name is not None or attrs is not None or kwargs:
        # A string filter combined with tag criteria.
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        elements = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs)  # type:ignore
        return cast(Tag, elements[0]) if elements else None

    # Only a string filter: this is a search for strings.
    strings = self.find_all(None, None, recursive, string, 1, _stacklevel=3, **kwargs)
    return cast(NavigableString, strings[0]) if strings else None

2995 

# Deprecated alias: exposes find() under the old name "findChild".
# The last argument is presumably the version in which the old name
# was deprecated -- confirm against _deprecated_function_alias.
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")

2997 

@overload
def find_all(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...


@overload
def find_all(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...


def find_all(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Collect every `PageElement` beneath this one that matches the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on strings. Given alone it makes this a
        search for strings; combined with other criteria the search
        is still for tags.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # Search space: every descendant, or only the direct children.
    generator = self.descendants if recursive else self.children
    # Presumably accounts for the extra call frame added by
    # delegating to _find_all -- confirm against _find_all.
    _stacklevel += 1

    if string is None:
        # If string is None, we're searching for tags.
        return cast(ResultSet[Tag], self._find_all(
            name, attrs, None, limit, generator, _stacklevel=_stacklevel, **kwargs
        ))

    if name is not None or attrs is not None or kwargs:
        # A string filter combined with tag criteria.
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        return cast(ResultSet[Tag], self._find_all(
            name, attrs, string, limit, generator, _stacklevel=_stacklevel, **kwargs
        ))

    # Otherwise, we're searching for strings.
    return cast(ResultSet[NavigableString], self._find_all(
        None, None, string, limit, generator, _stacklevel=_stacklevel, **kwargs
    ))

3073 

# Deprecated aliases: both expose find_all() under an older name.
# The last argument is presumably the version in which each name was
# deprecated -- confirm against _deprecated_function_alias.
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")

3076 

3077 # Generator methods 

@property
def children(self) -> Iterator[PageElement]:
    """Iterate over the direct children of this `PageElement`,
    i.e. the items of ``self.contents``.
    """
    direct_children = self.contents
    return (child for child in direct_children)

3082 

@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` together with everything beneath it.

    The descendants come from `descendants`, which follows the
    ``next_element`` links -- that is document order, not a
    breadth-first sequence.
    """
    below = self.descendants
    return self._self_and(below)

3089 

@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`.

    Starting at the first child, the walk follows the
    ``next_element`` links until it has passed the last descendant.
    That is document order, not a breadth-first sequence.
    """
    if len(self.contents) == 0:
        return
    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, it ends up being self.
    final = cast(PageElement, self._last_descendant(accept_self=True))
    # The first element that is no longer part of this subtree.
    sentinel = final.next_element
    node: _AtMostOneElement = self.contents[0]
    while node is not None and node is not sentinel:
        # Look up the successor before yielding; presumably so the
        # walk is unaffected if the consumer changes the yielded node.
        upcoming = node.next_element
        yield node
        node = upcoming

3107 

3108 # CSS selector code 

def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Run a CSS selector against the current element through
    ``self.css.select_one``.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
        soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)

3125 

def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Run a CSS selector against the current element through
    ``self.css.select``.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)

3150 

@property
def css(self) -> CSS:
    """Return an interface to the CSS selector API.

    A new `CSS` object wrapping this element is built on every access.
    """
    interface = CSS(self)
    return interface

3155 

3156 # Old names for backwards compatibility 

@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns ``self.children``.

    :meta private:
    """
    direct_children = self.children
    return direct_children

3164 

@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns ``self.descendants``.

    :meta private:
    """
    everything_below = self.descendants
    return everything_below

3172 

@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated method; forwards to `has_attr`.

    The name was kind of misleading, because has_key() looked at
    attributes while ``__contains__`` (the ``in`` operator) looks at
    contents. has_key() is gone in Python 3, anyway.

    :meta private:
    """
    present = self.has_attr(key)
    return present

3183 

3184 

# Type variable for the kind of element a ResultSet holds; it is
# bounded by PageElement, so only PageElement subtypes are accepted.
_PageElementT = TypeVar("_PageElementT", bound=PageElement)

3186 

class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects gathered by matching an
    :py:class:`ElementFilter` against a parse tree. Basically, a list of
    search results.
    """

    # The filter that was passed in when this result set was built, if any.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Fill the list from ``result`` and remember ``source``.

        :param source: Stored as-is on the ``source`` attribute.
        :param result: The elements to put in the list.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        Python only calls this for attributes that normal lookup
        failed to find, so it always raises.

        :raises AttributeError: Always.
        """
        message = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(message)

3206 

3207# Now that all the classes used by SoupStrainer have been defined, 

3208# import SoupStrainer itself into this module to preserve the 

3209# backwards compatibility of anyone who imports 

3210# bs4.element.SoupStrainer. 

3211from bs4.filter import SoupStrainer # noqa: E402