Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 55%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1066 statements  

1from __future__ import annotations 

2 

3# Use of this source code is governed by the MIT license. 

4__license__ = "MIT" 

5 

6import re 

7import warnings 

8 

9from bs4.css import CSS 

10from bs4._deprecation import ( 

11 _deprecated, 

12 _deprecated_alias, 

13 _deprecated_function_alias, 

14) 

15from bs4.formatter import ( 

16 Formatter, 

17 HTMLFormatter, 

18 XMLFormatter, 

19) 

20from bs4._warnings import AttributeResemblesVariableWarning 

21 

22from typing import ( 

23 Any, 

24 Callable, 

25 Dict, 

26 Generic, 

27 Iterable, 

28 Iterator, 

29 List, 

30 Mapping, 

31 MutableSequence, 

32 Optional, 

33 Pattern, 

34 Set, 

35 TYPE_CHECKING, 

36 Tuple, 

37 Type, 

38 TypeVar, 

39 Union, 

40 cast, 

41 overload, 

42) 

43from typing_extensions import ( 

44 Self, 

45 TypeAlias, 

46) 

47 

48if TYPE_CHECKING: 

49 from bs4 import BeautifulSoup 

50 from bs4.builder import TreeBuilder 

51 from bs4.filter import ElementFilter 

52 from bs4.formatter import ( 

53 _EntitySubstitutionFunction, 

54 _FormatterOrName, 

55 ) 

56 from bs4._typing import ( 

57 _AtMostOneElement, 

58 _AtMostOneTag, 

59 _AtMostOneNavigableString, 

60 _AttributeValue, 

61 _AttributeValues, 

62 _Encoding, 

63 _InsertableElement, 

64 _OneElement, 

65 _QueryResults, 

66 _RawOrProcessedAttributeValues, 

67 _StrainableElement, 

68 _StrainableAttribute, 

69 _StrainableAttributes, 

70 _StrainableString, 

71 _SomeNavigableStrings, 

72 _SomeTags, 

73 ) 

74 

75_OneOrMoreStringTypes: TypeAlias = Union[ 

76 Type["NavigableString"], Iterable[Type["NavigableString"]] 

77] 

78 

79_FindMethodName: TypeAlias = Optional[Union["_StrainableElement", "ElementFilter"]] 

80 

81# Deprecated module-level attributes. 

82# See https://peps.python.org/pep-0562/ 

# Deprecated module-level attributes, served lazily through the
# module-level ``__getattr__`` hook.
# See https://peps.python.org/pep-0562/
_deprecated_names = {
    "whitespace_re": "The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy.",
}
#: :meta private:
_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+")


def __getattr__(name: str) -> Any:
    """Resolve a deprecated module attribute, warning about its use.

    :param name: The attribute being looked up on this module.
    :return: The module global called ``_deprecated_<name>``.
    :raises AttributeError: If ``name`` is not a deprecated attribute.
    """
    template = _deprecated_names.get(name)
    if template is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # stacklevel=2 attributes the warning to the code that accessed the
    # attribute rather than to this hook.
    warnings.warn(template.format(name=name), DeprecationWarning, stacklevel=2)
    return globals()[f"_deprecated_{name}"]

97 

98 

#: Documents output by Beautiful Soup will be encoded with
#: this encoding unless you specify otherwise.
DEFAULT_OUTPUT_ENCODING: str = "utf-8"

#: A regular expression matching one run of non-whitespace characters;
#: it can be used to split a string on whitespace.
nonwhitespace_re: Pattern[str] = re.compile(r"\S+")

#: These encodings are recognized by Python (so `Tag.encode`
#: could theoretically support them) but XML and HTML don't recognize
#: them (so they should not show up in an XML or HTML document as that
#: document's encoding).
#:
#: If an XML document is encoded in one of these encodings, no encoding
#: will be mentioned in the XML declaration. If an HTML document is
#: encoded in one of these encodings, and the HTML document has a
#: <meta> tag that mentions an encoding, the encoding will be given as
#: the empty string.
#:
#: Source:
#: Python documentation, `Python Specific Encodings <https://docs.python.org/3/library/codecs.html#python-specific-encodings>`_
PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}

135 

136 

class NamespacedAttribute(str):
    """A string naming an attribute that may carry a namespace prefix.

    For the ``xml:lang`` in ``xml:lang="en"`` the string value is
    ``"xml:lang"`` and the object additionally remembers the prefix
    (``"xml"``), the local name (``"lang"``) and, if given, the
    namespace it was created with.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        # An empty name denotes the default namespace, whose name
        # "has no value" per https://www.w3.org/TR/xml-names/#defaulting
        local_name = name or None

        if local_name is None:
            text = prefix
        elif prefix:
            text = prefix + ":" + local_name
        else:
            # Not really namespaced.
            text = local_name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = local_name
        instance.namespace = namespace
        return instance

169 

170 

class AttributeValueWithCharsetSubstitution(str):
    """Abstract stand-in for a character encoding named inside an HTML
    ``<meta>`` tag.

    There is one subclass for each place such an encoding can appear:
    the ``charset`` attribute (`CharsetMetaAttributeValue`) and the
    ``content`` attribute (`ContentMetaAttributeValue`).

    This lets Beautiful Soup swap in a different encoding name for that
    part of the document when outputting a tree as a string.
    """

    # The attribute value as it was originally given, before any
    # encoding substitution.
    #: :meta private:
    original_value: str

    def substitute_encoding(self, eventual_encoding: str) -> str:
        """Return this value with ``eventual_encoding`` substituted in.

        Each subclass implements this for its own portion of the HTML
        document; the base class only defines the interface.

        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

193 

194 

class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``charset`` attribute.

    When Beautiful Soup parses the markup ``<meta charset="utf8">``, the
    value of the ``charset`` attribute becomes one of these objects.

    If the document is later encoded to an encoding other than UTF-8,
    its ``<meta>`` tag will mention the new encoding instead of ``utf8``.
    """

    def __new__(cls, original_value: str) -> Self:
        instance = str.__new__(cls, original_value)
        # Nothing here needs the original value, but it is kept because
        # it might be useful for the user to know.
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the value the ``charset`` attribute should have when
        the document is encoded as ``eventual_encoding``.

        :return: The encoding name itself, or the empty string when the
            encoding is listed in `PYTHON_SPECIFIC_ENCODINGS`.
        """
        return "" if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS else eventual_encoding

221 

222 

class AttributeValueList(List[str]):
    """The list type used to hold the values of a multi-valued
    attribute (such as HTML's 'class').

    It behaves exactly like a regular list. It exists so that you can
    subclass it and pass your subclass to the TreeBuilder constructor
    as attribute_value_list_class, to have your subclass instantiated
    instead.
    """

230 

231 

class AttributeDict(Dict[Any, Any]):
    """Superclass for the dictionary used to hold a tag's attributes.

    You can use this directly, but it is just a regular dict with no
    special logic.
    """


class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the XML spec.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply with
        the XML spec.

        This just means converting common non-string values to
        strings: XML attributes may have "any literal string as a
        value."

        :param key: The attribute name.
        :param value: The attribute value. ``None`` is stored as the
            empty string, ints and floats are stored as their ``str()``
            form, and everything else (including bools) is stored
            unchanged.
        """
        if value is None:
            value = ""
        if isinstance(value, bool):
            # XML does not define any rules for boolean attributes.
            # Preserve the old Beautiful Soup behavior (a bool that
            # gets converted to a string on output) rather than
            # guessing what the value should be. This check must come
            # before the numeric one because bool is a subclass of int.
            pass
        elif isinstance(value, (int, float)):
            # It's dangerous to convert _every_ attribute value into a
            # plain string, since an attribute value may be a more
            # sophisticated string-like object
            # (e.g. CharsetMetaAttributeValue). But numeric values,
            # the most common non-string case, are safe to convert.
            value = str(value)

        super().__setitem__(key, value)


class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply
        with the HTML spec.

        :param key: The attribute name.
        :param value: The attribute value. ``False`` and ``None``
            remove the attribute; ``True`` stores the attribute's own
            name as its value; ints and floats (including zero) are
            stored as their ``str()`` form; anything else is stored
            unchanged.
        """
        # Identity checks are deliberate: an equality/membership test
        # such as ``value in (False, None)`` also matches 0 and 0.0
        # (because ``0 == False``), which would silently delete an
        # attribute that was being set to the number zero.
        if value is False or value is None:
            # 'The values "true" and "false" are not allowed on
            # boolean attributes. To represent a false value, the
            # attribute has to be omitted altogether.'
            if key in self:
                del self[key]
            return
        if isinstance(value, bool):
            # Only True can reach this branch.
            #
            # 'If the [boolean] attribute is present, its value must
            # either be the empty string or a value that is an ASCII
            # case-insensitive match for the attribute's canonical
            # name, with no leading or trailing whitespace.'
            #
            # [fixme] It's not clear to me whether "canonical name"
            # means fully-qualified name, unqualified name, or
            # (probably not) name with namespace prefix. For now I'm
            # going with unqualified name.
            if isinstance(key, NamespacedAttribute):
                value = key.name
            else:
                value = key
        elif isinstance(value, (int, float)):
            # See note in XMLAttributeDict for the reasoning why we
            # only do this to numbers.
            value = str(value)
        super().__setitem__(key, value)

312 

313 

class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``content`` attribute.

    When Beautiful Soup parses the markup:
     ``<meta http-equiv="content-type" content="text/html; charset=utf8">``

    the value of the ``content`` attribute becomes one of these objects.

    If the document is later encoded to an encoding other than UTF-8,
    its ``<meta>`` tag will mention the new encoding instead of ``utf8``.
    """

    #: Match the 'charset' argument inside the 'content' attribute
    #: of a <meta> tag.
    #: :meta private:
    CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value: str) -> Self:
        # NOTE(review): the match result is discarded; as written, the
        # only visible effect of this call is that a value the regex
        # cannot search (i.e. a non-string) raises here.
        cls.CHARSET_RE.search(original_value)
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the ``content`` value to use when the document is
        encoded as ``eventual_encoding``.

        The text after ``charset=`` is replaced with the encoding name.
        If the encoding is listed in `PYTHON_SPECIFIC_ENCODINGS`, the
        whole ``charset=...`` portion is removed instead.
        """
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            return self.CHARSET_RE.sub("", self.original_value)

        # Group 1 is everything up to and including "charset=", so only
        # the encoding name itself is swapped out.
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + eventual_encoding,
            self.original_value,
        )

350 

351 

352class PageElement(object): 

353 """An abstract class representing a single element in the parse tree. 

354 

355 `NavigableString`, `Tag`, etc. are all subclasses of 

356 `PageElement`. For this reason you'll see a lot of methods that 

357 return `PageElement`, but you'll never see an actual `PageElement` 

358 object. For the most part you can think of `PageElement` as 

359 meaning "a `Tag` or a `NavigableString`." 

360 """ 

361 

362 #: In general, we can't tell just by looking at an element whether 

363 #: it's contained in an XML document or an HTML document. But for 

364 #: `Tag` objects (q.v.) we can store this information at parse time. 

365 #: :meta private: 

366 known_xml: Optional[bool] = None 

367 

368 #: Whether or not this element has been decomposed from the tree 

369 #: it was created in. 

370 _decomposed: bool 

371 

372 parent: Optional[Tag] 

373 next_element: _AtMostOneElement 

374 previous_element: _AtMostOneElement 

375 next_sibling: _AtMostOneElement 

376 previous_sibling: _AtMostOneElement 

377 

378 #: Whether or not this element is hidden from generated output. 

379 #: Only the `BeautifulSoup` object itself is hidden. 

380 hidden: bool = False 

381 

382 def setup( 

383 self, 

384 parent: Optional[Tag] = None, 

385 previous_element: _AtMostOneElement = None, 

386 next_element: _AtMostOneElement = None, 

387 previous_sibling: _AtMostOneElement = None, 

388 next_sibling: _AtMostOneElement = None, 

389 ) -> None: 

390 """Sets up the initial relations between this element and 

391 other elements. 

392 

393 :param parent: The parent of this element. 

394 

395 :param previous_element: The element parsed immediately before 

396 this one. 

397 

398 :param next_element: The element parsed immediately after 

399 this one. 

400 

401 :param previous_sibling: The most recently encountered element 

402 on the same level of the parse tree as this one. 

403 

404 :param previous_sibling: The next element to be encountered 

405 on the same level of the parse tree as this one. 

406 """ 

407 self.parent = parent 

408 

409 self.previous_element = previous_element 

410 if self.previous_element is not None: 

411 self.previous_element.next_element = self 

412 

413 self.next_element = next_element 

414 if self.next_element is not None: 

415 self.next_element.previous_element = self 

416 

417 self.next_sibling = next_sibling 

418 if self.next_sibling is not None: 

419 self.next_sibling.previous_sibling = self 

420 

421 if ( 

422 previous_sibling is None 

423 and self.parent is not None 

424 and self.parent.contents 

425 ): 

426 previous_sibling = self.parent.contents[-1] 

427 

428 self.previous_sibling = previous_sibling 

429 if self.previous_sibling is not None: 

430 self.previous_sibling.next_sibling = self 

431 

def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str:
    """Run a string through a formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of the
        standard formatters. ``None`` means the string is returned
        unchanged.
    :return: The result of the formatter's ``substitute()`` call.
    """
    if formatter is None:
        return s
    resolved = (
        formatter
        if isinstance(formatter, Formatter)
        else self.formatter_for_name(formatter)
    )
    return resolved.substitute(s)

444 

def formatter_for_name(
    self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction]
) -> Formatter:
    """Look up or create a Formatter for the given identifier,
    if necessary.

    :param formatter_name: Can be a `Formatter` object (used as-is), a
        function (used as the entity substitution hook for a new
        `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter`), or a string (used to look
        up a `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter` in the appropriate registry).
        The XML or HTML flavour is chosen from ``self._is_xml``.
    """
    if isinstance(formatter_name, Formatter):
        return formatter_name

    formatter_class: type[Formatter] = XMLFormatter if self._is_xml else HTMLFormatter
    if callable(formatter_name):
        return formatter_class(entity_substitution=formatter_name)
    return formatter_class.REGISTRY[formatter_name]

472 

@property
def _is_xml(self) -> bool:
    """Is this element part of an XML tree or an HTML tree?

    This is used in formatter_for_name, when deciding whether an
    XMLFormatter or HTMLFormatter is more appropriate. It can be
    inefficient (it may walk up the chain of parents), but it should
    be called very rarely.
    """
    known = self.known_xml
    if known is not None:
        # Most of the time this was determined when the document
        # was parsed.
        return known

    # Otherwise the element was probably created by calling the
    # constructor directly from the user's Python code, so ask the
    # enclosing element.
    container = self.parent
    if container is not None:
        return container._is_xml

    # This is the top-level object. It should have .known_xml set from
    # tree creation. If not, take a guess--BS is usually used on HTML
    # markup.
    return getattr(self, "is_xml", False)

495 

496 nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0") 

497 previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0") 

498 

def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
    """Deep-copy hook; this implementation is a placeholder that
    always raises.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError()

501 

def __copy__(self) -> Self:
    """Return a copy of this element.

    A copy of a PageElement can only be a deep copy, because only one
    PageElement can occupy a given place in a parse tree, so this
    simply delegates to ``__deepcopy__`` with a fresh memo.
    """
    fresh_memo: Dict[Any, Any] = {}
    return self.__deepcopy__(fresh_memo)

507 

#: Sentinel default for the ``types`` argument of the string-gathering
#: methods: an empty tuple.
default: Iterable[type[NavigableString]] = ()  #: :meta private:


def _all_strings(
    self, strip: bool = False, types: Iterable[type[NavigableString]] = default
) -> Iterator[str]:
    """Yield all strings of certain classes, possibly stripping them.

    This is implemented differently in `Tag` and `NavigableString`;
    this version only defines the interface.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError()

518 

@property
def stripped_strings(self) -> Iterator[str]:
    """Yield all interesting strings in this PageElement, stripping
    them first.

    See `Tag` for information on which strings are considered
    interesting in a given context.
    """
    # Passing True asks _all_strings to strip each string.
    yield from self._all_strings(True)

529 

def get_text(
    self,
    separator: str = "",
    strip: bool = False,
    types: Iterable[Type[NavigableString]] = default,
) -> str:
    """Join every child string of this PageElement into one string.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)


# Alternate spellings of get_text: a camelCase method name and a
# read-only property.
getText = get_text
text = property(get_text)

557 

def replace_with(self, *args: _InsertableElement) -> Self:
    """Replace this `PageElement` with one or more other elements or
    objects, keeping the rest of the tree the same.

    :param args: The replacements, inserted in the given order at the
        position this element occupied in its parent.
    :return: This `PageElement`, no longer part of the tree.
    :raises ValueError: If this element has no parent, or if one of
        the replacements is this element's parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree."
        )
    if len(args) == 1 and args[0] is self:
        # Replacing an element with itself is a no-op.
        return self
    for candidate in args:
        if candidate is container:
            raise ValueError("Cannot replace a Tag with its parent.")

    position = container.index(self)
    # The index is already known, so hand it to extract() rather than
    # have it searched for a second time.
    self.extract(_self_index=position)
    for replacement in args:
        container.insert(position, replacement)
        position += 1
    return self

580 

581 replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0") 

582 

def wrap(self, wrap_inside: Tag) -> Tag:
    """Wrap this `PageElement` inside a `Tag`.

    The tag first takes this element's place via ``replace_with``, and
    then this element is appended to the tag.

    :return: ``wrap_inside``, occupying the position in the tree that used
        to be occupied by this object, and with this object now inside it.
    """
    wrap_inside.append(self.replace_with(wrap_inside))
    return wrap_inside

592 

def extract(self, _self_index: Optional[int] = None) -> Self:
    """Destructively rips this element out of the tree.

    The element is deleted from its parent's ``contents`` (when it has
    a parent). The ``previous_element``/``next_element`` chain is then
    joined across the gap left by this element and its descendants,
    the neighbouring siblings are linked to each other, and this
    element's own ``parent``, ``previous_element``,
    ``previous_sibling`` and ``next_sibling`` references are set to
    None, as is the ``next_element`` of its last descendant.

    :param _self_index: The location of this element in its parent's
       .contents, if known. Passing this in allows for a performance
       optimization.

    :return: this `PageElement`, no longer part of the tree.
    """
    if self.parent is not None:
        if _self_index is None:
            # Not supplied by the caller, so look it up.
            _self_index = self.parent.index(self)
        del self.parent.contents[_self_index]

    # Find the two elements that would be next to each other if
    # this element (and any children) hadn't been parsed. Connect
    # the two.
    last_child = self._last_descendant()

    # last_child can't be None because we passed accept_self=True
    # into _last_descendant. Worst case, last_child will be
    # self. Making this cast removes several mypy complaints later
    # on as we manipulate last_child.
    last_child = cast(PageElement, last_child)
    next_element = last_child.next_element

    # The identity checks below skip a link that would make an element
    # its own neighbour.
    if self.previous_element is not None:
        if self.previous_element is not next_element:
            self.previous_element.next_element = next_element
    if next_element is not None and next_element is not self.previous_element:
        next_element.previous_element = self.previous_element
    # Cut the extracted subtree loose at both ends of the chain.
    self.previous_element = None
    last_child.next_element = None

    self.parent = None
    # Join the former siblings to each other, again skipping the case
    # where both sides are the same object.
    if (
        self.previous_sibling is not None
        and self.previous_sibling is not self.next_sibling
    ):
        self.previous_sibling.next_sibling = self.next_sibling
    if (
        self.next_sibling is not None
        and self.next_sibling is not self.previous_sibling
    ):
        self.next_sibling.previous_sibling = self.previous_sibling
    self.previous_sibling = self.next_sibling = None
    return self

640 

def decompose(self) -> None:
    """Recursively destroys this `PageElement` and its children.

    The element will be removed from the tree and wiped out; so
    will everything beneath it.

    The behavior of a decomposed `PageElement` is undefined and you
    should never use one for anything, but if you need to *check*
    whether an element has been decomposed, you can use the
    `PageElement.decomposed` property.
    """
    self.extract()
    node: _AtMostOneElement = self
    while node is not None:
        # Grab the successor first: clearing __dict__ below wipes the
        # instance's attributes, including the link to the next element.
        following: _AtMostOneElement = node.next_element
        node.__dict__.clear()
        if isinstance(node, Tag):
            # Restore the minimal attributes of an empty tag.
            node.name = ""
            node.contents = []
        node._decomposed = True
        node = following

663 

def _last_descendant(
    self, is_initialized: bool = True, accept_self: bool = True
) -> _AtMostOneElement:
    """Finds the last element beneath this object to be parsed.

    A note for anyone whose type checking is tripped up by the
    _AtMostOneElement return type (rather than PageElement): the only
    time this method returns None is if `accept_self` is False and the
    `PageElement` has no children--either it's a NavigableString
    or an empty Tag.

    :param is_initialized: Has `PageElement.setup` been called on
       this `PageElement` yet?

    :param accept_self: Is ``self`` an acceptable answer to the
       question?
    """
    # Sibling links are only consulted once setup() has run.
    following_sibling = self.next_sibling if is_initialized else None
    if following_sibling is None:
        candidate = self
    else:
        # Shortcut: start from whatever was parsed just before the
        # next sibling.
        candidate = following_sibling.previous_element

    # Descend through last children until reaching a leaf.
    while isinstance(candidate, Tag) and candidate.contents:
        candidate = candidate.contents[-1]

    if candidate is self and not accept_self:
        return None
    return candidate

691 

692 _lastRecursiveChild = _deprecated_alias( 

693 "_lastRecursiveChild", "_last_descendant", "4.0.0" 

694 ) 

695 

def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate predecessor of this one.

    All the elements will have the same `PageElement.parent` as
    this one, and the given elements will occur immediately before
    this one.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.
    :raises ValueError: If this element has no parent, or if it is
        itself one of ``args``.
    """
    container = self.parent
    if container is None:
        raise ValueError("Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")

    inserted: List[PageElement] = []
    for predecessor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        # Recomputed each round: every insertion pushes this element
        # further along its parent's contents.
        position = container.index(self)
        inserted.extend(container.insert(position, predecessor))
    return inserted

722 

def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate successor of this one.

    The elements will have the same `PageElement.parent` as this
    one, and the given elements will occur immediately after this
    one, in the order they were passed in.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.
    :raises ValueError: If this element has no parent, or if it is
        itself one of ``args``.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError("Element has no parent, so 'after' has no meaning.")
    if any(x is self for x in args):
        raise ValueError("Can't insert an element after itself.")

    # Number of elements already placed after this one by this call.
    offset = 0
    results: List[PageElement] = []
    for successor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        index = parent.index(self)
        already_inserted = len(results)
        results.extend(parent.insert(index + 1 + offset, successor))
        # parent.insert() reports the elements it actually inserted.
        # Advance by that count rather than by a fixed 1, so that when
        # a single argument produces several elements (presumably an
        # argument that expands into multiple nodes -- confirm against
        # Tag.insert), later successors land after the whole group
        # instead of in the middle of it.
        offset += len(results) - already_inserted

    return results

753 

# For the suppression of this pyright warning, see discussion here:
# https://github.com/microsoft/pyright/issues/10929
#
# Typing overload: with no ``string`` filter, the declared result is
# at most one Tag.
@overload
def find_next( # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None=None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

# Typing overload: with only a ``string`` filter (no ``name`` or
# ``attrs``), the declared result is at most one NavigableString.
@overload
def find_next(
    self,
    name: None=None,
    attrs: None=None,
    string: _StrainableString="",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
    ...

def find_next(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
    """Find the first PageElement that matches the given criteria and
    appears later in the document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: Additional filters on attribute values.
    """
    # Delegate to the shared single-result helper, handing it the
    # multi-result method (find_all_next) to run the search with.
    return self._find_one(self.find_all_next, name, attrs, string, **kwargs)

# Deprecated camelCase spelling of find_next.
findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0")

797 

# Typing overload: with no ``string`` filter, the declared result is a
# collection of Tags.
@overload
def find_all_next( # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

# Typing overload: with only a ``string`` filter (no ``name`` or
# ``attrs``), the declared result is a collection of NavigableStrings.
@overload
def find_all_next(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def find_all_next(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Find all `PageElement` objects that match the given criteria and
    appear later in the document than this `PageElement`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # Search over self.next_elements via the shared helper. The stack
    # level is bumped by one to account for this extra call frame.
    return self._find_all(
        name,
        attrs,
        string,
        limit,
        self.next_elements,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )

# Deprecated camelCase spelling of find_all_next.
findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0")

855 

# Typing overload: with no ``string`` filter, the declared result is
# at most one Tag.
@overload
def find_next_sibling( # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None=None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...

# Typing overload: with only a ``string`` filter (no ``name`` or
# ``attrs``), the declared result is at most one NavigableString.
@overload
def find_next_sibling(
    self,
    name: None=None,
    attrs: None=None,
    string: _StrainableString="",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
    ...

def find_next_sibling(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
    """Find the closest sibling to this PageElement that matches the
    given criteria and appears later in the document.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    # Delegate to the shared single-result helper, handing it the
    # multi-result method (find_next_siblings) to run the search with.
    return self._find_one(self.find_next_siblings, name, attrs, string, **kwargs)

# Deprecated camelCase spelling of find_next_sibling.
findNextSibling = _deprecated_function_alias(
    "findNextSibling", "find_next_sibling", "4.0.0"
)

899 

@overload
def find_next_siblings(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags: ...


@overload
def find_next_siblings(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings: ...


def find_next_siblings(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Collect every later sibling of this `PageElement` that matches
    the given criteria.

    The candidates come from the `next_siblings` generator; the
    matching is done by `_find_all`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.next_siblings
    # This method adds one frame between the caller and _find_all, so
    # any warning issued there has to look one level further up.
    depth = _stacklevel + 1
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=depth, **kwargs
    )


# Legacy names, registered as deprecated aliases of find_next_siblings.
findNextSiblings = _deprecated_function_alias(
    "findNextSiblings",
    "find_next_siblings",
    "4.0.0",
)
fetchNextSiblings = _deprecated_function_alias(
    "fetchNextSiblings",
    "find_next_siblings",
    "3.0.0",
)

962 

@overload
def find_previous(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag: ...


@overload
def find_previous(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString: ...


def find_previous(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Walk backwards through the document from this `PageElement`
    and return the first `PageElement` that matches the given
    criteria, or None if nothing matches.

    This is the single-result counterpart of `find_all_previous`:
    the search itself is delegated to that method through
    `_find_one`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    backward_search = self.find_all_previous
    return self._find_one(backward_search, name, attrs, string, **kwargs)


# Legacy camelCase name, registered as a deprecated alias of find_previous.
findPrevious = _deprecated_function_alias(
    "findPrevious",
    "find_previous",
    "3.0.0",
)

1004 

@overload
def find_all_previous(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags: ...


@overload
def find_all_previous(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings: ...


def find_all_previous(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Walk backwards through the document from this `PageElement`
    and collect every `PageElement` that matches the given criteria.

    The candidates come from the `previous_elements` generator; the
    matching is done by `_find_all`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.previous_elements
    # This method adds one frame between the caller and _find_all, so
    # any warning issued there has to look one level further up.
    depth = _stacklevel + 1
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=depth, **kwargs
    )


# Legacy names, registered as deprecated aliases of find_all_previous.
findAllPrevious = _deprecated_function_alias(
    "findAllPrevious",
    "find_all_previous",
    "4.0.0",
)
fetchAllPrevious = _deprecated_function_alias(
    "fetchAllPrevious",
    "find_all_previous",
    "3.0.0",
)

1067 

@overload
def find_previous_sibling(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag: ...


@overload
def find_previous_sibling(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    **kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString: ...


def find_previous_sibling(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Return the nearest earlier sibling of this `PageElement` that
    matches the given criteria, or None if there is no such sibling.

    This is the single-result counterpart of
    `find_previous_siblings`: the search itself is delegated to that
    method through `_find_one`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    sibling_search = self.find_previous_siblings
    return self._find_one(sibling_search, name, attrs, string, **kwargs)


# Legacy camelCase name, registered as a deprecated alias of
# find_previous_sibling.
findPreviousSibling = _deprecated_function_alias(
    "findPreviousSibling",
    "find_previous_sibling",
    "4.0.0",
)

1113 

@overload
def find_previous_siblings(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags: ...


@overload
def find_previous_siblings(
    self,
    name: None = None,
    attrs: None = None,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings: ...


def find_previous_siblings(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Collect every earlier sibling of this `PageElement` that
    matches the given criteria.

    The candidates come from the `previous_siblings` generator; the
    matching is done by `_find_all`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.previous_siblings
    # This method adds one frame between the caller and _find_all, so
    # any warning issued there has to look one level further up.
    depth = _stacklevel + 1
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=depth, **kwargs
    )


# Legacy names, registered as deprecated aliases of find_previous_siblings.
findPreviousSiblings = _deprecated_function_alias(
    "findPreviousSiblings",
    "find_previous_siblings",
    "4.0.0",
)
fetchPreviousSiblings = _deprecated_function_alias(
    "fetchPreviousSiblings",
    "find_previous_siblings",
    "3.0.0",
)

1176 

def find_parent(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    """Return the closest parent of this `PageElement` that matches
    the given criteria, or None if no parent matches.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.
    """
    # NOTE: _find_one can't be used here: it passes a ``string``
    # argument to the method it calls, and find_parents doesn't take one.
    #
    # limit=1 stops the search at the first match. _stacklevel=3 is
    # passed so that warnings skip over this wrapper's own frame.
    matches = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None


# Legacy camelCase name, registered as a deprecated alias of find_parent.
findParent = _deprecated_function_alias(
    "findParent",
    "find_parent",
    "4.0.0",
)

1206 

def find_parents(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    """Collect every parent of this `PageElement` that matches the
    given criteria.

    The candidates come from the `parents` generator; the matching is
    done by `_find_all`, with no string filter.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    found = self._find_all(
        name,
        attrs,
        None,
        limit,
        self.parents,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )
    # Only Tags can have children, so every parent is a Tag and the
    # result can safely be narrowed for the type checker.
    return cast(ResultSet[Tag], found)


# Legacy names, registered as deprecated aliases of find_parents.
findParents = _deprecated_function_alias(
    "findParents",
    "find_parents",
    "4.0.0",
)
fetchParents = _deprecated_function_alias(
    "fetchParents",
    "find_parents",
    "3.0.0",
)

1235 

@property
def next(self) -> _AtMostOneElement:
    """Shorthand for `next_element`: the `PageElement`, if any, that
    was parsed just after this one."""
    following = self.next_element
    return following

1240 

@property
def previous(self) -> _AtMostOneElement:
    """Shorthand for `previous_element`: the `PageElement`, if any,
    that was parsed just before this one."""
    preceding = self.previous_element
    return preceding

1245 

1246 # These methods do the real heavy lifting. 

1247 

1248 def _find_one( 

1249 self, 

1250 # TODO-TYPING: "There is no syntax to indicate optional or 

1251 # keyword arguments; such function types are rarely used 

1252 # as callback types." - So, not sure how to get more 

1253 # specific here. 

1254 method: Callable, 

1255 name: _FindMethodName, 

1256 attrs: Optional[_StrainableAttributes], 

1257 string: Optional[_StrainableString], 

1258 **kwargs: _StrainableAttribute, 

1259 ) -> _AtMostOneElement: 

1260 r: _AtMostOneElement = None 

1261 results: _QueryResults = method(name, attrs, string, 1, _stacklevel=4, **kwargs) 

1262 if results: 

1263 r = results[0] 

1264 return r 

1265 

def _find_all(
    self,
    name: _FindMethodName,
    attrs: Optional[_StrainableAttributes],
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Iterate over a generator, collecting the elements that match.

    :param name: A filter on tag name, or an `ElementFilter` to use
        as the matcher directly.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The source of candidate elements.
    :param _stacklevel: The ``stacklevel`` handed to `warnings.warn`
        for the warnings issued here.
    :kwargs: Additional filters on attribute values.
    """
    # 'text' is the old name for 'string'. It is honoured only when
    # 'string' itself was not given.
    if string is None and "text" in kwargs:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    # '_class' is most likely a misspelling of 'class_'; warn, but
    # still treat it as an ordinary attribute filter.
    if "_class" in kwargs:
        hint = AttributeResemblesVariableWarning.MESSAGE % {
            "original": "_class",
            "autocorrect": "class_",
        }
        warnings.warn(
            hint, AttributeResemblesVariableWarning, stacklevel=_stacklevel
        )

    # Imported at call time rather than at the top of the module.
    from bs4.filter import ElementFilter

    matcher = (
        name
        if isinstance(name, ElementFilter)
        else SoupStrainer(name, attrs, string, **kwargs)
    )

    # With no string filter, no limit and no attribute filters, the
    # two most common queries can be answered without the matcher.
    if string is None and not limit and not attrs and not kwargs:
        tags: MutableSequence[_OneElement]
        if name is True or name is None:
            # Optimization to find all tags.
            tags = []
            for candidate in generator:
                if isinstance(candidate, Tag):
                    tags.append(candidate)
            return ResultSet(matcher, tags)

        if isinstance(name, str):
            # Optimization to find all tags with a given name.
            prefix, colon, remainder = name.partition(":")
            if colon and ":" not in remainder:
                # Exactly one colon: this is a name with a prefix. If
                # this is a namespace-aware document, we need to match
                # the local name against tag.name. If not, we need to
                # match the fully-qualified name against tag.name.
                local_name = remainder
            else:
                prefix = None
                local_name = name
            tags = [
                candidate
                for candidate in generator
                if isinstance(candidate, Tag)
                and (
                    candidate.name == name
                    or (
                        candidate.name == local_name
                        and (prefix is None or candidate.prefix == prefix)
                    )
                )
            ]
            return ResultSet(matcher, tags)

    return matcher.find_all(generator, limit)

1331 

1332 # These generators can be used to navigate starting from both 

1333 # NavigableStrings and Tags. 

@property
def next_elements(self) -> Iterator[PageElement]:
    """Yield every PageElement that was parsed after this one, in
    document order."""
    node = self.next_element
    while node is not None:
        # Read the link to the following element *before* yielding,
        # so the walk continues even if the consumer changes the
        # yielded element's links.
        node, current = node.next_element, node
        yield current

1342 

@property
def self_and_next_elements(self) -> Iterator[PageElement]:
    """Yield this PageElement, followed by every PageElement that was
    parsed after it."""
    later = self.next_elements
    return self._self_and(later)

1347 

@property
def next_siblings(self) -> Iterator[PageElement]:
    """Yield every PageElement that is a sibling of this one but was
    parsed later.
    """
    node = self.next_sibling
    while node is not None:
        # Read the link to the following sibling *before* yielding,
        # so the walk continues even if the consumer changes the
        # yielded element's links.
        node, current = node.next_sibling, node
        yield current

1358 

@property
def self_and_next_siblings(self) -> Iterator[PageElement]:
    """Yield this PageElement, followed by all of its later siblings."""
    later = self.next_siblings
    return self._self_and(later)

1363 

@property
def previous_elements(self) -> Iterator[PageElement]:
    """Yield every PageElement that was parsed before this one,
    nearest first.

    :yield: A sequence of PageElements.
    """
    node = self.previous_element
    while node is not None:
        # Read the link to the preceding element *before* yielding,
        # so the walk continues even if the consumer changes the
        # yielded element's links.
        node, current = node.previous_element, node
        yield current

1375 

@property
def self_and_previous_elements(self) -> Iterator[PageElement]:
    """Yield this PageElement, followed by every element that was
    parsed earlier."""
    earlier = self.previous_elements
    return self._self_and(earlier)

1381 

@property
def previous_siblings(self) -> Iterator[PageElement]:
    """Yield every PageElement that is a sibling of this one but was
    parsed earlier, nearest first.

    :yield: A sequence of PageElements.
    """
    node = self.previous_sibling
    while node is not None:
        # Read the link to the preceding sibling *before* yielding,
        # so the walk continues even if the consumer changes the
        # yielded element's links.
        node, current = node.previous_sibling, node
        yield current

1394 

@property
def self_and_previous_siblings(self) -> Iterator[PageElement]:
    """Yield this PageElement, followed by all of its siblings that
    were parsed earlier."""
    earlier = self.previous_siblings
    return self._self_and(earlier)

1400 

@property
def parents(self) -> Iterator[Tag]:
    """Yield every element that is a parent of this PageElement,
    innermost first.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    node = self.parent
    while node is not None:
        # Read the link to the next parent up *before* yielding, so
        # the walk continues even if the consumer changes the yielded
        # element's links.
        node, current = node.parent, node
        yield current

1412 

@property
def self_and_parents(self) -> Iterator[PageElement]:
    """Yield this element, followed by all of its parents.

    :yield: A sequence of PageElements, ending with a BeautifulSoup object.
    """
    ancestors = self.parents
    return self._self_and(ancestors)

1420 

1421 def _self_and(self, other_generator:Iterator[PageElement]) -> Iterator[PageElement]: 

1422 """Modify a generator by yielding this element, then everything 

1423 yielded by the other generator. 

1424 """ 

1425 if not self.hidden: 

1426 yield self 

1427 for i in other_generator: 

1428 yield i 

1429 

@property
def decomposed(self) -> bool:
    """Report whether this PageElement has been decomposed.

    Reads the ``_decomposed`` attribute; a missing or falsy value is
    reported as False.
    """
    flag = getattr(self, "_decomposed", False)
    return flag if flag else False

1434 

@_deprecated("next_elements", "4.0.0")
def nextGenerator(self) -> Iterator[PageElement]:
    """Deprecated method form of the `next_elements` property.

    :meta private:
    """
    elements = self.next_elements
    return elements

1439 

@_deprecated("next_siblings", "4.0.0")
def nextSiblingGenerator(self) -> Iterator[PageElement]:
    """Deprecated method form of the `next_siblings` property.

    :meta private:
    """
    siblings = self.next_siblings
    return siblings

1444 

@_deprecated("previous_elements", "4.0.0")
def previousGenerator(self) -> Iterator[PageElement]:
    """Deprecated method form of the `previous_elements` property.

    :meta private:
    """
    elements = self.previous_elements
    return elements

1449 

@_deprecated("previous_siblings", "4.0.0")
def previousSiblingGenerator(self) -> Iterator[PageElement]:
    """Deprecated method form of the `previous_siblings` property.

    :meta private:
    """
    siblings = self.previous_siblings
    return siblings

1454 

@_deprecated("parents", "4.0.0")
def parentGenerator(self) -> Iterator[PageElement]:
    """Deprecated method form of the `parents` property.

    :meta private:
    """
    ancestors = self.parents
    return ancestors

1459 

1460 

class NavigableString(str, PageElement):
    """A Python string that is part of a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    #: A string prepended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '<!--'
    #: in an HTML comment.
    PREFIX: str = ""

    #: A string appended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '-->'
    #: in an HTML comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.

        :param value: The text of the string, either as a ``str`` or as
            bytes encoded in DEFAULT_OUTPUT_ENCODING.
        """
        if isinstance(value, str):
            u = str.__new__(cls, value)
        else:
            # Anything that isn't a str is decoded using the default
            # output encoding.
            u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        u.hidden = False
        u.setup()
        return u

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.

        :param memo: This parameter is ignored.
        :param recursive: This parameter is ignored; it's only defined
           so that NavigableString.__deepcopy__ implements the same
           signature as Tag.__deepcopy__.
        """
        return type(self)(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Return the constructor arguments used to recreate this
        object: a 1-tuple holding a plain ``str`` copy of its text."""
        return (str(self),)

    # TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
    # is introduced in 3.8. This can be changed once 3.7 support is dropped.
    def __getitem__(self, key: Union[int, slice]) -> str:  # type:ignore
        """Index or slice this string like a normal ``str``.

        :param key: An integer index or a slice.
        :raise TypeError: If ``key`` is a string. This usually means
            the caller is treating a NavigableString like a Tag
            (e.g. ``string["href"]``), so the message says so.
        """
        if isinstance(key, str):
            raise TypeError("string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(key.__class__.__name__))
        return super(NavigableString, self).__getitem__(key)

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: This property always returns the `NavigableString` it was
           called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Run the string through the provided formatter, making it
        ready for output as part of an HTML or XML document.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        :return: The formatted string, wrapped in this class's
            `PREFIX` and `SUFFIX`.
        """
        output = self.format_string(self, formatter)
        return self.PREFIX + output + self.SUFFIX

    @property
    def name(self) -> None:
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Prevent NavigableString.name from ever being set.

        :raise AttributeError: Always.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This makes it easy for NavigableString to implement methods
        like get_text() as conveniences, creating a consistent
        text-extraction API across all PageElements.

        :param strip: If True, all strings will be stripped before being
            yielded.

        :param types: A single NavigableString subclass, or a
            collection of them. If this NavigableString isn't exactly
            one of those classes, the sequence will be empty. The
            default is `Tag.MAIN_CONTENT_STRING_TYPES`.

        :yield: A sequence that either contains this string, or is empty.
            A string that is empty (after stripping, if requested) is
            not yielded.
        """
        if types is self.default:
            # This is kept in Tag because it's full of subclasses of
            # this class, which aren't defined until later in the file.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        # Do nothing if the caller is looking for specific types of
        # string, and we're of a different type.
        #
        # We check specific types instead of using isinstance(self,
        # types) because all of these classes subclass
        # NavigableString. Anyone who's using this feature probably
        # wants generic NavigableStrings but not other stuff.
        my_type = type(self)
        if types is not None:
            if isinstance(types, type):
                # Looking for a single type.
                if my_type is not types:
                    return
            elif my_type not in types:
                # Looking for one of a list of types.
                return

        final_value = self.strip() if strip else self
        if len(final_value) > 0:
            yield final_value

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        This is defined the way it is for compatibility with
        `Tag.strings`. See `Tag` for information on which strings are
        interesting in a given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()

1617 

1618 

class PreformattedString(NavigableString):
    """A `NavigableString` not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (`Comment`) and CDATA blocks (`CData`). Subclasses
    set `PREFIX` and `SUFFIX` to the delimiters that surround the
    string in the output.
    """

    #: Text emitted immediately before the string's own content.
    PREFIX: str = ""
    #: Text emitted immediately after the string's own content.
    SUFFIX: str = ""

    def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
        """Make this string ready for output by adding any subclass-specific
        prefix or suffix.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters. The string will be passed into the
            `Formatter`, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # Called only for its side effects; the content itself is
            # output unmodified.
            self.format_string(self, formatter)
        opened = self.PREFIX + self
        return opened + self.SUFFIX

1644 

1645 

class CData(PreformattedString):
    """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_.

    On output, the content is wrapped in ``<![CDATA[`` and ``]]>``.
    """

    #: Opening delimiter of a CDATA section.
    PREFIX: str = "<![CDATA["
    #: Closing delimiter of a CDATA section.
    SUFFIX: str = "]]>"

1651 

1652 

class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction.

    On output, the content is wrapped in ``<?`` and ``>``.
    """

    #: Opening delimiter of a processing instruction.
    PREFIX: str = "<?"
    #: Closing delimiter of an SGML processing instruction.
    SUFFIX: str = ">"

1658 

1659 

class XMLProcessingInstruction(ProcessingInstruction):
    """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_.

    On output, the content is wrapped in ``<?`` and ``?>``.
    """

    #: Opening delimiter of a processing instruction.
    PREFIX: str = "<?"
    #: Closing delimiter of an XML processing instruction.
    SUFFIX: str = "?>"

1665 

1666 

class Comment(PreformattedString):
    """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_.

    On output, the content is wrapped in ``<!--`` and ``-->``.
    """

    #: Opening delimiter of a comment.
    PREFIX: str = "<!--"
    #: Closing delimiter of a comment.
    SUFFIX: str = "-->"

1672 

1673 

class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_.

    On output, the content is wrapped in ``<?`` and ``?>``.
    """

    #: Opening delimiter of a declaration.
    PREFIX: str = "<?"
    #: Closing delimiter of a declaration.
    SUFFIX: str = "?>"

1679 

1680 

class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_.

    On output, the content is wrapped in ``<!DOCTYPE `` and ``>``
    followed by a newline.
    """

    #: Opening delimiter of a document type declaration.
    PREFIX: str = "<!DOCTYPE "
    #: Closing delimiter of a document type declaration, including the
    #: newline that follows it.
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        :return: A `Doctype` whose text is built by
            `_string_for_name_and_ids`.
        """
        declaration = cls._string_for_name_and_ids(name, pub_id, system_id)
        return Doctype(declaration)

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Generate a string to be used as the basis of a Doctype object.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :param name: The name of the document's root element. A falsy
            value is treated as the empty string.
        :param pub_id: The public identifier, or None.
        :param system_id: The system identifier, or None.
        :return: The name, followed by ``PUBLIC "<pub_id>"`` (plus the
            quoted system ID, if any) when a public ID is given, or by
            ``SYSTEM "<system_id>"`` when only a system ID is given.
        """
        value = name or ""
        if pub_id is None:
            # No public identifier: a system identifier, if present,
            # needs the SYSTEM keyword.
            if system_id is not None:
                value += ' SYSTEM "%s"' % system_id
            return value

        value += ' PUBLIC "%s"' % pub_id
        if system_id is not None:
            # After a public identifier the system identifier follows
            # directly, without a keyword of its own.
            value += ' "%s"' % system_id
        return value

1719 

1720 

class Stylesheet(NavigableString):
    """A `NavigableString` holding the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    This class adds no behavior of its own. It exists so that embedded
    stylesheets can be told apart from textual content.
    """

1728 

1729 

class Script(NavigableString):
    """A `NavigableString` holding the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    This class adds no behavior of its own. It exists so that
    executable code can be told apart from textual content.
    """

1738 

1739 

class TemplateString(NavigableString):
    """A `NavigableString` holding a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    This class adds no behavior of its own. It exists so that such
    strings can be told apart from the main body of the document.
    """

1747 

1748 

class RubyTextString(NavigableString):
    """A `NavigableString` holding the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    This class adds no behavior of its own. It can be used to tell
    such strings apart from the strings they're annotating.
    """

1756 

1757 

class RubyParenthesisString(NavigableString):
    """A `NavigableString` holding the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.

    This class adds no behavior of its own.
    """

1762 

1763 

1764class Tag(PageElement): 

1765 """An HTML or XML tag that is part of a parse tree, along with its 

1766 attributes, contents, and relationships to other parts of the tree. 

1767 

1768 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will 

1769 create a `Tag` object representing the ``<b>`` tag. You can 

1770 instantiate `Tag` objects directly, but it's not necessary unless 

1771 you're adding entirely new markup to a parsed document. Most of 

1772 the constructor arguments are intended for use by the `TreeBuilder` 

1773 that's parsing a document. 

1774 

1775 :param parser: A `BeautifulSoup` object representing the parse tree this 

1776 `Tag` will be part of. 

1777 :param builder: The `TreeBuilder` being used to build the tree. 

1778 :param name: The name of the tag. 

1779 :param namespace: The URI of this tag's XML namespace, if any. 

1780 :param prefix: The prefix for this tag's XML namespace, if any. 

1781 :param attrs: A dictionary of attribute values. 

1782 :param parent: The `Tag` to use as the parent of this `Tag`. May be 

1783 the `BeautifulSoup` object itself. 

1784 :param previous: The `PageElement` that was parsed immediately before 

1785 parsing this tag. 

1786 :param is_xml: If True, this is an XML tag. Otherwise, this is an 

1787 HTML tag. 

1788 :param sourceline: The line number where this tag was found in its 

1789 source document. 

1790 :param sourcepos: The character position within ``sourceline`` where this 

1791 tag was found. 

1792 :param can_be_empty_element: If True, this tag should be 

1793 represented as <tag/>. If False, this tag should be represented 

1794 as <tag></tag>. 

1795 :param cdata_list_attributes: A dictionary of attributes whose values should 

1796 be parsed as lists of strings if they ever show up on this tag. 

1797 :param preserve_whitespace_tags: Names of tags whose contents 

1798 should have their whitespace preserved if they are encountered inside 

1799 this tag. 

1800 :param interesting_string_types: When iterating over this tag's 

1801 string contents in methods like `Tag.strings` or 

1802 `PageElement.get_text`, these are the types of strings that are 

1803 interesting enough to be considered. By default, 

1804 `NavigableString` (normal strings) and `CData` (CDATA 

1805 sections) are the only interesting string subtypes. 

1806 :param namespaces: A dictionary mapping currently active 

1807 namespace prefixes to URIs, as of the point in the parsing process when 

1808 this tag was encountered. This can be used later to 

1809 construct CSS selectors. 

1810 

1811 """ 

1812 

def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
) -> None:
    """Set up the tag's identity, attributes, tree links and
    builder-derived settings.

    The meaning of each argument is described in the class docstring.

    When a ``builder`` is given, the values of ``is_xml``,
    ``can_be_empty_element``, ``cdata_list_attributes``,
    ``preserve_whitespace_tags`` and ``interesting_string_types``
    passed in here are ignored in favor of what the builder reports.

    :raises ValueError: If ``name`` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    # A falsy value (None or an empty dict) is replaced with a fresh dict.
    self._namespaces = namespaces or {}
    self.prefix = prefix
    # NOTE(review): both branches below perform the same assignments,
    # so builder.store_line_numbers currently has no effect on what is
    # stored here -- confirm whether the else branch was meant to
    # store None instead.
    if (not builder or builder.store_line_numbers) and (
        sourceline is not None or sourcepos is not None
    ):
        self.sourceline = sourceline
        self.sourcepos = sourcepos
    else:
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    # Pick the container classes for attributes: the builder's
    # choices if there is a builder, otherwise defaults based on is_xml.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    else:
        if builder is not None and builder.cdata_list_attributes:
            # Let the builder produce the attribute mapping for this
            # tag name from the raw values.
            self.attrs = builder._replace_cdata_list_attribute_values(
                self.name, attrs
            )
        else:
            self.attrs = attr_dict_class()
            # Make sure that the values of any multi-valued
            # attributes (e.g. when a Tag is copied) are stored in
            # new lists.
            for k, v in attrs.items():
                if isinstance(v, list):
                    v = v.__class__(v)
                self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    # Link this tag to its parent and to the previously parsed element.
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        # (This re-assigns the same class that was already stored above.)
        self.attribute_value_list_class = builder.attribute_value_list_class
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES

1933 

# Declared types of the instance attributes assigned in __init__.
# The class of the parser passed to __init__ (the parser object itself
# is not stored), or None if no parser was given.
parser_class: Optional[type[BeautifulSoup]]
name: str
namespace: Optional[str]
prefix: Optional[str]
attrs: _AttributeValues
sourceline: Optional[int]
sourcepos: Optional[int]
# Whether this is an XML tag; None when that was not specified.
known_xml: Optional[bool]
# The direct children of this tag, in order.
contents: List[PageElement]
hidden: bool
interesting_string_types: Optional[Set[Type[NavigableString]]]

# Settings that come from the TreeBuilder when there is one, and from
# the constructor arguments otherwise.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]

# Deprecated spelling of `parser_class`.
#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")

1952 

def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """Return a new Tag that is detached from any parse tree.

    :param memo: The memo dictionary of the deepcopy protocol; it is
        handed on unchanged when copying descendants.
    :param recursive: If True, the copy also receives copies of all of
        this Tag's descendants. If False, only the Tag itself is copied.
    """
    clone = self.copy_self()
    if not recursive:
        return clone

    # Walk the descendants as a flat stream of events instead of
    # recursing. The last entry of open_tags is the copy that is
    # currently receiving children.
    open_tags: List[Tag] = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # The tag on top of the stack was closed; later elements
            # belong to its parent.
            open_tags.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        # Attach the copy to the copy of its parent.
        open_tags[-1].append(copied)
        if event is Tag.START_ELEMENT_EVENT:
            # This tag has children of its own coming up; they must be
            # appended to its copy.
            open_tags.append(cast(Tag, copied))
    return clone

1978 

def copy_self(self) -> Self:
    """Build an empty duplicate of this Tag that belongs to no parse tree.

    The duplicate has the same name, namespace, prefix, attributes and
    settings as this Tag, but none of its contents. Deep-copying a Tag
    starts with this step; it can also be used alone when the
    children are not wanted.
    """
    factory = type(self)
    # No parser and no builder are passed, so the settings are taken
    # from the keyword arguments below.
    duplicate = factory(
        None,
        None,
        self.name,
        self.namespace,
        self.prefix,
        self.attrs,
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    # Carry over state explicitly, whatever the constructor did with it.
    duplicate.can_be_empty_element = self.can_be_empty_element
    duplicate.hidden = self.hidden
    return duplicate

2006 

@property
def is_empty_element(self) -> bool:
    """Whether this tag is an empty-element (self-closing) tag.

    A tag with any contents is never an empty-element tag.

    For a tag without contents the answer is the tag's
    ``can_be_empty_element`` setting, which normally comes from the
    `TreeBuilder` that created the tag. A builder with a designated
    list of empty-element tags only allows tags named in that list
    (the usual case for HTML); a builder without such a list allows
    any tag without contents (the usual case for XML).
    """
    if len(self.contents) > 0:
        return False
    return self.can_be_empty_element is True

2025 

@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    """Deprecated spelling of `is_empty_element`.

    :meta private:
    """
    empty = self.is_empty_element
    return empty

2030 

@property
def string(self) -> Optional[str]:
    """The one string inside this `Tag`, if there is exactly one.

    :return: When the only child is a `NavigableString`, that
       string. When the only child is a `Tag`, that child's own
       `Tag.string` (so the lookup continues downwards). When there
       are no children, or more than one, ``None``.

    An unexpected ``None`` usually means the `Tag` contains more
    than one thing.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    if isinstance(only_child, NavigableString):
        return only_child
    if isinstance(only_child, Tag):
        # Keep looking inside the single child tag.
        return only_child.string
    return None

@string.setter
def string(self, string: str) -> None:
    """Make a single string the entire `Tag.contents` of this `Tag`."""
    self.clear()
    # A NavigableString (or subclass) keeps its own class; a plain
    # str is wrapped in a NavigableString.
    new_class = string.__class__ if isinstance(string, NavigableString) else NavigableString
    self.append(new_class(string))

2064 

# The string classes used as the "interesting" types when a tag has
# no more specific setting of its own.
#: :meta private:
MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}

2067 

def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Generate the strings beneath this tag that belong to certain classes.

    :param strip: If True, each string is stripped before it is
       yielded, and strings that are empty after stripping are
       left out.

    :param types: A NavigableString subclass, or a collection of
       them. Strings whose exact class is not among them are
       skipped. When not given, self.interesting_string_types is
       used; when that is None as well, only NavigableString and
       CData objects are considered, which excludes comments,
       processing instructions and the like.
    """
    if types is self.default:
        types = self.interesting_string_types
        if types is None:
            types = self.MAIN_CONTENT_STRING_TYPES

    for descendant in self.descendants:
        if not isinstance(descendant, NavigableString):
            continue
        string_class = type(descendant)
        if isinstance(types, type):
            # A single class was given: require an exact match.
            wanted = string_class is types
        else:
            # None places no restriction on the string's class.
            wanted = types is None or string_class in types
        if not wanted:
            continue
        if not strip:
            yield descendant
            continue
        text = descendant.strip()
        if len(text) != 0:
            yield text

strings = property(_all_strings)

2110 

def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once.

    :param position: The numeric position that should be occupied
       in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    for new_child in new_children:
        newly_inserted = self._insert(position, new_child)
        inserted.extend(newly_inserted)
        # One argument can turn into several elements (a BeautifulSoup
        # object is replaced by its children) or into none at all.
        # Advance by the number of elements actually inserted so the
        # next argument lands after them rather than in between them.
        position += len(newly_inserted)
    return inserted

2129 

def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element into this tag's contents and rewire
    the sibling and element links around it.

    :param position: The desired index in ``self.contents``; values
        past the end are clamped to the end.
    :param new_child: The element to insert. A plain ``str`` is
        wrapped in a `NavigableString`. A `BeautifulSoup` object is
        replaced by its children.
    :return: A list of the elements that ended up inserted. For a
        `BeautifulSoup` argument this is whatever `insert` returns
        for its children; otherwise a one-element list.
    :raises ValueError: If ``new_child`` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    # Imported here rather than at module level.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the BeautifulSoup's children and
        # return them.
        return self.insert(position, *list(new_child.contents))
    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
            elif current_index == position:
                # We're 'inserting' an element into its current location.
                # This is a no-op.
                return [new_child]
        # Detach the element from its current parent before
        # linking it in here.
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: it has no previous sibling, and the element
        # right before it is this tag.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(
        is_initialized=False, accept_self=True
    )
    # new_childs_last_element can't be None because we passed
    # accept_self=True into _last_descendant. Worst case,
    # new_childs_last_element will be new_child itself. Making
    # this cast removes several mypy complaints later on as we
    # manipulate new_childs_last_element.
    new_childs_last_element = cast(PageElement, new_childs_last_element)

    if position >= len(self.contents):
        # The new child becomes the last child of this tag.
        new_child.next_sibling = None

        # Walk up through this tag and its ancestors until one of
        # them has a next sibling.
        parent: Optional[Tag] = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # The new child goes in front of an existing child.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = (
            new_childs_last_element
        )
    self.contents.insert(position, new_child)

    return [new_child]

2216 

def unwrap(self) -> Self:
    """Remove this `PageElement` from the tree, leaving its contents
    in its place.

    :return: This object, which is no longer part of the tree.
    :raises ValueError: If this element has no parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = container.index(self)
    self.extract(_self_index=slot)
    # Insert from the last child to the first, always at the same
    # index, so the children end up in their original order. A
    # reversed copy is iterated because inserting a child elsewhere
    # takes it out of self.contents.
    for child in self.contents[::-1]:
        container.insert(slot, child)
    return self

replace_with_children = unwrap

2235 

@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    """Deprecated spelling of `unwrap`.

    :meta private:
    """
    unwrapped = self.unwrap()
    return unwrapped

2240 

def append(self, tag: _InsertableElement) -> PageElement|List[PageElement]:
    """Add the given `PageElement` to the end of this `Tag`'s contents.

    :param tag: A PageElement. A BeautifulSoup object is not added
        itself; all of its contents are inserted instead, because
        one BeautifulSoup object can't contain another.

    :return: The appended object, or, when `tag` was a BeautifulSoup
        object, the list of all objects that were appended.
    """
    added = self.insert(len(self.contents), tag)
    # TODO: can't reference BeautifulSoup class in this module, so a
    # BeautifulSoup object is recognized by its special tag name.
    came_from_document = isinstance(tag, Tag) and tag.name == "[document]"
    return added if came_from_document else added[0]

2257 

def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`.

    :return: The list of PageElements that were appended.
    :raises TypeError: If ``tags`` is neither a `Tag`, a single
        `PageElement` or string, nor an iterable.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        # Snapshot the contents: appending each child here removes
        # it from the other tag.
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        tag_list = list(tags)
    else:
        # Without this branch tag_list would be unbound and the loop
        # below would fail with a confusing UnboundLocalError.
        raise TypeError(
            "Tag.extend expects a Tag or an iterable of elements, not %s."
            % type(tags).__name__
        )

    results: List[PageElement] = []
    for tag in tag_list:
        appended = self.append(tag)
        if isinstance(appended, list):
            # This can happen if you pass in a mixture of Tag and BeautifulSoup objects.
            results.extend(appended)
        else:
            results.append(appended)

    return results

2299 

def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    :param decompose: If False (the default), each child is removed
        with `PageElement.extract`. If True, the more destructive
        `PageElement.decompose` is called on each child instead.
    """
    # Work on a copy: removing a child changes self.contents.
    snapshot = self.contents[:]
    for child in snapshot:
        if not decompose:
            child.extract()
        else:
            child.decompose()

2313 

def smooth(self) -> None:
    """Merge runs of adjacent strings among this `Tag`'s children,
    and do the same inside every child `Tag`.

    After many tree-modifying operations, calling this can make
    pretty-printed output look more natural.
    """
    # First pass: record the index of the first member of every
    # adjacent pair that must be merged. Recording indexes avoids
    # copying self.contents, since usually few strings are affected.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Child tags are smoothed recursively.
            current.smooth()
        if position + 1 == len(self.contents):
            # The last child has nothing after it to merge with.
            continue
        following = self.contents[position + 1]
        both_strings = isinstance(current, NavigableString) and isinstance(
            following, NavigableString
        )
        any_preformatted = isinstance(current, PreformattedString) or isinstance(
            following, PreformattedString
        )
        if both_strings and not any_preformatted:
            merge_points.append(position)

    # Second pass: merge from the end towards the start, so that
    # removing children doesn't shift the indexes still to be handled.
    for position in reversed(merge_points):
        first = cast(NavigableString, self.contents[position])
        second = cast(NavigableString, self.contents[position + 1])
        second.extract()
        merged = NavigableString(first + second)
        first.replace_with(merged)

2353 

def index(self, element: PageElement) -> int:
    """Return the position of a child within this `Tag`'s contents.

    Children are compared by identity rather than by value, so two
    children that are equal as strings are not confused with each
    other.

    :param element: The `PageElement` to look for in this object's contents.
    :raises ValueError: If ``element`` is not one of the children.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None,
    )
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position

2366 

def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Look up the value of the attribute named 'key' on this tag,
    falling back to 'default' when the tag has no such attribute.

    :param key: The attribute to look for.
    :param default: The value to return if this `Tag` does not
        have the attribute.
    """
    attributes = self.attrs
    return attributes.get(key, default)

2379 

def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Like get(), except that the result is always a list, which may be empty.

    :param key: The attribute to look for.
    :param default: The value to use if this `Tag` does not have
        the attribute.
    :return: A list of strings, usually empty or containing only a single
        value.
    """
    found = self.get(key, default)
    if found is None:
        # Nothing found and no default: an empty list.
        return self.attribute_value_list_class()
    if isinstance(found, list):
        # Already a list: hand it back as it is.
        return found
    # A single value: wrap it. The cast only informs the type checker.
    single = found if isinstance(found, str) else cast(str, found)
    return self.attribute_value_list_class([single])

2402 

def has_attr(self, key: str) -> bool:
    """Report whether this `Tag` has an attribute named ``key``."""
    attributes = self.attrs
    return key in attributes

2406 

def __hash__(self) -> int:
    """Hash a Tag by the hash of its string rendering."""
    return hash(str(self))

2409 

def __getitem__(self, key: str) -> _AttributeValue:
    """tag[key] gives the value of the Tag's 'key' attribute; a
    missing attribute raises whatever the attribute mapping raises."""
    attributes = self.attrs
    return attributes[key]

2414 

def __iter__(self) -> Iterator[PageElement]:
    "Iterating over a Tag walks through its direct contents."
    children = self.contents
    return iter(children)

2418 

def __len__(self) -> int:
    "A Tag is as long as its list of contents."
    children = self.contents
    return len(children)

2422 

def __contains__(self, x: Any) -> bool:
    "``x in tag`` checks whether x is among the Tag's contents."
    children = self.contents
    return x in children

2425 

def __bool__(self) -> bool:
    """A Tag is always truthy, even when it has no contents.

    Without this method, truthiness would fall back to __len__ and
    a Tag without contents would be falsy.
    """
    return True

2429 

def __setitem__(self, key: str, value: _AttributeValue) -> None:
    """tag[key] = value stores ``value`` as the tag's 'key'
    attribute."""
    attributes = self.attrs
    attributes[key] = value

2434 

def __delitem__(self, key: str) -> None:
    """del tag[key] removes the tag's 'key' attribute.

    Deleting an attribute the tag doesn't have is not an error.
    """
    attributes = self.attrs
    attributes.pop(key, None)

2438 

# Overload for a tag search (no ``string`` given): the result is
# typed as a collection of Tags.
@overload
def __call__(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...

# Overload for a string search (only ``string`` given): the result is
# typed as a collection of NavigableStrings.
@overload
def __call__(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...

def __call__(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
    """Calling a Tag like a function is the same as calling its
    find_all() method. Eg. tag('a') returns a list of all the A tags
    found within this tag.

    Every argument is forwarded to find_all(). The three branches
    below make the same kind of call; they are kept apart so that
    each return value can carry a precise static type.
    """
    if string is not None and (name is not None or attrs is not None or kwargs):
        # A string filter combined with tag criteria.
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        return cast(ResultSet[Tag], self.find_all(name, attrs, recursive, string, limit, _stacklevel, **kwargs)) #type: ignore

    if string is None:
        # If string is None, we're searching for tags.
        tags:ResultSet[Tag] = self.find_all(
            name, attrs, recursive, None, limit, _stacklevel, **kwargs
        )
        return tags

    # Otherwise, we're searching for strings.
    # (name and attrs are None and kwargs is empty at this point.)
    strings:ResultSet[NavigableString] = self.find_all(
        None, None, recursive, string, limit, _stacklevel, **kwargs
    )
    return strings

2496 

def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Treat tag.subtag as shorthand for tag.find(name="subtag").

    :raises AttributeError: For names starting with a double
        underscore, and for "contents".
    """
    result: _AtMostOneElement
    if len(subtag) > 3 and subtag.endswith("Tag"):
        # BS3 compatibility: soup.aTag means soup.find("a").
        tag_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % dict(name=tag_name),
            DeprecationWarning,
            stacklevel=2,
        )
        result = self.find(tag_name)
        return result

    # "contents" is refused here as well, to avoid recursion.
    if subtag.startswith("__") or subtag == "contents":
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, subtag)
        )

    result = self.find(subtag)
    return result

2519 

def __eq__(self, other: Any) -> bool:
    """Two Tags are equal iff they have the same name, the same
    attributes, and equal contents (compared recursively)."""
    if self is other:
        return True
    if not isinstance(other, Tag):
        return False
    for required in ("name", "attrs", "contents"):
        if not hasattr(other, required):
            return False
    if self.name != other.name:
        return False
    if self.attrs != other.attrs:
        return False
    if len(self) != len(other):
        return False
    # Same number of children: compare them pairwise, stopping at
    # the first difference.
    return not any(
        my_child != other.contents[i]
        for i, my_child in enumerate(self.contents)
    )

2540 

def __ne__(self, other: Any) -> bool:
    """The negation of __eq__: true iff this Tag and `other` are
    not equal."""
    same = self == other
    return not same

2545 

def __repr__(self) -> str:
    """Return the markup of this `Tag`, as produced by decode() with
    its default arguments."""
    rendered = self.decode()
    return rendered

# str() and the legacy __unicode__ hook give the same rendering.
__str__ = __unicode__ = __repr__

2551 

def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and everything inside it as a bytestring.

    :param encoding: The encoding used to produce the bytestring.
        It is also passed to decode(), so it may affect the text of
        the document, specifically any encoding declarations in it.
    :param indent_level: The number of levels by which every line
        of the rendering is indented. (What a 'level' amounts to, in
        spaces or other characters, is up to the ``formatter``.)
        Used internally in recursive calls while pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. It is handed to :py:meth:`str.encode`
        and should be one of the `error handling constants defined
        by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    # Render to a Unicode string first, then encode that string.
    return self.decode(indent_level, encoding, formatter).encode(encoding, errors)

2581 

def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    The rendering is built from the flat event sequence produced by
    `_event_stream` rather than by recursive calls.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing. ``None`` disables pretty-printing.
    :param eventual_encoding: The encoding you intend to use when
        converting the string to a bytestring. decode() is *not*
        responsible for performing that encoding. This information
        is needed so that a real encoding can be substituted in if
        the document contains an encoding declaration (e.g. in a
        <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
        naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :return: The concatenation of every rendered piece.
    """
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # A literal True is accepted and treated as indentation level 0.
    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        # Step 1: render the element itself, without any extra whitespace.
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            # Dedent *before* the closing tag is indented, so it lines
            # up with its opening tag.
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # We are about to enter string literal mode. Add
            # whitespace before this tag, but not after. We
            # will stay in string literal mode until this tag
            # is closed.
            indent_before = True
            indent_after = False
            string_literal_tag = element
        elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # We are about to exit string literal mode by closing
            # the tag that sent us into that mode. Add whitespace
            # after this tag, but not before.
            indent_before = False
            indent_after = True
            string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if indent_before or indent_after:
                # Strings are stripped first, and a piece that ends up
                # empty is appended as-is, with no indentation or newline.
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            # Everything after an opening tag is one level deeper, until
            # the matching END_ELEMENT_EVENT dedents again.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)

2690 

class _TreeTraversalEvent:
    """Marker type for the events emitted while walking a parse tree.

    Instances carry no data of their own; the constants created
    below are used as distinct sentinel values.

    :meta private:
    """


# Sentinels for the different kinds of event yielded by _event_stream.
START_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:

2703 

def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Flatten this element into a stream of (event, element) pairs.

    The stream carries enough information to rebuild the nested
    structure of the element (for instance when rendering it as a
    string) with a plain loop instead of recursive method calls.

    The idea resembles the SAX API, but this is a smaller,
    internal-only interface: the events are the ``*_ELEMENT_EVENT``
    constants on `Tag`, and each one is paired with the Beautiful
    Soup object it refers to.

    :param iterator: An alternate iterator to use when traversing
        the tree. Defaults to `self_and_descendants`.
    """
    # Tags that have been opened but not yet closed, innermost last.
    open_tags: List[Tag] = []

    elements = iterator or self.self_and_descendants

    for node in elements:
        # Any open tag that is not the parent of the upcoming node
        # must have ended before that node appeared, so close it.
        while open_tags and node.parent != open_tags[-1]:
            yield Tag.END_ELEMENT_EVENT, open_tags.pop()

        if not isinstance(node, Tag):
            yield Tag.STRING_ELEMENT_EVENT, node
        elif node.is_empty_element:
            # No END event is ever produced for these.
            yield Tag.EMPTY_ELEMENT_EVENT, node
        else:
            yield Tag.START_ELEMENT_EVENT, node
            open_tags.append(node)

    # The iterator is exhausted: close whatever is still open.
    while open_tags:
        yield Tag.END_ELEMENT_EVENT, open_tags.pop()

2747 

def _indent_string(
    self,
    s: str,
    indent_level: int,
    formatter: Formatter,
    indent_before: bool,
    indent_after: bool,
) -> str:
    """Wrap a string in pretty-printing whitespace.

    :param s: The string to amend with whitespace.
    :param indent_level: The indentation level; ``formatter.indent``
        is repeated this many times in front of the string.
    :param formatter: Supplies the ``indent`` unit.
    :param indent_before: Whether or not to add indentation
        before the string.
    :param indent_after: Whether or not to add a newline after
        the string.
    :return: ``s`` with the requested whitespace attached.
    """
    leading = formatter.indent * indent_level if (indent_before and indent_level) else ""
    trailing = "\n" if indent_after else ""
    return leading + s + trailing

2775 

def _format_tag(
    self, eventual_encoding: str, formatter: Formatter, opening: bool
) -> str:
    """Render the opening or closing markup for this tag.

    :param eventual_encoding: Passed to ``substitute_encoding`` for
        attribute values that are
        `AttributeValueWithCharsetSubstitution` instances.
    :param formatter: Supplies the attribute list, attribute value
        processing, quoting, and the void-element closing prefix.
    :param opening: True to render the opening tag (with
        attributes), False to render the closing tag.
    :return: The markup, or an empty string for a hidden tag.
    """
    if self.hidden:
        # A hidden tag renders nothing itself; its contents are
        # rendered separately.
        return ""

    # The pieces below are assembled as:
    #   < [/] [prefix:] name [attributes] [void-close] >
    slash = "" if opening else "/"
    namespace = self.prefix + ":" if self.prefix else ""

    # Attributes only appear on an opening tag.
    rendered_attrs = []
    if opening:
        for key, val in formatter.attributes(self):
            if val is None:
                # A valueless attribute is rendered as its bare name.
                rendered_attrs.append(key)
                continue

            if isinstance(val, (list, tuple)):
                val = " ".join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.substitute_encoding(eventual_encoding)

            text = formatter.attribute_value(val)
            rendered_attrs.append(
                str(key) + "=" + formatter.quoted_attribute_value(text)
            )
    attribute_string = " " + " ".join(rendered_attrs) if rendered_attrs else ""

    # An optional closing slash (for a void element in an XML
    # document).
    void_slash = (
        (formatter.void_element_close_prefix or "") if self.is_empty_element else ""
    )

    return "".join(
        ("<", slash, namespace, self.name, attribute_string, void_slash, ">")
    )

2837 

def _should_pretty_print(self, indent_level: int = 1) -> bool:
    """Decide whether this tag may be pretty-printed.

    Most tags may, but a tag whose name is listed in
    ``preserve_whitespace_tags`` (such as <pre> in HTML documents)
    may not.

    :param indent_level: Passing ``None`` always yields False.
    """
    if indent_level is None:
        return False
    if not self.preserve_whitespace_tags:
        return True
    return self.name not in self.preserve_whitespace_tags

2848 

@overload
def prettify(
    self,
    encoding: None = None,
    formatter: _FormatterOrName = "minimal",
) -> str:
    ...


@overload
def prettify(
    self,
    encoding: _Encoding,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    ...


def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Return a pretty-printed rendering of this `Tag`.

    Rendering always starts at indentation level 0.

    :param encoding: The encoding of the bytestring, or None if you want Unicode.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A string (if no ``encoding`` is provided) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)

2882 

def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render what is inside this tag as a Unicode string.

    This is `decode` run over `descendants`, so the tag's own
    opening and closing markup is left out.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is *not*
        responsible for performing that encoding. This information
        is needed so that a real encoding can be substituted in if
        the document contains an encoding declaration (e.g. in a
        <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    """
    inner_elements = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=inner_elements
    )

2910 

def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render what is inside this PageElement as a bytestring.

    The contents are first rendered with `decode_contents` and the
    resulting string is then encoded.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing.
    :param encoding: The bytestring will be in this encoding.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    """
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)

2930 

@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated BS3-compatible front end to `encode_contents`.

    ``indentLevel`` is only honoured when ``prettyPrint`` is true;
    otherwise the contents are rendered without pretty-printing.

    :meta private:
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_indent, encoding=encoding)

2945 

2946 # Soup methods 

2947 

@overload
def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
    ...


@overload
def find(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
) -> _AtMostOneNavigableString:
    ...


def find(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> Union[_AtMostOneTag, _AtMostOneNavigableString, _AtMostOneElement]:
    """Return the first PageElement beneath this one that matches
    the given criteria, or None if nothing matches.

    This is `find_all` with a limit of 1.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    """
    # The three cases are kept apart so that each call lines up with
    # one way of calling find_all; the casts record which kind of
    # element each case is expected to produce.
    if string is None:
        # No string filter: a search for tags.
        tags = self.find_all(name, attrs, recursive, None, 1, _stacklevel=3, **kwargs)
        return cast(Tag, tags[0]) if tags else None

    if name is not None or attrs is not None or kwargs:
        # A string filter combined with tag criteria.
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        elements = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs)  # type:ignore
        return cast(Tag, elements[0]) if elements else None

    # Only a string filter: a search for strings.
    strings = self.find_all(None, None, recursive, string, 1, _stacklevel=3, **kwargs)
    return cast(NavigableString, strings[0]) if strings else None

3007 

# Legacy name for find(), built with _deprecated_function_alias
# (imported from bs4._deprecation).
# NOTE(review): the last argument looks like the version in which the
# alias was deprecated -- confirm against bs4._deprecation.
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")

3009 

@overload
def find_all(  # pyright: ignore [reportOverlappingOverload]
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: None = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeTags:
    ...


@overload
def find_all(
    self,
    name: None = None,
    attrs: None = None,
    recursive: bool = True,
    string: _StrainableString = "",
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
    ...


def find_all(
    self,
    name: _FindMethodName = None,
    attrs: Optional[_StrainableAttributes] = None,
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> Union[_SomeTags, _SomeNavigableStrings, _QueryResults]:
    """Return every `PageElement` beneath this one that matches
    the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # Search everything below this element, or just its direct
    # children when the search is not recursive.
    generator = self.descendants if recursive else self.children
    # One more frame now sits between the caller and _find_all.
    _stacklevel += 1

    if string is None:
        # If string is None, we're searching for tags.
        return cast(
            ResultSet[Tag],
            self._find_all(
                name, attrs, None, limit, generator, _stacklevel=_stacklevel, **kwargs
            ),
        )

    if name is not None or attrs is not None or kwargs:
        # A string filter combined with tag criteria.
        # TODO: Using the @overload decorator to express the three ways you
        # could get into this path is way too much code for a rarely(?) used
        # feature.
        return cast(
            ResultSet[Tag],
            self._find_all(
                name, attrs, string, limit, generator, _stacklevel=_stacklevel, **kwargs
            ),
        )

    # Otherwise, we're searching for strings.
    return cast(
        ResultSet[NavigableString],
        self._find_all(
            None, None, string, limit, generator, _stacklevel=_stacklevel, **kwargs
        ),
    )

3085 

# Legacy names for find_all(), built with _deprecated_function_alias
# (imported from bs4._deprecation).
# NOTE(review): the last argument looks like the version in which each
# alias was deprecated -- confirm against bs4._deprecation.
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")

3088 

3089 # Generator methods 

@property
def children(self) -> Iterator[PageElement]:
    """Iterate over the direct children of this `PageElement`,
    i.e. the items of ``contents``, in order.
    """
    return (child for child in self.contents)

3094 

@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` together with its descendants.

    The `descendants` iterator is handed to ``_self_and``, which
    produces the combined sequence.
    """
    below = self.descendants
    return self._self_and(below)

3101 

@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`.

    The walk starts at the first child and follows ``next_element``
    links until it reaches the element that comes after this tag's
    last descendant. A tag with no contents yields nothing.
    """
    if not self.contents:
        return

    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, the last descendant will end
    # up as self.
    final = cast(PageElement, self._last_descendant(accept_self=True))
    stop_at = final.next_element

    node: _AtMostOneElement = self.contents[0]
    while node is not None and node is not stop_at:
        # Look up the successor before handing the node to the
        # consumer, so the walk resumes from the link that existed
        # at that moment.
        upcoming = node.next_element
        yield node
        node = upcoming

3119 

3120 # CSS selector code 

def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Run a CSS selector against the current element and return a
    single result.

    This is a shortcut for ``self.css.select_one(...)``.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)

3137 

def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Run a CSS selector against the current element and return
    all results.

    This uses the SoupSieve library, and is a shortcut for
    ``self.css.select(...)``.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)

3162 

@property
def css(self) -> CSS:
    """Return a new `CSS` object wrapping this element, which
    serves as the interface to the CSS selector API.
    """
    selector_api = CSS(self)
    return selector_api

3167 

3168 # Old names for backwards compatibility 

@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of the `children` property.

    :meta private:
    """
    direct_children = self.children
    return direct_children

3176 

@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of the `descendants` property.

    :meta private:
    """
    everything_below = self.descendants
    return everything_below

3184 

@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated spelling of `has_attr`.

    The old name was misleading: has_key() looked at attributes,
    whereas the ``in`` operator looks at contents. has_key() no
    longer exists in Python 3 in any case.

    :meta private:
    """
    attribute_present = self.has_attr(key)
    return attribute_present

3195 

3196 

_PageElementT = TypeVar("_PageElementT", bound=PageElement)


class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects: the search results gathered by
    matching an :py:class:`ElementFilter` against a parse tree.
    """

    # The filter that produced these results, if any.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Fill the list from ``result`` and remember ``source``.

        :param source: The filter the results were matched against.
        :param result: The matching elements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Turn a failed attribute lookup into an exception whose
        message explains a common mistake and how to fix it.

        :raises AttributeError: Always.
        """
        message = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(message)

3218 

3219# Now that all the classes used by SoupStrainer have been defined, 

3220# import SoupStrainer itself into this module to preserve the 

3221# backwards compatibility of anyone who imports 

3222# bs4.element.SoupStrainer. 

3223from bs4.filter import SoupStrainer # noqa: E402