Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 39%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

969 statements  

1from __future__ import annotations 

2 

3# Use of this source code is governed by the MIT license. 

4__license__ = "MIT" 

5 

6import re 

7import warnings 

8 

9from bs4.css import CSS 

10from bs4._deprecation import ( 

11 _deprecated, 

12 _deprecated_alias, 

13 _deprecated_function_alias, 

14) 

15from bs4.formatter import ( 

16 Formatter, 

17 HTMLFormatter, 

18 XMLFormatter, 

19) 

20from bs4._warnings import AttributeResemblesVariableWarning 

21 

22from typing import ( 

23 Any, 

24 Callable, 

25 Dict, 

26 Generic, 

27 Iterable, 

28 Iterator, 

29 List, 

30 Mapping, 

31 Optional, 

32 Pattern, 

33 Set, 

34 TYPE_CHECKING, 

35 Tuple, 

36 Type, 

37 TypeVar, 

38 Union, 

39 cast, 

40) 

41from typing_extensions import ( 

42 Self, 

43 TypeAlias, 

44) 

45 

46if TYPE_CHECKING: 

47 from bs4 import BeautifulSoup 

48 from bs4.builder import TreeBuilder 

49 from bs4.filter import ElementFilter 

50 from bs4.formatter import ( 

51 _EntitySubstitutionFunction, 

52 _FormatterOrName, 

53 ) 

54 from bs4._typing import ( 

55 _AtMostOneElement, 

56 _AttributeValue, 

57 _AttributeValues, 

58 _Encoding, 

59 _InsertableElement, 

60 _OneElement, 

61 _QueryResults, 

62 _RawOrProcessedAttributeValues, 

63 _StrainableElement, 

64 _StrainableAttribute, 

65 _StrainableAttributes, 

66 _StrainableString, 

67 ) 

68 

69_OneOrMoreStringTypes: TypeAlias = Union[ 

70 Type["NavigableString"], Iterable[Type["NavigableString"]] 

71] 

72 

73_FindMethodName: TypeAlias = Optional[Union["_StrainableElement", "ElementFilter"]] 

74 

75# Deprecated module-level attributes. 

76# See https://peps.python.org/pep-0562/ 

77_deprecated_names = dict( 

78 whitespace_re="The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy." 

79) 

80#: :meta private: 

81_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+") 

82 

83 

def __getattr__(name: str) -> Any:
    """Module-level attribute hook (PEP 562) for deprecated names.

    :param name: The attribute being looked up on this module.
    :return: The module global called ``_deprecated_<name>``, after
        a `DeprecationWarning` has been issued.
    :raises AttributeError: If ``name`` is not listed in
        ``_deprecated_names``.
    """
    if name not in _deprecated_names:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # stacklevel=2 points the warning at the code that touched the
    # deprecated attribute rather than at this hook.
    template = _deprecated_names[name]
    warnings.warn(template.format(name=name), DeprecationWarning, stacklevel=2)
    return globals()[f"_deprecated_{name}"]

91 

92 

93#: Documents output by Beautiful Soup will be encoded with 

94#: this encoding unless you specify otherwise. 

95DEFAULT_OUTPUT_ENCODING: str = "utf-8" 

96 

97#: A regular expression that can be used to split on whitespace. 

98nonwhitespace_re: Pattern[str] = re.compile(r"\S+") 

99 

100#: These encodings are recognized by Python (so `Tag.encode` 

101#: could theoretically support them) but XML and HTML don't recognize 

102#: them (so they should not show up in an XML or HTML document as that 

103#: document's encoding). 

104#: 

105#: If an XML document is encoded in one of these encodings, no encoding 

106#: will be mentioned in the XML declaration. If an HTML document is 

107#: encoded in one of these encodings, and the HTML document has a 

108#: <meta> tag that mentions an encoding, the encoding will be given as 

109#: the empty string. 

110#: 

111#: Source: 

112#: Python documentation, `Python Specific Encodings <https://docs.python.org/3/library/codecs.html#python-specific-encodings>`_ 

PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = {
    # Underscore and hyphen spellings are both listed explicitly.
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "raw-unicode-escape",
    "string_escape",
    "string-escape",
    "undefined",
    "unicode_escape",
    "unicode-escape",
}

129 

130 

class NamespacedAttribute(str):
    """A string for a namespaced attribute name such as ``xml:lang``.

    Besides its string value, the object keeps the pieces it was
    built from: the prefix (``xml``), the local name (``lang``) and
    the namespace.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        """Build the string value from ``prefix`` and ``name``.

        :param prefix: The namespace prefix, if any.
        :param name: The local name. Any falsy value is stored as None.
        :param namespace: Stored on the object as-is.
        """
        if name:
            # With no prefix the attribute is not really namespaced,
            # so the local name alone is the string value.
            text = prefix + ":" + name if prefix else name
        else:
            # This is the default namespace. Its name "has no value"
            # per https://www.w3.org/TR/xml-names/#defaulting
            name = None
            text = prefix

        obj = str.__new__(cls, text)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj

163 

164 

class AttributeValueWithCharsetSubstitution(str):
    """Abstract base for a character encoding named inside an HTML
    ``<meta>`` tag.

    There is one subclass for each place the encoding can appear:
    the ``charset`` attribute (`CharsetMetaAttributeValue`) and the
    ``content`` attribute (`ContentMetaAttributeValue`).

    This allows Beautiful Soup to replace that part of the HTML file
    with a different encoding when outputting a tree as a string.
    """

    # The original, un-encoded value of the attribute.
    #: :meta private:
    original_value: str

    def substitute_encoding(self, eventual_encoding: str) -> str:
        """Return this attribute value rewritten to mention
        ``eventual_encoding``.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :raises NotImplementedError: Always; subclasses override this.
        """
        raise NotImplementedError

187 

188 

class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``charset``
    attribute.

    When Beautiful Soup parses the markup ``<meta charset="utf8">``, the
    value of the ``charset`` attribute will become one of these objects.

    When the document is later encoded, `substitute_encoding` supplies
    the name of the new encoding to use in place of this value.
    """

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from.

        :param original_value: The attribute value as found in the markup.
        """
        # The original value isn't used by this class, but it is kept
        # in case the user wants to know what it was.
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the value ``charset`` should have once the document
        is encoded as ``eventual_encoding``.

        :param eventual_encoding: The encoding being converted to.
        :return: The empty string if the encoding is listed in
            `PYTHON_SPECIFIC_ENCODINGS`, otherwise the encoding name.
        """
        return "" if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS else eventual_encoding

215 

216 

class AttributeValueList(List[str]):
    """The list type used to hold the values of a multi-valued
    attribute (such as HTML's 'class').

    This class adds nothing to a regular list. It exists so that you
    can subclass it and pass your subclass to the TreeBuilder
    constructor as attribute_value_list_class, to have your subclass
    instantiated instead.
    """

224 

225 

class AttributeDict(Dict[Any, Any]):
    """Superclass for the dictionary used to hold a tag's
    attributes. You can use this, but it's just a regular dict with no
    special logic.
    """


class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the XML spec.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply with
        the XML spec.

        This just means converting common non-string values to
        strings: XML attributes may have "any literal string as a
        value."

        :param key: The attribute name.
        :param value: The new value. None is stored as the empty
            string; ints and floats are stored as ``str(value)``;
            bools and everything else are stored unchanged.
        """
        if value is None:
            value = ""
        if isinstance(value, bool):
            # XML does not define any rules for boolean attributes.
            # Preserve the old Beautiful Soup behavior (a bool that
            # gets converted to a string on output) rather than
            # guessing what the value should be. This check must come
            # before the numeric one because bool is a subclass of int.
            pass
        elif isinstance(value, (int, float)):
            # It's dangerous to convert _every_ attribute value into a
            # plain string, since an attribute value may be a more
            # sophisticated string-like object
            # (e.g. CharsetMetaAttributeValue). But we can definitely
            # convert numeric values, which are the most common.
            value = str(value)

        super().__setitem__(key, value)


class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply
        with the HTML spec.

        :param key: The attribute name.
        :param value: The new value. ``False`` and ``None`` remove the
            attribute; ``True`` stores the attribute's own (unqualified)
            name; ints and floats are stored as ``str(value)``;
            everything else is stored unchanged.
        """
        # Identity checks, not ``value in (False, None)``: membership
        # uses ==, and 0 == False, so a numeric zero would otherwise be
        # treated as "remove the attribute" instead of becoming "0".
        if value is False or value is None:
            # 'The values "true" and "false" are not allowed on
            # boolean attributes. To represent a false value, the
            # attribute has to be omitted altogether.'
            if key in self:
                del self[key]
            return
        if isinstance(value, bool):
            # 'If the [boolean] attribute is present, its value must
            # either be the empty string or a value that is an ASCII
            # case-insensitive match for the attribute's canonical
            # name, with no leading or trailing whitespace.'
            #
            # [fixme] It's not clear to me whether "canonical name"
            # means fully-qualified name, unqualified name, or
            # (probably not) name with namespace prefix. For now I'm
            # going with unqualified name.
            if isinstance(key, NamespacedAttribute):
                value = key.name
            else:
                value = key
        elif isinstance(value, (int, float)):
            # See note in XMLAttributeDict for the reasoning why we
            # only do this to numbers.
            value = str(value)
        super().__setitem__(key, value)

306 

307 

class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a ``<meta>`` tag's ``content``
    attribute.

    When Beautiful Soup parses the markup:
     ``<meta http-equiv="content-type" content="text/html; charset=utf8">``

    The value of the ``content`` attribute will become one of these objects.

    When the document is later encoded, `substitute_encoding` rewrites
    the ``charset=`` part of this value to name the new encoding.
    """

    #: Match the 'charset' argument inside the 'content' attribute
    #: of a <meta> tag.
    #: :meta private:
    CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from.

        :param original_value: The attribute value as found in the markup.
        """
        # NOTE(review): the match result is discarded, so the only
        # visible effect of this call is the TypeError it raises for
        # input the pattern cannot search. Confirm whether it can go.
        cls.CHARSET_RE.search(original_value)
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """Return the ``content`` value rewritten for a document that
        is being encoded as ``eventual_encoding``.

        :param eventual_encoding: The encoding being converted to.
        :return: The original value with every ``charset=...`` match
            removed if the encoding is in `PYTHON_SPECIFIC_ENCODINGS`;
            otherwise with the charset name replaced by
            ``eventual_encoding``.
        """
        source = self.original_value
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            return self.CHARSET_RE.sub("", source)

        # Group 1 is the ``charset=`` prefix including any leading
        # semicolon and whitespace; only what follows it is replaced.
        return self.CHARSET_RE.sub(
            lambda match: match.group(1) + eventual_encoding, source
        )

344 

345 

346class PageElement(object): 

347 """An abstract class representing a single element in the parse tree. 

348 

349 `NavigableString`, `Tag`, etc. are all subclasses of 

350 `PageElement`. For this reason you'll see a lot of methods that 

351 return `PageElement`, but you'll never see an actual `PageElement` 

352 object. For the most part you can think of `PageElement` as 

353 meaning "a `Tag` or a `NavigableString`." 

354 """ 

355 

356 #: In general, we can't tell just by looking at an element whether 

357 #: it's contained in an XML document or an HTML document. But for 

358 #: `Tag` objects (q.v.) we can store this information at parse time. 

359 #: :meta private: 

360 known_xml: Optional[bool] = None 

361 

362 #: Whether or not this element has been decomposed from the tree 

363 #: it was created in. 

364 _decomposed: bool 

365 

366 parent: Optional[Tag] 

367 next_element: _AtMostOneElement 

368 previous_element: _AtMostOneElement 

369 next_sibling: _AtMostOneElement 

370 previous_sibling: _AtMostOneElement 

371 

372 #: Whether or not this element is hidden from generated output. 

373 #: Only the `BeautifulSoup` object itself is hidden. 

374 hidden: bool = False 

375 

376 def setup( 

377 self, 

378 parent: Optional[Tag] = None, 

379 previous_element: _AtMostOneElement = None, 

380 next_element: _AtMostOneElement = None, 

381 previous_sibling: _AtMostOneElement = None, 

382 next_sibling: _AtMostOneElement = None, 

383 ) -> None: 

384 """Sets up the initial relations between this element and 

385 other elements. 

386 

387 :param parent: The parent of this element. 

388 

389 :param previous_element: The element parsed immediately before 

390 this one. 

391 

392 :param next_element: The element parsed immediately after 

393 this one. 

394 

395 :param previous_sibling: The most recently encountered element 

396 on the same level of the parse tree as this one. 

397 

398 :param previous_sibling: The next element to be encountered 

399 on the same level of the parse tree as this one. 

400 """ 

401 self.parent = parent 

402 

403 self.previous_element = previous_element 

404 if self.previous_element is not None: 

405 self.previous_element.next_element = self 

406 

407 self.next_element = next_element 

408 if self.next_element is not None: 

409 self.next_element.previous_element = self 

410 

411 self.next_sibling = next_sibling 

412 if self.next_sibling is not None: 

413 self.next_sibling.previous_sibling = self 

414 

415 if ( 

416 previous_sibling is None 

417 and self.parent is not None 

418 and self.parent.contents 

419 ): 

420 previous_sibling = self.parent.contents[-1] 

421 

422 self.previous_sibling = previous_sibling 

423 if self.previous_sibling is not None: 

424 self.previous_sibling.next_sibling = self 

425 

426 def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str: 

427 """Format the given string using the given formatter. 

428 

429 :param s: A string. 

430 :param formatter: A Formatter object, or a string naming one of the standard formatters. 

431 """ 

432 if formatter is None: 

433 return s 

434 if not isinstance(formatter, Formatter): 

435 formatter = self.formatter_for_name(formatter) 

436 output = formatter.substitute(s) 

437 return output 

438 

439 def formatter_for_name( 

440 self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction] 

441 ) -> Formatter: 

442 """Look up or create a Formatter for the given identifier, 

443 if necessary. 

444 

445 :param formatter: Can be a `Formatter` object (used as-is), a 

446 function (used as the entity substitution hook for an 

447 `bs4.formatter.XMLFormatter` or 

448 `bs4.formatter.HTMLFormatter`), or a string (used to look 

449 up an `bs4.formatter.XMLFormatter` or 

450 `bs4.formatter.HTMLFormatter` in the appropriate registry. 

451 

452 """ 

453 if isinstance(formatter_name, Formatter): 

454 return formatter_name 

455 c: type[Formatter] 

456 registry: Mapping[Optional[str], Formatter] 

457 if self._is_xml: 

458 c = XMLFormatter 

459 registry = XMLFormatter.REGISTRY 

460 else: 

461 c = HTMLFormatter 

462 registry = HTMLFormatter.REGISTRY 

463 if callable(formatter_name): 

464 return c(entity_substitution=formatter_name) 

465 return registry[formatter_name] 

466 

467 @property 

468 def _is_xml(self) -> bool: 

469 """Is this element part of an XML tree or an HTML tree? 

470 

471 This is used in formatter_for_name, when deciding whether an 

472 XMLFormatter or HTMLFormatter is more appropriate. It can be 

473 inefficient, but it should be called very rarely. 

474 """ 

475 if self.known_xml is not None: 

476 # Most of the time we will have determined this when the 

477 # document is parsed. 

478 return self.known_xml 

479 

480 # Otherwise, it's likely that this element was created by 

481 # direct invocation of the constructor from within the user's 

482 # Python code. 

483 if self.parent is None: 

484 # This is the top-level object. It should have .known_xml set 

485 # from tree creation. If not, take a guess--BS is usually 

486 # used on HTML markup. 

487 return getattr(self, "is_xml", False) 

488 return self.parent._is_xml 

489 

490 nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0") 

491 previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0") 

492 

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """Placeholder for deep-copying this element.

        :param memo: The memo dictionary of the deep-copy protocol.
        :param recursive: Unused in this base implementation.
        :raises NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

495 

496 def __copy__(self) -> Self: 

497 """A copy of a PageElement can only be a deep copy, because 

498 only one PageElement can occupy a given place in a parse tree. 

499 """ 

500 return self.__deepcopy__({}) 

501 

502 default: Iterable[type[NavigableString]] = tuple() #: :meta private: 

503 

    def _all_strings(
        self, strip: bool = False, types: Iterable[type[NavigableString]] = default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This is implemented differently in `Tag` and `NavigableString`.

        :param strip: Whether strings should be stripped before being
            yielded.
        :param types: The string classes to consider; defaults to the
            class-level ``default`` (an empty tuple).
        :raises NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

512 

513 @property 

514 def stripped_strings(self) -> Iterator[str]: 

515 """Yield all interesting strings in this PageElement, stripping them 

516 first. 

517 

518 See `Tag` for information on which strings are considered 

519 interesting in a given context. 

520 """ 

521 for string in self._all_strings(True): 

522 yield string 

523 

524 def get_text( 

525 self, 

526 separator: str = "", 

527 strip: bool = False, 

528 types: Iterable[Type[NavigableString]] = default, 

529 ) -> str: 

530 """Get all child strings of this PageElement, concatenated using the 

531 given separator. 

532 

533 :param separator: Strings will be concatenated using this separator. 

534 

535 :param strip: If True, strings will be stripped before being 

536 concatenated. 

537 

538 :param types: A tuple of NavigableString subclasses. Any 

539 strings of a subclass not found in this list will be 

540 ignored. Although there are exceptions, the default 

541 behavior in most cases is to consider only NavigableString 

542 and CData objects. That means no comments, processing 

543 instructions, etc. 

544 

545 :return: A string. 

546 """ 

547 return separator.join([s for s in self._all_strings(strip, types=types)]) 

548 

549 getText = get_text 

550 text = property(get_text) 

551 

552 def replace_with(self, *args: PageElement) -> Self: 

553 """Replace this `PageElement` with one or more other `PageElement`, 

554 objects, keeping the rest of the tree the same. 

555 

556 :return: This `PageElement`, no longer part of the tree. 

557 """ 

558 if self.parent is None: 

559 raise ValueError( 

560 "Cannot replace one element with another when the " 

561 "element to be replaced is not part of a tree." 

562 ) 

563 if len(args) == 1 and args[0] is self: 

564 # Replacing an element with itself is a no-op. 

565 return self 

566 if any(x is self.parent for x in args): 

567 raise ValueError("Cannot replace a Tag with its parent.") 

568 old_parent = self.parent 

569 my_index = self.parent.index(self) 

570 self.extract(_self_index=my_index) 

571 for idx, replace_with in enumerate(args, start=my_index): 

572 old_parent.insert(idx, replace_with) 

573 return self 

574 

575 replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0") 

576 

577 def wrap(self, wrap_inside: Tag) -> Tag: 

578 """Wrap this `PageElement` inside a `Tag`. 

579 

580 :return: ``wrap_inside``, occupying the position in the tree that used 

581 to be occupied by this object, and with this object now inside it. 

582 """ 

583 me = self.replace_with(wrap_inside) 

584 wrap_inside.append(me) 

585 return wrap_inside 

586 

    def extract(self, _self_index: Optional[int] = None) -> Self:
        """Destructively rips this element out of the tree.

        The element is removed from its parent's ``.contents`` (if it
        has a parent), and the next/previous element and sibling links
        around it are reconnected so the rest of the tree no longer
        refers to it.

        :param _self_index: The location of this element in its parent's
           .contents, if known. Passing this in allows for a performance
           optimization.

        :return: this `PageElement`, no longer part of the tree.
        """
        if self.parent is not None:
            if _self_index is None:
                _self_index = self.parent.index(self)
            del self.parent.contents[_self_index]

        # Find the two elements that would be next to each other if
        # this element (and any children) hadn't been parsed. Connect
        # the two.
        last_child = self._last_descendant()

        # last_child can't be None because _last_descendant is called
        # with its default accept_self=True. Worst case, last_child
        # will be self. Making this cast removes several mypy
        # complaints later on as we manipulate last_child.
        last_child = cast(PageElement, last_child)
        next_element = last_child.next_element

        # Splice this element's subtree out of the next/previous
        # element chain. The identity checks skip a link that would
        # otherwise point an element at itself.
        if self.previous_element is not None:
            if self.previous_element is not next_element:
                self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        # Detach from the parent, then join the former siblings to
        # each other, with the same guard against self-links.
        self.parent = None
        if (
            self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling
        ):
            self.previous_sibling.next_sibling = self.next_sibling
        if (
            self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling
        ):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self

634 

635 def decompose(self) -> None: 

636 """Recursively destroys this `PageElement` and its children. 

637 

638 The element will be removed from the tree and wiped out; so 

639 will everything beneath it. 

640 

641 The behavior of a decomposed `PageElement` is undefined and you 

642 should never use one for anything, but if you need to *check* 

643 whether an element has been decomposed, you can use the 

644 `PageElement.decomposed` property. 

645 """ 

646 self.extract() 

647 e: _AtMostOneElement = self 

648 next_up: _AtMostOneElement = None 

649 while e is not None: 

650 next_up = e.next_element 

651 e.__dict__.clear() 

652 if isinstance(e, Tag): 

653 e.contents = [] 

654 e._decomposed = True 

655 e = next_up 

656 

657 def _last_descendant( 

658 self, is_initialized: bool = True, accept_self: bool = True 

659 ) -> _AtMostOneElement: 

660 """Finds the last element beneath this object to be parsed. 

661 

662 Special note to help you figure things out if your type 

663 checking is tripped up by the fact that this method returns 

664 _AtMostOneElement instead of PageElement: the only time 

665 this method returns None is if `accept_self` is False and the 

666 `PageElement` has no children--either it's a NavigableString 

667 or an empty Tag. 

668 

669 :param is_initialized: Has `PageElement.setup` been called on 

670 this `PageElement` yet? 

671 

672 :param accept_self: Is ``self`` an acceptable answer to the 

673 question? 

674 """ 

675 if is_initialized and self.next_sibling is not None: 

676 last_child = self.next_sibling.previous_element 

677 else: 

678 last_child = self 

679 while isinstance(last_child, Tag) and last_child.contents: 

680 last_child = last_child.contents[-1] 

681 if not accept_self and last_child is self: 

682 last_child = None 

683 return last_child 

684 

685 _lastRecursiveChild = _deprecated_alias( 

686 "_lastRecursiveChild", "_last_descendant", "4.0.0" 

687 ) 

688 

689 def insert_before(self, *args: _InsertableElement) -> List[PageElement]: 

690 """Makes the given element(s) the immediate predecessor of this one. 

691 

692 All the elements will have the same `PageElement.parent` as 

693 this one, and the given elements will occur immediately before 

694 this one. 

695 

696 :param args: One or more PageElements. 

697 

698 :return The list of PageElements that were inserted. 

699 """ 

700 parent = self.parent 

701 if parent is None: 

702 raise ValueError("Element has no parent, so 'before' has no meaning.") 

703 if any(x is self for x in args): 

704 raise ValueError("Can't insert an element before itself.") 

705 results: List[PageElement] = [] 

706 for predecessor in args: 

707 # Extract first so that the index won't be screwed up if they 

708 # are siblings. 

709 if isinstance(predecessor, PageElement): 

710 predecessor.extract() 

711 index = parent.index(self) 

712 results.extend(parent.insert(index, predecessor)) 

713 

714 return results 

715 

716 def insert_after(self, *args: _InsertableElement) -> List[PageElement]: 

717 """Makes the given element(s) the immediate successor of this one. 

718 

719 The elements will have the same `PageElement.parent` as this 

720 one, and the given elements will occur immediately after this 

721 one. 

722 

723 :param args: One or more PageElements. 

724 

725 :return The list of PageElements that were inserted. 

726 """ 

727 # Do all error checking before modifying the tree. 

728 parent = self.parent 

729 if parent is None: 

730 raise ValueError("Element has no parent, so 'after' has no meaning.") 

731 if any(x is self for x in args): 

732 raise ValueError("Can't insert an element after itself.") 

733 

734 offset = 0 

735 results: List[PageElement] = [] 

736 for successor in args: 

737 # Extract first so that the index won't be screwed up if they 

738 # are siblings. 

739 if isinstance(successor, PageElement): 

740 successor.extract() 

741 index = parent.index(self) 

742 results.extend(parent.insert(index + 1 + offset, successor)) 

743 offset += 1 

744 

745 return results 

746 

747 def find_next( 

748 self, 

749 name: _FindMethodName = None, 

750 attrs: _StrainableAttributes = {}, 

751 string: Optional[_StrainableString] = None, 

752 **kwargs: _StrainableAttribute, 

753 ) -> _AtMostOneElement: 

754 """Find the first PageElement that matches the given criteria and 

755 appears later in the document than this PageElement. 

756 

757 All find_* methods take a common set of arguments. See the online 

758 documentation for detailed explanations. 

759 

760 :param name: A filter on tag name. 

761 :param attrs: Additional filters on attribute values. 

762 :param string: A filter for a NavigableString with specific text. 

763 :kwargs: Additional filters on attribute values. 

764 """ 

765 return self._find_one(self.find_all_next, name, attrs, string, **kwargs) 

766 

767 findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0") 

768 

769 def find_all_next( 

770 self, 

771 name: _FindMethodName = None, 

772 attrs: _StrainableAttributes = {}, 

773 string: Optional[_StrainableString] = None, 

774 limit: Optional[int] = None, 

775 _stacklevel: int = 2, 

776 **kwargs: _StrainableAttribute, 

777 ) -> _QueryResults: 

778 """Find all `PageElement` objects that match the given criteria and 

779 appear later in the document than this `PageElement`. 

780 

781 All find_* methods take a common set of arguments. See the online 

782 documentation for detailed explanations. 

783 

784 :param name: A filter on tag name. 

785 :param attrs: Additional filters on attribute values. 

786 :param string: A filter for a NavigableString with specific text. 

787 :param limit: Stop looking after finding this many results. 

788 :param _stacklevel: Used internally to improve warning messages. 

789 :kwargs: Additional filters on attribute values. 

790 """ 

791 return self._find_all( 

792 name, 

793 attrs, 

794 string, 

795 limit, 

796 self.next_elements, 

797 _stacklevel=_stacklevel + 1, 

798 **kwargs, 

799 ) 

800 

801 findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0") 

802 

803 def find_next_sibling( 

804 self, 

805 name: _FindMethodName = None, 

806 attrs: _StrainableAttributes = {}, 

807 string: Optional[_StrainableString] = None, 

808 **kwargs: _StrainableAttribute, 

809 ) -> _AtMostOneElement: 

810 """Find the closest sibling to this PageElement that matches the 

811 given criteria and appears later in the document. 

812 

813 All find_* methods take a common set of arguments. See the 

814 online documentation for detailed explanations. 

815 

816 :param name: A filter on tag name. 

817 :param attrs: Additional filters on attribute values. 

818 :param string: A filter for a `NavigableString` with specific text. 

819 :kwargs: Additional filters on attribute values. 

820 """ 

821 return self._find_one(self.find_next_siblings, name, attrs, string, **kwargs) 

822 

823 findNextSibling = _deprecated_function_alias( 

824 "findNextSibling", "find_next_sibling", "4.0.0" 

825 ) 

826 

827 def find_next_siblings( 

828 self, 

829 name: _FindMethodName = None, 

830 attrs: _StrainableAttributes = {}, 

831 string: Optional[_StrainableString] = None, 

832 limit: Optional[int] = None, 

833 _stacklevel: int = 2, 

834 **kwargs: _StrainableAttribute, 

835 ) -> _QueryResults: 

836 """Find all siblings of this `PageElement` that match the given criteria 

837 and appear later in the document. 

838 

839 All find_* methods take a common set of arguments. See the online 

840 documentation for detailed explanations. 

841 

842 :param name: A filter on tag name. 

843 :param attrs: Additional filters on attribute values. 

844 :param string: A filter for a `NavigableString` with specific text. 

845 :param limit: Stop looking after finding this many results. 

846 :param _stacklevel: Used internally to improve warning messages. 

847 :kwargs: Additional filters on attribute values. 

848 """ 

849 return self._find_all( 

850 name, 

851 attrs, 

852 string, 

853 limit, 

854 self.next_siblings, 

855 _stacklevel=_stacklevel + 1, 

856 **kwargs, 

857 ) 

858 

859 findNextSiblings = _deprecated_function_alias( 

860 "findNextSiblings", "find_next_siblings", "4.0.0" 

861 ) 

862 fetchNextSiblings = _deprecated_function_alias( 

863 "fetchNextSiblings", "find_next_siblings", "3.0.0" 

864 ) 

865 

866 def find_previous( 

867 self, 

868 name: _FindMethodName = None, 

869 attrs: _StrainableAttributes = {}, 

870 string: Optional[_StrainableString] = None, 

871 **kwargs: _StrainableAttribute, 

872 ) -> _AtMostOneElement: 

873 """Look backwards in the document from this `PageElement` and find the 

874 first `PageElement` that matches the given criteria. 

875 

876 All find_* methods take a common set of arguments. See the online 

877 documentation for detailed explanations. 

878 

879 :param name: A filter on tag name. 

880 :param attrs: Additional filters on attribute values. 

881 :param string: A filter for a `NavigableString` with specific text. 

882 :kwargs: Additional filters on attribute values. 

883 """ 

884 return self._find_one(self.find_all_previous, name, attrs, string, **kwargs) 

885 

# Legacy camelCase spelling of find_previous, registered through the
# deprecation helper.
findPrevious = _deprecated_function_alias(
    "findPrevious",
    "find_previous",
    "3.0.0",
)

887 

def find_all_previous(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Search backwards through the document, starting at this
    `PageElement`, and return every `PageElement` that satisfies the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.previous_elements
    # One more frame sits between the caller and any warning that is issued.
    depth = _stacklevel + 1
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=depth, **kwargs
    )

919 

# Legacy camelCase spellings of find_all_previous, registered through the
# deprecation helper.
findAllPrevious = _deprecated_function_alias("findAllPrevious", "find_all_previous", "4.0.0")
fetchAllPrevious = _deprecated_function_alias("fetchAllPrevious", "find_all_previous", "3.0.0")

926 

def find_previous_sibling(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Return the nearest sibling of this `PageElement` that satisfies
    the given criteria and comes earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    # find_previous_siblings performs the search; _find_one asks it for
    # a single result and unwraps it.
    searcher = self.find_previous_siblings
    return self._find_one(searcher, name, attrs, string, **kwargs)

948 

# Legacy camelCase spelling of find_previous_sibling, registered through
# the deprecation helper.
findPreviousSibling = _deprecated_function_alias("findPreviousSibling", "find_previous_sibling", "4.0.0")

952 

def find_previous_siblings(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Return every sibling of this `PageElement` that satisfies the
    given criteria and comes earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.previous_siblings
    # One more frame sits between the caller and any warning that is issued.
    depth = _stacklevel + 1
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=depth, **kwargs
    )

984 

# Legacy camelCase spellings of find_previous_siblings, registered through
# the deprecation helper.
findPreviousSiblings = _deprecated_function_alias("findPreviousSiblings", "find_previous_siblings", "4.0.0")
fetchPreviousSiblings = _deprecated_function_alias("fetchPreviousSiblings", "find_previous_siblings", "3.0.0")

991 

def find_parent(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Find the closest parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.
    :return: The first element reported by `find_parents`, or None if
        nothing matched.
    """
    # NOTE: We can't use _find_one because find_parents takes a different
    # set of arguments (it has no `string` parameter).
    #
    # The positional 1 is the `limit`; _stacklevel=3 is passed so that any
    # warning raised further down is attributed past this wrapper.
    results = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return results[0] if results else None

1019 

# Legacy camelCase spelling of find_parent, registered through the
# deprecation helper.
findParent = _deprecated_function_alias(
    "findParent",
    "find_parent",
    "4.0.0",
)

1021 

def find_parents(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Return every parent of this `PageElement` that satisfies the
    given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # There is no `string` filter here, so None is passed in that slot.
    return self._find_all(
        name,
        attrs,
        None,
        limit,
        self.parents,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )

1045 

# Legacy camelCase spellings of find_parents, registered through the
# deprecation helper.
findParents = _deprecated_function_alias(
    "findParents", "find_parents", "4.0.0"
)
fetchParents = _deprecated_function_alias(
    "fetchParents", "find_parents", "3.0.0"
)

1048 

@property
def next(self) -> _AtMostOneElement:
    """The `PageElement` parsed immediately after this one, if there is one.

    This is a read-only alias for ``next_element``.
    """
    following = self.next_element
    return following

1053 

@property
def previous(self) -> _AtMostOneElement:
    """The `PageElement` parsed immediately before this one, if there is one.

    This is a read-only alias for ``previous_element``.
    """
    preceding = self.previous_element
    return preceding

1058 

1059 # These methods do the real heavy lifting. 

1060 

def _find_one(
    self,
    # TODO-TYPING: "There is no syntax to indicate optional or
    # keyword arguments; such function types are rarely used
    # as callback types." - So, not sure how to get more
    # specific here.
    method: Callable,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Call a find_all-style method with a limit of 1 and unwrap the result.

    :param method: The find_all-style callable to invoke.
    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    :return: The first result, or None if the method found nothing.
    """
    # The positional 1 is the limit. _stacklevel is fixed at 4 for the
    # benefit of any warning issued further down the call chain.
    found: _QueryResults = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
    return found[0] if found else None

1078 

def _find_all(
    self,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Walk a generator of elements and collect the ones that match.

    :param name: A filter on tag name, or an `ElementFilter` to use
        directly as the matcher.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The elements to consider, in order.
    :param _stacklevel: The stacklevel to use for any warning issued here.
    :kwargs: Additional filters on attribute values.
    """
    # Both warnings are raised inline (not through a helper) so that
    # `_stacklevel` keeps counting frames from this function.
    if string is None and "text" in kwargs:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    if "_class" in kwargs:
        substitutions = {"original": "_class", "autocorrect": "class_"}
        warnings.warn(
            AttributeResemblesVariableWarning.MESSAGE % substitutions,
            AttributeResemblesVariableWarning,
            stacklevel=_stacklevel,
        )

    from bs4.filter import ElementFilter

    matcher = (
        name
        if isinstance(name, ElementFilter)
        else SoupStrainer(name, attrs, string, **kwargs)
    )

    # The shortcuts below only apply when the tag name is the sole criterion.
    name_is_only_criterion = (
        string is None and not limit and not attrs and not kwargs
    )

    if name_is_only_criterion and (name is True or name is None):
        # Optimization to find all tags.
        every_tag = (candidate for candidate in generator if isinstance(candidate, Tag))
        return ResultSet(matcher, every_tag)

    if name_is_only_criterion and isinstance(name, str):
        # Optimization to find all tags with a given name.
        if name.count(":") == 1:
            # This is a name with a prefix. If this is a namespace-aware
            # document, we need to match the local name against tag.name.
            # If not, we need to match the fully-qualified name against
            # tag.name.
            prefix, local_name = name.split(":", 1)
        else:
            prefix, local_name = None, name
        named_tags = [
            candidate
            for candidate in generator
            if isinstance(candidate, Tag)
            and (
                candidate.name == name
                or (
                    candidate.name == local_name
                    and (prefix is None or candidate.prefix == prefix)
                )
            )
        ]
        return ResultSet(matcher, named_tags)

    # General case: let the matcher do the filtering.
    return matcher.find_all(generator, limit)

1144 

1145 # These generators can be used to navigate starting from both 

1146 # NavigableStrings and Tags. 

@property
def next_elements(self) -> Iterator[PageElement]:
    """Every PageElement that was parsed after this one, in order."""
    current = self.next_element
    while current is not None:
        # Look up the follower before yielding, so the walk continues
        # from the link as it was when this element was reached.
        upcoming = current.next_element
        yield current
        current = upcoming

1155 

@property
def self_and_next_elements(self) -> Iterator[PageElement]:
    """This PageElement first, followed by every PageElement parsed after it."""
    later = self.next_elements
    return self._self_and(later)

1160 

@property
def next_siblings(self) -> Iterator[PageElement]:
    """Every PageElement that is a sibling of this one and was parsed
    later, in order.
    """
    current = self.next_sibling
    while current is not None:
        # Look up the follower before yielding, so the walk continues
        # from the link as it was when this sibling was reached.
        upcoming = current.next_sibling
        yield current
        current = upcoming

1171 

@property
def self_and_next_siblings(self) -> Iterator[PageElement]:
    """This PageElement first, followed by its later siblings."""
    later = self.next_siblings
    return self._self_and(later)

1176 

@property
def previous_elements(self) -> Iterator[PageElement]:
    """Every PageElement that was parsed before this one, nearest first.

    :yield: A sequence of PageElements.
    """
    current = self.previous_element
    while current is not None:
        # Look up the predecessor before yielding, so the walk continues
        # from the link as it was when this element was reached.
        earlier = current.previous_element
        yield current
        current = earlier

1188 

@property
def self_and_previous_elements(self) -> Iterator[PageElement]:
    """This PageElement first, followed by every element parsed
    before it."""
    earlier = self.previous_elements
    return self._self_and(earlier)

1194 

@property
def previous_siblings(self) -> Iterator[PageElement]:
    """Every PageElement that is a sibling of this one and was parsed
    earlier, nearest first.

    :yield: A sequence of PageElements.
    """
    current = self.previous_sibling
    while current is not None:
        # Look up the predecessor before yielding, so the walk continues
        # from the link as it was when this sibling was reached.
        earlier = current.previous_sibling
        yield current
        current = earlier

1207 

@property
def self_and_previous_siblings(self) -> Iterator[PageElement]:
    """This PageElement first, followed by its siblings that were
    parsed earlier."""
    earlier = self.previous_siblings
    return self._self_and(earlier)

1213 

@property
def parents(self) -> Iterator[Tag]:
    """Every element that is a parent of this PageElement, closest first.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    ancestor = self.parent
    while ancestor is not None:
        # Look up the next ancestor before yielding, so the walk continues
        # from the link as it was when this ancestor was reached.
        grand = ancestor.parent
        yield ancestor
        ancestor = grand

1225 

@property
def self_and_parents(self) -> Iterator[PageElement]:
    """This element first, followed by each of its parents.

    :yield: A sequence of PageElements, ending with a BeautifulSoup object.
    """
    ancestors = self.parents
    return self._self_and(ancestors)

1233 

def _self_and(self, other_generator: Iterator[PageElement]) -> Iterator[PageElement]:
    """Wrap a generator so that this element comes out first, followed
    by everything the wrapped generator yields.

    This element is left out when its ``hidden`` attribute is true.
    """
    if not self.hidden:
        yield self
    yield from other_generator

1242 

@property
def decomposed(self) -> bool:
    """Report whether this PageElement has been decomposed.

    Reads the ``_decomposed`` attribute; a missing or falsy value is
    reported as False.
    """
    flag = getattr(self, "_decomposed", False)
    return flag if flag else False

1247 

@_deprecated("next_elements", "4.0.0")
def nextGenerator(self) -> Iterator[PageElement]:
    ":meta private:"
    # Legacy method spelling; hands back the next_elements iterator.
    legacy_iterator = self.next_elements
    return legacy_iterator

1252 

@_deprecated("next_siblings", "4.0.0")
def nextSiblingGenerator(self) -> Iterator[PageElement]:
    ":meta private:"
    # Legacy method spelling; hands back the next_siblings iterator.
    legacy_iterator = self.next_siblings
    return legacy_iterator

1257 

@_deprecated("previous_elements", "4.0.0")
def previousGenerator(self) -> Iterator[PageElement]:
    ":meta private:"
    # Legacy method spelling; hands back the previous_elements iterator.
    legacy_iterator = self.previous_elements
    return legacy_iterator

1262 

@_deprecated("previous_siblings", "4.0.0")
def previousSiblingGenerator(self) -> Iterator[PageElement]:
    ":meta private:"
    # Legacy method spelling; hands back the previous_siblings iterator.
    legacy_iterator = self.previous_siblings
    return legacy_iterator

1267 

@_deprecated("parents", "4.0.0")
def parentGenerator(self) -> Iterator[PageElement]:
    ":meta private:"
    # Legacy method spelling; hands back the parents iterator.
    legacy_iterator = self.parents
    return legacy_iterator

1272 

1273 

class NavigableString(str, PageElement):
    """A Python string that lives inside a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, the text
    "penguin" is represented by a `NavigableString`.
    """

    #: A string prepended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '<!--'
    #: in an HTML comment.
    PREFIX: str = ""

    #: A string appended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '-->'
    #: in an HTML comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When a NavigableString is unpickled, this method receives the
        string encoded in DEFAULT_OUTPUT_ENCODING. That encoding has to
        be handed to the superclass's __new__, or the superclass won't
        know how to handle non-ASCII characters.
        """
        instance = (
            str.__new__(cls, value)
            if isinstance(value, str)
            else str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        )
        instance.hidden = False
        instance.setup()
        return instance

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """Copy this string: same contents and same class as the
        original, but not connected to the parse tree.

        :param recursive: This parameter is ignored; it's only defined
           so that NavigableString.__deepcopy__ implements the same
           signature as Tag.__deepcopy__.
        """
        own_class = type(self)
        return own_class(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Return the plain-string value as the sole argument for ``__new__``."""
        plain = str(self)
        return (plain,)

    # TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
    # is introduced in 3.8.
    def __getitem__(self, key: Union[int|slice]) -> str:
        """Index or slice like a normal string, but raise a more helpful
        TypeError when the key is itself a string (which is how a `Tag`
        would be indexed).
        """
        if not isinstance(key, str):
            return super().__getitem__(key)
        raise TypeError("string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(key.__class__.__name__))

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: Always the `NavigableString` the property was read from.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Pass the string through the given formatter so that it's
        ready to be output as part of an HTML or XML document.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self) -> None:
        """A NavigableString is not a Tag, so it has no .name.

        The property exists so that code like this doesn't crash when
        run on a mixture of Tag and NavigableString objects:
           [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Refuse any attempt to set NavigableString.name.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This lets NavigableString offer conveniences such as get_text(),
        giving all PageElements a consistent text-extraction API.

        :param strip: If True, the string is stripped before being
            yielded.

        :param types: A tuple of NavigableString subclasses. If this
            NavigableString isn't one of those subclasses, the
            sequence will be empty. By default, the subclasses
            considered are NavigableString and CData objects. That
            means no comments, processing instructions, etc.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # This is kept in Tag because it's full of subclasses of
            # this class, which aren't defined until later in the file.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        # Do nothing if the caller is looking for specific types of
        # string, and we're of a different type.
        #
        # Exact types are compared, rather than using isinstance(self,
        # types), because all of these classes subclass
        # NavigableString. Anyone who's using this feature probably
        # wants generic NavigableStrings but not other stuff.
        if types is not None:
            own_type = type(self)
            if isinstance(types, type):
                # Looking for a single type.
                wanted = own_type is types
            else:
                # Looking for one of a list of types.
                wanted = own_type in types
            if not wanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        Defined this way for compatibility with `Tag.strings`. See `Tag`
        for information on which strings are interesting in a given
        context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()

1430 

1431 

class PreformattedString(NavigableString):
    """A `NavigableString` that is exempt from the normal formatting rules.

    This is an abstract class for special kinds of strings, such as
    comments (`Comment`) and CDATA blocks (`CData`).
    """

    PREFIX: str = ""
    SUFFIX: str = ""

    def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
        """Prepare this string for output by wrapping it in the
        subclass-specific prefix and suffix.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters. The string will be passed into the
            `Formatter`, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        if formatter is not None:
            # Called only for its side effects; the result is discarded.
            self.format_string(self, formatter)
        wrapped = self.PREFIX + self + self.SUFFIX
        return wrapped

1457 

1458 

class CData(PreformattedString):
    """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_.

    Output is wrapped in ``<![CDATA[`` and ``]]>``.
    """

    PREFIX: str = "<![CDATA["
    SUFFIX: str = "]]>"

1464 

1465 

class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction.

    Output is wrapped in ``<?`` and ``>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = ">"

1471 

1472 

class XMLProcessingInstruction(ProcessingInstruction):
    """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_.

    Output is wrapped in ``<?`` and ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"

1478 

1479 

class Comment(PreformattedString):
    """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_.

    Output is wrapped in ``<!--`` and ``-->``.
    """

    PREFIX: str = "<!--"
    SUFFIX: str = "-->"

1485 

1486 

class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_.

    Output is wrapped in ``<?`` and ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"

1492 

1493 

class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_."""

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        :return: A new `Doctype` (always this exact class, even when
            called on a subclass).
        """
        return Doctype(cls._string_for_name_and_ids(name, pub_id, system_id))

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Generate a string to be used as the basis of a Doctype object.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :param name: The root element name; a falsy value is treated as "".
        :param pub_id: The public identifier, or None to omit it.
        :param system_id: The system identifier, or None to omit it.
        :return: ``name``, followed by a ``PUBLIC`` clause when ``pub_id``
            is given (with ``system_id`` appended if present), or by a
            ``SYSTEM`` clause when only ``system_id`` is given.
        """
        value = name or ""
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            # After a PUBLIC clause the system ID follows without its
            # own keyword.
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id
        return value

    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"

1532 

1533 

class Stylesheet(NavigableString):
    """A `NavigableString` holding the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    The separate class lets embedded stylesheets be told apart from
    textual content.
    """

1541 

1542 

class Script(NavigableString):
    """A `NavigableString` holding the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    The separate class lets executable code be told apart from textual
    content.
    """

1551 

1552 

class TemplateString(NavigableString):
    """A `NavigableString` holding a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    The separate class lets such strings be told apart from the main
    body of the document.
    """

1560 

1561 

class RubyTextString(NavigableString):
    """A NavigableString holding the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    The separate class lets such strings be told apart from the strings
    they're annotating.
    """

1569 

1570 

class RubyParenthesisString(NavigableString):
    """A NavigableString holding the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.
    """

1575 

1576 

1577class Tag(PageElement): 

1578 """An HTML or XML tag that is part of a parse tree, along with its 

1579 attributes, contents, and relationships to other parts of the tree. 

1580 

1581 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will 

1582 create a `Tag` object representing the ``<b>`` tag. You can 

1583 instantiate `Tag` objects directly, but it's not necessary unless 

1584 you're adding entirely new markup to a parsed document. Most of 

1585 the constructor arguments are intended for use by the `TreeBuilder` 

1586 that's parsing a document. 

1587 

1588 :param parser: A `BeautifulSoup` object representing the parse tree this 

1589 `Tag` will be part of. 

1590 :param builder: The `TreeBuilder` being used to build the tree. 

1591 :param name: The name of the tag. 

1592 :param namespace: The URI of this tag's XML namespace, if any. 

1593 :param prefix: The prefix for this tag's XML namespace, if any. 

1594 :param attrs: A dictionary of attribute values. 

1595 :param parent: The `Tag` to use as the parent of this `Tag`. May be 

1596 the `BeautifulSoup` object itself. 

1597 :param previous: The `PageElement` that was parsed immediately before 

1598 parsing this tag. 

1599 :param is_xml: If True, this is an XML tag. Otherwise, this is an 

1600 HTML tag. 

1601 :param sourceline: The line number where this tag was found in its 

1602 source document. 

1603 :param sourcepos: The character position within ``sourceline`` where this 

1604 tag was found. 

1605 :param can_be_empty_element: If True, this tag should be 

1606 represented as <tag/>. If False, this tag should be represented 

1607 as <tag></tag>. 

1608 :param cdata_list_attributes: A dictionary of attributes whose values should 

1609 be parsed as lists of strings if they ever show up on this tag. 

1610 :param preserve_whitespace_tags: Names of tags whose contents 

1611 should have their whitespace preserved if they are encountered inside 

1612 this tag. 

1613 :param interesting_string_types: When iterating over this tag's 

1614 string contents in methods like `Tag.strings` or 

1615 `PageElement.get_text`, these are the types of strings that are 

1616 interesting enough to be considered. By default, 

1617 `NavigableString` (normal strings) and `CData` (CDATA 

1618 sections) are the only interesting string subtypes. 

1619 :param namespaces: A dictionary mapping currently active 

1620 namespace prefixes to URIs, as of the point in the parsing process when 

1621 this tag was encountered. This can be used later to 

1622 construct CSS selectors. 

1623 

1624 """ 

1625 

def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Initialize a new `Tag`.

    When a ``builder`` is given, its settings decide the attribute
    container classes, whether the tag is XML, and the values of
    ``can_be_empty_element``, ``cdata_list_attributes``,
    ``preserve_whitespace_tags`` and ``interesting_string_types``; the
    corresponding constructor arguments are then ignored. Without a
    builder, those arguments are stored as passed.

    :raises ValueError: If ``name`` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # The source position is stored exactly as given (None when it
    # wasn't supplied), whatever the builder's settings are.
    self.sourceline = sourceline
    self.sourcepos = sourcepos

    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    else:
        if builder is not None and builder.cdata_list_attributes:
            self.attrs = builder._replace_cdata_list_attribute_values(
                self.name, attrs
            )
        else:
            self.attrs = attr_dict_class()
            # Make sure that the values of any multi-valued
            # attributes (e.g. when a Tag is copied) are stored in
            # new lists.
            for k, v in attrs.items():
                if isinstance(v, list):
                    v = v.__class__(v)
                self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES

1746 

# Type declarations for the attributes carried by each instance.
parser_class: Optional[type[BeautifulSoup]]
name: str
namespace: Optional[str]
prefix: Optional[str]
attrs: _AttributeValues
sourceline: Optional[int]
sourcepos: Optional[int]
known_xml: Optional[bool]
contents: List[PageElement]
hidden: bool
interesting_string_types: Optional[Set[Type[NavigableString]]]

# When a TreeBuilder is supplied to the constructor these are derived
# from it; otherwise they hold whatever values the constructor was given.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]

# Deprecated camelCase spelling of parser_class.
#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")

1765 

1766 def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self: 

1767 """A deepcopy of a Tag is a new Tag, unconnected to the parse tree. 

1768 Its contents are a copy of the old Tag's contents. 

1769 """ 

1770 clone = self.copy_self() 

1771 

1772 if recursive: 

1773 # Clone this tag's descendants recursively, but without 

1774 # making any recursive function calls. 

1775 tag_stack: List[Tag] = [clone] 

1776 for event, element in self._event_stream(self.descendants): 

1777 if event is Tag.END_ELEMENT_EVENT: 

1778 # Stop appending incoming Tags to the Tag that was 

1779 # just closed. 

1780 tag_stack.pop() 

1781 else: 

1782 descendant_clone = element.__deepcopy__(memo, recursive=False) 

1783 # Add to its parent's .contents 

1784 tag_stack[-1].append(descendant_clone) 

1785 

1786 if event is Tag.START_ELEMENT_EVENT: 

1787 # Add the Tag itself to the stack so that its 

1788 # children will be .appended to it. 

1789 tag_stack.append(cast(Tag, descendant_clone)) 

1790 return clone 

1791 

def copy_self(self) -> Self:
    """Return an empty duplicate of this Tag, attached to nothing.

    The duplicate is an instance of the same class, built from this
    Tag's name, namespace, prefix, attributes and parser settings.
    `__deepcopy__` starts with this step, but it is also useful on
    its own when the contents should not be copied.
    """
    positional = (
        None,
        None,
        self.name,
        self.namespace,
        self.prefix,
        self.attrs,
    )
    keywords = dict(
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    duplicate = type(self)(*positional, **keywords)

    # Set these directly so the duplicate matches this Tag no matter
    # what the constructor did with its arguments.
    duplicate.can_be_empty_element = self.can_be_empty_element
    duplicate.hidden = self.hidden
    return duplicate

1819 

@property
def is_empty_element(self) -> bool:
    """Whether this tag is an empty-element (self-closing) tag.

    A tag with any contents is never an empty-element tag.

    For a tag with no contents the answer is the
    ``can_be_empty_element`` flag, which depends on the
    `TreeBuilder` that created the tag. A builder with a designated
    list of empty-element tags (the usual case for HTML) sets the
    flag only for names on that list. A builder with no such list
    (the usual case for XML) treats every tag with no contents as an
    empty-element tag.
    """
    if len(self.contents) > 0:
        return False
    # Only a literal True counts; None or any other value does not.
    return self.can_be_empty_element is True

1838 

@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    ": :meta private:"
    # Deprecated spelling (see the decorator); it just reports the
    # value of the is_empty_element property.
    return self.is_empty_element

1843 

@property
def string(self) -> Optional[str]:
    """The single string inside this `Tag`, if there is exactly one.

    :return: When this `Tag` has exactly one child and that child is
       a `NavigableString`, that string. When the only child is
       another `Tag`, that child's own `Tag.string` (so the lookup
       continues downwards). In every other case -- no children, or
       more than one -- ``None``.

    An unexpected ``None`` usually means the `Tag` contains more
    than one thing.
    """
    if len(self.contents) != 1:
        return None

    only_child = self.contents[0]
    if isinstance(only_child, NavigableString):
        return only_child
    if isinstance(only_child, Tag):
        # Delegate to the nested tag.
        return only_child.string
    return None

@string.setter
def string(self, string: str) -> None:
    """Discard the current `Tag.contents` and put one string in their place."""
    self.clear()
    # Keep the class of a value that is already a NavigableString (or a
    # subclass); wrap a plain str in NavigableString.
    string_class = (
        string.__class__ if isinstance(string, NavigableString) else NavigableString
    )
    self.append(string_class(string))

1877 

# The string classes treated as a tag's main content. The constructor
# uses this as interesting_string_types when the builder names no
# special string container, and _all_strings falls back to it when
# interesting_string_types is None.
#: :meta private:
MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}

1880 

def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Generate the strings beneath this tag, filtered by class.

    :param strip: When true, each string is stripped first and
        strings that end up empty are skipped.

    :param types: Either one NavigableString subclass or a
        collection of them. Only strings whose class is exactly one
        of these are yielded; strings of any other class are
        ignored. When left at the default, the classes in
        self.interesting_string_types are used. If that is None,
        MAIN_CONTENT_STRING_TYPES applies, so comments, processing
        instructions and the like are left out.
    """
    if types is self.default:
        types = self.interesting_string_types
        if types is None:
            types = self.MAIN_CONTENT_STRING_TYPES

    for node in self.descendants:
        if not isinstance(node, NavigableString):
            continue

        # Matching is on the exact class, not on isinstance.
        node_class = type(node)
        if isinstance(types, type):
            wanted = node_class is types
        elif types is None:
            wanted = True
        else:
            wanted = node_class in types
        if not wanted:
            continue

        if not strip:
            yield node
            continue
        text = node.strip()
        if len(text) != 0:
            yield text

strings = property(_all_strings)

1923 

def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once.

    :param position: The numeric position that should be occupied
       in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    for new_child in new_children:
        added = self._insert(position, new_child)
        inserted.extend(added)
        # One argument can expand into several elements (_insert
        # replaces a BeautifulSoup object with its children). Step past
        # everything that was actually inserted so the next argument
        # goes after them rather than in between them.
        position += len(added)
    return inserted

1942 

def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element into this tag's contents and rewire
    the sibling and document-order links around it.

    :param position: The index the new child should occupy. Values
       past the end are clamped to the end of the contents.
    :param new_child: The element to insert. A plain string is
       wrapped in a NavigableString; a BeautifulSoup object is
       replaced by its children.
    :return: A list holding the inserted element, or, for a
       BeautifulSoup object, whatever `insert` returns for its
       children.
    :raise ValueError: If ``new_child`` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    # Imported here rather than at module level.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one BeautifulSoup
        # object contains another. Insert the BeautifulSoup's children and
        # return them.
        return self.insert(position, *list(new_child.contents))
    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        # The element already has a parent, so it must be detached
        # before it can be attached here.
        if new_child.parent is self:
            # We're 'inserting' an element that's already one
            # of this object's children.
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
            elif current_index == position:
                # We're 'inserting' an element into its current location.
                # This is a no-op.
                return [new_child]
        new_child.extract()

    # Link the new child to whatever comes before it.
    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: no previous sibling, and the element just before
        # it in document order is this tag.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(
        is_initialized=False, accept_self=True
    )
    # new_childs_last_element can't be None because we passed
    # accept_self=True into _last_descendant. Worst case,
    # new_childs_last_element will be new_child itself. Making
    # this cast removes several mypy complaints later on as we
    # manipulate new_childs_last_element.
    new_childs_last_element = cast(PageElement, new_childs_last_element)

    # Link the end of the new child's subtree to whatever comes after it.
    if position >= len(self.contents):
        # Appending at the end: there is no next sibling.
        new_child.next_sibling = None

        # Climb through this tag and its ancestors until one of them has
        # a next sibling; that sibling follows the new subtree in
        # document order.
        parent: Optional[Tag] = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        # Inserting before an existing child.
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = (
            new_childs_last_element
        )
    self.contents.insert(position, new_child)

    return [new_child]

2029 

def unwrap(self) -> Self:
    """Remove this element from the tree, leaving its contents where it was.

    :return: This object, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )

    slot = container.index(self)
    self.extract(_self_index=slot)

    # Work from a snapshot, because inserting a child elsewhere removes
    # it from self.contents. Inserting last-to-first at the same index
    # leaves the children in their original order.
    snapshot = self.contents[:]
    for child in reversed(snapshot):
        container.insert(slot, child)
    return self

replace_with_children = unwrap

2048 

@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    ": :meta private:"
    # Deprecated spelling (see the decorator); forwards to unwrap().
    return self.unwrap()

2053 

def append(self, tag: _InsertableElement) -> PageElement:
    """
    Appends the given `PageElement` to the contents of this `Tag`.

    :param tag: A PageElement.

    :return: The newly appended PageElement.
    """
    # insert() returns a list; the first entry is returned.
    return self.insert(len(self.contents), tag)[0]

2063 

def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this
    `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`.

    :return: The list of PageElements that were appended.
    :raise TypeError: If ``tags`` is neither a `Tag`, a single
        element or string, nor an iterable.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        # Copy the list: appending a child here removes it from
        # tags.contents as we go.
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # The caller should really be using append() instead,
        # but we can make it work.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Moving items around the tree may change their position in
        # the original list. Make a list that won't change.
        tag_list = list(tags)
    else:
        # Without this branch tag_list would be unbound and the loop
        # below would fail with an unhelpful UnboundLocalError.
        raise TypeError(
            "Tag.extend expects a Tag or an iterable of elements, not %s"
            % type(tags).__name__
        )

    return [self.append(tag) for tag in tag_list]

2100 

def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    Each child is removed with `PageElement.extract`.

    :param decompose: If true, the more destructive
       `PageElement.decompose` is called on each child instead of
       `PageElement.extract`.
    """
    # Iterate over a copy; the children are expected to leave
    # self.contents as they are removed.
    for child in list(self.contents):
        if not decompose:
            child.extract()
        else:
            child.decompose()

2114 

def smooth(self) -> None:
    """Merge runs of adjacent strings among this `Tag`'s children,
    and do the same inside every child `Tag`.

    After many tree-modifying operations, calling this can make
    pretty-printed output look more natural.
    """
    # First pass: record the index of the first member of each adjacent
    # pair that must be merged. Recording indexes avoids copying
    # self.contents, since usually only a few strings are affected.
    merge_at = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Child tags are smoothed recursively.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last child has no right-hand neighbour to merge with.
            continue
        following = self.contents[position + 1]
        mergeable = (
            isinstance(current, NavigableString)
            and isinstance(following, NavigableString)
            and not isinstance(current, PreformattedString)
            and not isinstance(following, PreformattedString)
        )
        if mergeable:
            merge_at.append(position)

    # Second pass: merge from the back so that removing an item does not
    # shift any index still waiting to be processed.
    for position in reversed(merge_at):
        first = cast(NavigableString, self.contents[position])
        second = cast(NavigableString, self.contents[position + 1])
        second.extract()
        first.replace_with(NavigableString(first + second))

2154 

def index(self, element: PageElement) -> int:
    """Return the position of a child of this `Tag`, matched by identity.

    Matching by identity rather than by value gives the right answer
    when a `Tag` holds two children that compare equal.

    :param element: Look for this `PageElement` in this object's contents.
    :raise ValueError: If ``element`` is not one of this tag's children.
    """
    positions = (i for i, child in enumerate(self.contents) if child is element)
    found = next(positions, None)
    if found is None:
        raise ValueError("Tag.index: element not in tag")
    return found

2167 

def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Returns the value of the 'key' attribute for the tag, or
    the value given for 'default' if it doesn't have that
    attribute.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    """
    # Delegates to the attrs mapping's own get().
    return self.attrs.get(key, default)

2180 

def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Like get(), except the result is always a list (possibly empty).

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: A list of strings, usually empty or containing only a single
        value.
    """
    found = self.get(key, default)
    if isinstance(found, list):
        # Already a list: hand it back unchanged.
        return found

    make_list = self.attribute_value_list_class
    if found is None:
        return make_list()
    # A single value is wrapped in a one-item list. The cast only
    # informs the type checker; it has no runtime effect.
    single = found if isinstance(found, str) else cast(str, found)
    return make_list([single])

2203 

def has_attr(self, key: str) -> bool:
    """Does this `Tag` have an attribute with the given name?

    :param key: The attribute name to look for in ``attrs``.
    """
    return key in self.attrs

2207 

2208 def __hash__(self) -> int: 

2209 return str(self).__hash__() 

2210 

def __getitem__(self, key: str) -> _AttributeValue:
    """tag[key] returns the value of the 'key' attribute for the Tag,
    and throws an exception if it's not there."""
    # The lookup goes straight to attrs, so a missing key raises
    # whatever that mapping raises.
    return self.attrs[key]

2215 

def __iter__(self) -> Iterator[PageElement]:
    "Iterating over a Tag iterates over its contents."
    # Only the direct children in self.contents are visited.
    return iter(self.contents)

2219 

def __len__(self) -> int:
    "The length of a Tag is the length of its list of contents."
    # Counts direct children only.
    return len(self.contents)

2223 

def __contains__(self, x: Any) -> bool:
    """``x in tag`` checks whether ``x`` is among this tag's contents."""
    # Membership is tested against self.contents only.
    return x in self.contents

2226 

def __bool__(self) -> bool:
    "A tag is non-None even if it has no contents."
    # Defined explicitly because __len__ exists: without this, a tag
    # with no contents would be falsy.
    return True

2230 

def __setitem__(self, key: str, value: _AttributeValue) -> None:
    """Setting tag[key] sets the value of the 'key' attribute for the
    tag."""
    # Any existing value stored under the same key is replaced.
    self.attrs[key] = value

2235 

def __delitem__(self, key: str) -> None:
    "Deleting tag[key] deletes all 'key' attributes for the tag."
    # pop() with a default means deleting a missing attribute is
    # silently ignored instead of raising.
    self.attrs.pop(key, None)

2239 

2240 def __call__( 

2241 self, 

2242 name: Optional[_StrainableElement] = None, 

2243 attrs: _StrainableAttributes = {}, 

2244 recursive: bool = True, 

2245 string: Optional[_StrainableString] = None, 

2246 limit: Optional[int] = None, 

2247 _stacklevel: int = 2, 

2248 **kwargs: _StrainableAttribute, 

2249 ) -> _QueryResults: 

2250 """Calling a Tag like a function is the same as calling its 

2251 find_all() method. Eg. tag('a') returns a list of all the A tags 

2252 found within this tag.""" 

2253 return self.find_all( 

2254 name, attrs, recursive, string, limit, _stacklevel, **kwargs 

2255 ) 

2256 

2257 def __getattr__(self, subtag: str) -> Optional[Tag]: 

2258 """Calling tag.subtag is the same as calling tag.find(name="subtag")""" 

2259 # print("Getattr %s.%s" % (self.__class__, tag)) 

2260 result: _AtMostOneElement 

2261 if len(subtag) > 3 and subtag.endswith("Tag"): 

2262 # BS3: soup.aTag -> "soup.find("a") 

2263 tag_name = subtag[:-3] 

2264 warnings.warn( 

2265 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' 

2266 % dict(name=tag_name), 

2267 DeprecationWarning, 

2268 stacklevel=2, 

2269 ) 

2270 result = self.find(tag_name) 

2271 # We special case contents to avoid recursion. 

2272 elif not subtag.startswith("__") and not subtag == "contents": 

2273 result = self.find(subtag) 

2274 else: 

2275 raise AttributeError( 

2276 "'%s' object has no attribute '%s'" % (self.__class__, subtag) 

2277 ) 

2278 return cast(Optional[Tag], result) 

2279 

2280 def __eq__(self, other: Any) -> bool: 

2281 """Returns true iff this Tag has the same name, the same attributes, 

2282 and the same contents (recursively) as `other`.""" 

2283 if self is other: 

2284 return True 

2285 if not isinstance(other, Tag): 

2286 return False 

2287 if ( 

2288 not hasattr(other, "name") 

2289 or not hasattr(other, "attrs") 

2290 or not hasattr(other, "contents") 

2291 or self.name != other.name 

2292 or self.attrs != other.attrs 

2293 or len(self) != len(other) 

2294 ): 

2295 return False 

2296 for i, my_child in enumerate(self.contents): 

2297 if my_child != other.contents[i]: 

2298 return False 

2299 return True 

2300 

def __ne__(self, other: Any) -> bool:
    """Returns true iff this Tag is not identical to `other`,
    as defined in __eq__."""
    # Defined as the negation of ==, so it follows __eq__ exactly.
    return not self == other

2305 

def __repr__(self) -> str:
    """Renders this `Tag` as a string."""
    # Uses decode() with all of its default arguments.
    return self.decode()

# str() and the legacy __unicode__ hook produce the same rendering.
__str__ = __unicode__ = __repr__

2311 

def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    The tag is first rendered as a Unicode string by `decode`, and
    that string is then encoded.

    :param encoding: The encoding to use when converting to
       a bytestring. It is also passed to `decode`, so it may affect
       the text of the document, specifically any encoding
       declarations within the document.
    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        :py:meth:`str.encode` and its value should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    return self.decode(indent_level, encoding, formatter).encode(encoding, errors)

2341 

def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing. ``None`` turns pretty-printing off.
    :param eventual_encoding: The encoding you intend to use when
       converting the string to a bytestring. decode() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
        naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :return: The rendered markup as one string.
    """
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # A literal True is accepted as "pretty-print starting at level 0".
    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        # Render this event as one piece of markup.
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            # A closing tag is indented one level less than the
            # content it encloses.
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # We are about to enter string literal mode. Add
            # whitespace before this tag, but not after. We
            # will stay in string literal mode until this tag
            # is closed.
            indent_before = True
            indent_after = False
            string_literal_tag = element
        elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # We are about to exit string literal mode by closing
            # the tag that sent us into that mode. Add whitespace
            # after this tag, but not before.
            indent_before = False
            indent_after = True
            string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if indent_before or indent_after:
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                # A piece that is empty (e.g. a whitespace-only string
                # after stripping) gets no indentation or newline.
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            # Everything inside a newly opened tag is one level deeper.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)

2450 

class _TreeTraversalEvent(object):
    """An internal class representing an event in the process
    of traversing a parse tree.

    Instances carry no data; each one serves only as a distinct
    marker object.

    :meta private:
    """

# Stand-ins for the different events yielded by _event_stream.
# START and END bracket a tag that is not an empty-element tag, EMPTY
# is yielded once for an empty-element tag, and STRING is yielded for
# anything that is not a Tag.
START_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent() #: :meta private:

2463 

def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
       the tree. Defaults to this element followed by its
       descendants.
    :return: ``(event, element)`` pairs, where ``event`` is one of
       the ``*_ELEMENT_EVENT`` markers.
    """
    # Tags that have been opened but not yet closed, innermost last.
    tag_stack: List[Tag] = []

    iterator = iterator or self.self_and_descendants

    for c in iterator:
        # If the parent of the element we're about to yield is not
        # the tag currently on the stack, it means that the tag on
        # the stack closed before this element appeared.
        #
        # This is an identity check on purpose. Using != would go
        # through Tag.__ne__/__eq__ and could recursively compare
        # the contents of two subtrees for every element visited.
        while tag_stack and c.parent is not tag_stack[-1]:
            now_closed_tag = tag_stack.pop()
            yield Tag.END_ELEMENT_EVENT, now_closed_tag

        if isinstance(c, Tag):
            if c.is_empty_element:
                # An empty-element tag has no separate END event.
                yield Tag.EMPTY_ELEMENT_EVENT, c
            else:
                yield Tag.START_ELEMENT_EVENT, c
                tag_stack.append(c)
        else:
            yield Tag.STRING_ELEMENT_EVENT, c

    # Close whatever is still open once the iterator is exhausted.
    while tag_stack:
        now_closed_tag = tag_stack.pop()
        yield Tag.END_ELEMENT_EVENT, now_closed_tag

2507 

2508 def _indent_string( 

2509 self, 

2510 s: str, 

2511 indent_level: int, 

2512 formatter: Formatter, 

2513 indent_before: bool, 

2514 indent_after: bool, 

2515 ) -> str: 

2516 """Add indentation whitespace before and/or after a string. 

2517 

2518 :param s: The string to amend with whitespace. 

2519 :param indent_level: The indentation level; affects how much 

2520 whitespace goes before the string. 

2521 :param indent_before: Whether or not to add whitespace 

2522 before the string. 

2523 :param indent_after: Whether or not to add whitespace 

2524 (a newline) after the string. 

2525 """ 

2526 space_before = "" 

2527 if indent_before and indent_level: 

2528 space_before = formatter.indent * indent_level 

2529 

2530 space_after = "" 

2531 if indent_after: 

2532 space_after = "\n" 

2533 

2534 return space_before + s + space_after 

2535 

def _format_tag(
    self, eventual_encoding: str, formatter: Formatter, opening: bool
) -> str:
    """Build the markup for this tag's opening or closing tag.

    :param eventual_encoding: The encoding the output is destined for.
        It is substituted into any attribute value that is an
        `AttributeValueWithCharsetSubstitution`.
    :param formatter: Supplies the attribute list, the processing and
        quoting of attribute values, and the void-element close prefix.
    :param opening: True to build the opening tag (attributes
        included), False to build the closing tag.
    :return: The tag markup, or the empty string for a hidden tag.
    """
    # A hidden tag produces no markup of its own; only its contents
    # are visible.
    if self.hidden:
        return ""

    # Right after the "<" comes a slash, but only in a closing tag.
    closing_slash = "" if opening else "/"

    # Next is the namespace prefix, when the tag has one.
    prefix = (self.prefix + ":") if self.prefix else ""

    # Attributes are only written into an opening tag.
    attribute_string = ""
    if opening:
        rendered = []
        for key, val in formatter.attributes(self):
            if val is None:
                # A valueless attribute is written as its bare name.
                rendered.append(key)
                continue
            if isinstance(val, (list, tuple)):
                # Multi-valued attributes are space-separated.
                val = " ".join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.substitute_encoding(eventual_encoding)

            text = formatter.attribute_value(val)
            rendered.append(
                str(key) + "=" + formatter.quoted_attribute_value(text)
            )
        if rendered:
            attribute_string = " " + " ".join(rendered)

    # A void element may get a closing slash before the ">" (as in
    # an XML document); the formatter decides.
    void_element_closing_slash = (
        (formatter.void_element_close_prefix or "")
        if self.is_empty_element
        else ""
    )

    # Assemble the pieces in order.
    return (
        "<"
        + closing_slash
        + prefix
        + self.name
        + attribute_string
        + void_element_closing_slash
        + ">"
    )

2597 

def _should_pretty_print(self, indent_level: int = 1) -> bool:
    """Decide whether this tag should be pretty-printed.

    Pretty-printing is off when ``indent_level`` is None, and for any
    tag whose name is listed in ``preserve_whitespace_tags`` (such as
    <pre> in HTML documents). Every other tag is pretty-printed.
    """
    if indent_level is None:
        return False
    if not self.preserve_whitespace_tags:
        # No tag is whitespace-sensitive.
        return True
    return self.name not in self.preserve_whitespace_tags

2608 

def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Return a pretty-printed rendering of this `Tag`.

    :param encoding: The encoding for a bytestring result; leave as
        None to get a Unicode string instead.
    :param formatter: A Formatter object, or the name of one of the
        standard formatters.
    :return: A bytestring when ``encoding`` is given, otherwise a
        string.
    """
    # Indent level 0 is what switches pretty-printing on in
    # decode()/encode().
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)

2626 

def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render what is inside this tag as a Unicode string.

    :param indent_level: How many levels to indent each line of the
        rendering. (What a 'level' looks like in the output is up to
        the formatter.) Used internally in recursive calls while
        pretty-printing.

    :param eventual_encoding: The encoding the result will eventually
        be encoded into. decode_contents() does *not* do that encoding
        itself; the value is only needed so a real encoding can be
        substituted in if the document contains an encoding
        declaration (e.g. in a <meta> tag).

    :param formatter: A `Formatter` object, or a string naming one of
        the standard Formatters.
    """
    # Rendering is delegated to decode(), driven by this tag's
    # descendants.
    inner_elements = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=inner_elements
    )

2654 

def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render what is inside this PageElement as a bytestring.

    :param indent_level: How many levels to indent each line of the
        rendering. (What a 'level' looks like in the output is up to
        the ``formatter``.) Used internally in recursive calls while
        pretty-printing.
    :param encoding: The encoding of the returned bytestring.
    :param formatter: Either a `Formatter` object, or a string naming
        one of the standard formatters.
    """
    # Render to text first (telling the renderer the target encoding),
    # then encode the text.
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)

2674 

@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated BS3-compatible spelling of `encode_contents`.

    ``indentLevel`` is only honoured when ``prettyPrint`` is true;
    otherwise no indent level is passed on.

    :meta private:
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_indent, encoding=encoding)

2689 

2690 # Soup methods 

2691 

def find(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Return the first PageElement among this PageElement's children
    that matches the given criteria, or None if nothing matches.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    """
    # Run find_all() with a limit of 1. The _stacklevel value
    # accounts for this extra frame so warnings point at the caller.
    matches = self.find_all(
        name, attrs, recursive, string, 1, _stacklevel=3, **kwargs
    )
    return matches[0] if matches else None

2720 

# Backwards-compatibility alias: exposes `find` under the old name
# ``findChild``. The "3.0.0" argument is presumably the version in which
# the old name was deprecated -- confirm against bs4._deprecation.
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")

2722 

def find_all(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Collect every `PageElement` among this `PageElement`'s children
    that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter passed through to the search unchanged.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # Choose the candidates: everything below this element, or only
    # the direct children.
    if recursive:
        candidates = self.descendants
    else:
        candidates = self.children
    # One more stack level, for this frame.
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )

2754 

# Backwards-compatibility aliases: both old names expose `find_all`.
# The last argument is presumably the version in which each old name
# was deprecated -- confirm against bs4._deprecation.
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")

2757 

2758 # Generator methods 

@property
def children(self) -> Iterator[PageElement]:
    """Iterate over the direct children of this `PageElement`, i.e.
    the items of ``self.contents``, in order.
    """
    direct_children = self.contents
    return (child for child in direct_children)

2763 

@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` together with everything that
    `descendants` yields, combined by ``_self_and``.
    """
    everything_below = self.descendants
    return self._self_and(everything_below)

2770 

@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`.

    Starting at the first child, the walk follows the ``next_element``
    chain and stops on reaching the element that comes after this
    tag's last descendant (or the end of the chain). A tag with no
    contents yields nothing.

    NOTE(review): this docstring used to say "breadth-first". The
    order is whatever order ``next_element`` links elements in --
    presumably document (depth-first) order; confirm.
    """
    if not len(self.contents):
        return
    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, last_descendant will end up
    # as self.
    last_descendant = cast(PageElement, self._last_descendant(accept_self=True))
    # The element just past the end of this subtree; reaching it means
    # the walk has left this tag.
    stopNode = last_descendant.next_element
    current: _AtMostOneElement = self.contents[0]
    while current is not stopNode and current is not None:
        # Read the successor before yielding, so the walk continues
        # from the link as it stood before the consumer got `current`.
        successor = current.next_element
        yield current
        current = successor

2788 

2789 # CSS selector code 

def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Run a CSS selector against the current element through the
    `css` interface's ``select_one()``.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
        soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)

2806 

def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Run a CSS selector against the current element through the
    `css` interface's ``select()``.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)

2831 

@property
def css(self) -> CSS:
    """The CSS selector API for this element: a new `CSS` object
    wrapping this element, built on every access.
    """
    return CSS(self)

2836 

2837 # Old names for backwards compatibility 

@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of the `children` property; returns
    exactly what `children` returns.

    :meta private:
    """
    return self.children

2845 

@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated spelling of the `descendants` property; returns
    exactly what `descendants` returns.

    :meta private:
    """
    return self.descendants

2853 

@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated spelling of `has_attr`.

    The old name was misleading: has_key() looked at attributes,
    while the ``in`` operator (``__contains__``) looks at contents.
    Python 3 dropped has_key() from dictionaries anyway.

    :meta private:
    """
    return self.has_attr(key)

2864 

2865 

# Type variable for the kind of element a `ResultSet` holds. It is
# bound to `PageElement`, so only PageElement and its subclasses fit.
_PageElementT = TypeVar("_PageElementT", bound=PageElement)

2867 

2868 

class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects collected by matching an
    :py:class:`ElementFilter` against a parse tree -- in other words,
    a list of search results that also remembers its filter.
    """

    # The filter that produced these results, if there was one.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Fill the list from ``result`` and record ``source``."""
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Always raise AttributeError, with a hint about the usual
        mistake behind a missing attribute on a ResultSet.

        Python only calls this after normal attribute lookup has
        failed, so real list attributes are unaffected.
        """
        message = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(message)

2888 

2889 

2890# Now that all the classes used by SoupStrainer have been defined, 

2891# import SoupStrainer itself into this module to preserve the 

2892# backwards compatibility of anyone who imports 

2893# bs4.element.SoupStrainer. 

2894from bs4.filter import SoupStrainer # noqa: E402