Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/element.py: 39%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

975 statements  

1from __future__ import annotations 

2 

3# Use of this source code is governed by the MIT license. 

4__license__ = "MIT" 

5 

6import re 

7import warnings 

8 

9from bs4.css import CSS 

10from bs4._deprecation import ( 

11 _deprecated, 

12 _deprecated_alias, 

13 _deprecated_function_alias, 

14) 

15from bs4.formatter import ( 

16 Formatter, 

17 HTMLFormatter, 

18 XMLFormatter, 

19) 

20from bs4._warnings import AttributeResemblesVariableWarning 

21 

22from typing import ( 

23 Any, 

24 Callable, 

25 Dict, 

26 Generic, 

27 Iterable, 

28 Iterator, 

29 List, 

30 Mapping, 

31 Optional, 

32 Pattern, 

33 Set, 

34 TYPE_CHECKING, 

35 Tuple, 

36 Type, 

37 TypeVar, 

38 Union, 

39 cast, 

40 overload, 

41) 

42from typing_extensions import ( 

43 Self, 

44 TypeAlias, 

45) 

46 

47if TYPE_CHECKING: 

48 from bs4 import BeautifulSoup 

49 from bs4.builder import TreeBuilder 

50 from bs4.filter import ElementFilter 

51 from bs4.formatter import ( 

52 _EntitySubstitutionFunction, 

53 _FormatterOrName, 

54 ) 

55 from bs4._typing import ( 

56 _AtMostOneElement, 

57 _AttributeValue, 

58 _AttributeValues, 

59 _Encoding, 

60 _InsertableElement, 

61 _OneElement, 

62 _QueryResults, 

63 _RawOrProcessedAttributeValues, 

64 _StrainableElement, 

65 _StrainableAttribute, 

66 _StrainableAttributes, 

67 _StrainableString, 

68 ) 

69 

#: Either a single `NavigableString` subclass or an iterable of such
#: subclasses.
_OneOrMoreStringTypes: TypeAlias = Union[
    Type["NavigableString"],
    Iterable[Type["NavigableString"]],
]

#: Type of the ``name`` argument accepted by the ``find_*`` methods: a
#: strainable element, an `ElementFilter`, or None.
_FindMethodName: TypeAlias = Union["_StrainableElement", "ElementFilter", None]

75 

# Deprecated module-level attributes, served lazily through the
# module-level __getattr__ hook below.
# See https://peps.python.org/pep-0562/
_deprecated_names = {
    "whitespace_re": "The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy.",
}
#: :meta private:
_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+")


def __getattr__(name: str) -> Any:
    """Resolve a deprecated module attribute, warning the caller.

    :param name: The attribute that was not found on the module by
        normal lookup.
    :return: The module global called ``_deprecated_<name>``.
    :raise AttributeError: If ``name`` is not listed in
        ``_deprecated_names``.
    """
    template = _deprecated_names.get(name)
    if template is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    # stacklevel=2 attributes the warning to the code that accessed the
    # attribute rather than to this hook.
    warnings.warn(template.format(name=name), DeprecationWarning, stacklevel=2)
    return globals()["_deprecated_" + name]

92 

93 

#: Documents output by Beautiful Soup will be encoded with
#: this encoding unless you specify otherwise.
DEFAULT_OUTPUT_ENCODING: str = "utf-8"

#: A regular expression matching a run of one or more non-whitespace
#: characters. It can be used to pick the whitespace-separated tokens
#: out of a string.
nonwhitespace_re: Pattern[str] = re.compile(r"\S+")

#: These encodings are recognized by Python (so `Tag.encode`
#: could theoretically support them) but XML and HTML don't recognize
#: them (so they should not show up in an XML or HTML document as that
#: document's encoding).
#:
#: If an XML document is encoded in one of these encodings, no encoding
#: will be mentioned in the XML declaration. If an HTML document is
#: encoded in one of these encodings, and the HTML document has a
#: <meta> tag that mentions an encoding, the encoding will be given as
#: the empty string.
#:
#: Source:
#: Python documentation, `Python Specific Encodings <https://docs.python.org/3/library/codecs.html#python-specific-encodings>`_
PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}

130 

131 

class NamespacedAttribute(str):
    """A namespaced attribute (e.g. the 'xml:lang' in 'xml:lang="en"')
    which remembers the namespace prefix ('xml') and the name ('lang')
    that were used to create it.

    The string value is ``prefix:name`` when both parts are present,
    and just the prefix or just the name when only one of them is.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        """Build the attribute name from its parts.

        :param prefix: The namespace prefix, if any.
        :param name: The local name. An empty name is stored as None.
        :param namespace: The namespace the prefix stands for; stored
            on the object but not part of the string value.
        """
        if not name:
            # This is the default namespace. Its name "has no value"
            # per https://www.w3.org/TR/xml-names/#defaulting
            name = None

        if name is None:
            # str.__new__(cls, None) would produce the literal text
            # "None", so a missing prefix is rendered as the empty
            # string instead.
            text = "" if prefix is None else prefix
        elif not prefix:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name

        obj = str.__new__(cls, text)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj

164 

165 

class AttributeValueWithCharsetSubstitution(str):
    """An abstract class standing in for a character encoding specified
    inside an HTML ``<meta>`` tag.

    Subclasses exist for each place such a character encoding might be
    found: either inside the ``charset`` attribute
    (`CharsetMetaAttributeValue`) or inside the ``content`` attribute
    (`ContentMetaAttributeValue`).

    This allows Beautiful Soup to replace that part of the HTML file
    with a different encoding when outputting a tree as a string.
    """

    # The original, un-encoded value of the attribute.
    #: :meta private:
    original_value: str

    def substitute_encoding(self, eventual_encoding: str) -> str:
        """Return this attribute value rewritten to mention the given
        encoding. Each subclass implements this for its own portion of
        an HTML document.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

188 

189 

class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a ``<meta>`` tag's ``charset``
    attribute.

    When Beautiful Soup parses the markup ``<meta charset="utf8">``, the
    value of the ``charset`` attribute will become one of these objects.

    If the document is later encoded to an encoding other than UTF-8, its
    ``<meta>`` tag will mention the new encoding instead of ``utf8``.
    """

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from.

        :param original_value: The attribute value found in the markup.
        """
        instance = str.__new__(cls, original_value)
        # Nothing here reads the original value back; it is kept in
        # case it is useful to the user.
        instance.original_value = original_value
        return instance

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """When an HTML document is being encoded to a given encoding, the
        value of a ``<meta>`` tag's ``charset`` becomes the name of
        the encoding.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :return: ``eventual_encoding``, or the empty string if it is one
            of `PYTHON_SPECIFIC_ENCODINGS`.
        """
        return "" if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS else eventual_encoding

216 

217 

class AttributeValueList(List[str]):
    """The list type used to hold the values of an attribute that has
    multiple values (such as HTML's 'class').

    It behaves exactly like a regular list. It exists so that you can
    subclass it and pass your subclass to the TreeBuilder constructor
    as attribute_value_list_class, to have your subclass instantiated
    instead.
    """

225 

226 

class AttributeDict(Dict[Any, Any]):
    """Superclass for the dictionary used to hold a tag's
    attributes. You can use this, but it's just a regular dict with no
    special logic.
    """


class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the XML spec.

    Only item assignment (``d[key] = value``) is processed.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply with
        the XML spec.

        This just means converting common non-string values to
        strings: XML attributes may have "any literal string as a
        value."

        :param key: The attribute name.
        :param value: The new value. None is stored as the empty string
            and non-boolean ints and floats are stored as their string
            form; everything else is stored unchanged.
        """
        if value is None:
            value = ""
        elif isinstance(value, (int, float)) and not isinstance(value, bool):
            # It's dangerous to convert _every_ attribute value into a
            # plain string, since an attribute value may be a more
            # sophisticated string-like object
            # (e.g. CharsetMetaAttributeValue). Numbers are the common
            # case and are safe to convert.
            #
            # Booleans are deliberately excluded: XML does not define
            # any rules for boolean attributes, so the bool is stored
            # as-is rather than guessing what the value should be.
            value = str(value)

        super().__setitem__(key, value)


class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.

    Only item assignment (``d[key] = value``) is processed.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply
        with the HTML spec.

        :param key: The attribute name.
        :param value: The new value. ``False`` and ``None`` remove the
            attribute; ``True`` stores the attribute's own name;
            other ints and floats are stored as their string form;
            everything else is stored unchanged.
        """
        # Identity checks, not ``value in (False, None)``: that test
        # compares by equality, and 0 == False, so assigning the number
        # 0 (or 0.0) would delete the attribute instead of storing "0".
        if value is None or value is False:
            # 'The values "true" and "false" are not allowed on
            # boolean attributes. To represent a false value, the
            # attribute has to be omitted altogether.'
            self.pop(key, None)
            return

        if isinstance(value, bool):
            # 'If the [boolean] attribute is present, its value must
            # either be the empty string or a value that is an ASCII
            # case-insensitive match for the attribute's canonical
            # name, with no leading or trailing whitespace.'
            #
            # [fixme] It's not clear to me whether "canonical name"
            # means fully-qualified name, unqualified name, or
            # (probably not) name with namespace prefix. For now I'm
            # going with unqualified name.
            value = key.name if isinstance(key, NamespacedAttribute) else key
        elif isinstance(value, (int, float)):
            # Only numbers are converted; other objects may be
            # sophisticated string-like values that must survive.
            value = str(value)

        super().__setitem__(key, value)

307 

308 

class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a ``<meta>`` tag's ``content``
    attribute.

    When Beautiful Soup parses the markup:
     ``<meta http-equiv="content-type" content="text/html; charset=utf8">``

    The value of the ``content`` attribute will become one of these objects.

    If the document is later encoded to an encoding other than UTF-8, its
    ``<meta>`` tag will mention the new encoding instead of ``utf8``.
    """

    #: Match the 'charset' argument inside the 'content' attribute
    #: of a <meta> tag.
    #: :meta private:
    CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value: str) -> Self:
        """Create the value, remembering the text it was built from.

        :param original_value: The ``content`` attribute value found in
            the markup.
        """
        # The regular expression is only needed when the encoding is
        # substituted, so nothing is searched for here.
        obj = str.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str:
        """When an HTML document is being encoded to a given encoding, the
        value of the ``charset=`` in a ``<meta>`` tag's ``content`` becomes
        the name of the encoding.

        :param eventual_encoding: The encoding the document is being
            converted to.
        :return: The original value with every ``charset=`` argument
            rewritten to ``eventual_encoding``, or with those arguments
            removed if the encoding is one of
            `PYTHON_SPECIFIC_ENCODINGS`.
        """
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            return self.CHARSET_RE.sub("", self.original_value)

        def rewrite(match: re.Match[str]) -> str:
            # Group 1 is everything up to and including "charset=".
            # A function is used, not a template string, so the
            # encoding name is inserted literally.
            return match.group(1) + eventual_encoding

        return self.CHARSET_RE.sub(rewrite, self.original_value)

345 

346 

347class PageElement(object): 

348 """An abstract class representing a single element in the parse tree. 

349 

350 `NavigableString`, `Tag`, etc. are all subclasses of 

351 `PageElement`. For this reason you'll see a lot of methods that 

352 return `PageElement`, but you'll never see an actual `PageElement` 

353 object. For the most part you can think of `PageElement` as 

354 meaning "a `Tag` or a `NavigableString`." 

355 """ 

356 

357 #: In general, we can't tell just by looking at an element whether 

358 #: it's contained in an XML document or an HTML document. But for 

359 #: `Tag` objects (q.v.) we can store this information at parse time. 

360 #: :meta private: 

361 known_xml: Optional[bool] = None 

362 

363 #: Whether or not this element has been decomposed from the tree 

364 #: it was created in. 

365 _decomposed: bool 

366 

367 parent: Optional[Tag] 

368 next_element: _AtMostOneElement 

369 previous_element: _AtMostOneElement 

370 next_sibling: _AtMostOneElement 

371 previous_sibling: _AtMostOneElement 

372 

373 #: Whether or not this element is hidden from generated output. 

374 #: Only the `BeautifulSoup` object itself is hidden. 

375 hidden: bool = False 

376 

377 def setup( 

378 self, 

379 parent: Optional[Tag] = None, 

380 previous_element: _AtMostOneElement = None, 

381 next_element: _AtMostOneElement = None, 

382 previous_sibling: _AtMostOneElement = None, 

383 next_sibling: _AtMostOneElement = None, 

384 ) -> None: 

385 """Sets up the initial relations between this element and 

386 other elements. 

387 

388 :param parent: The parent of this element. 

389 

390 :param previous_element: The element parsed immediately before 

391 this one. 

392 

393 :param next_element: The element parsed immediately after 

394 this one. 

395 

396 :param previous_sibling: The most recently encountered element 

397 on the same level of the parse tree as this one. 

398 

399 :param previous_sibling: The next element to be encountered 

400 on the same level of the parse tree as this one. 

401 """ 

402 self.parent = parent 

403 

404 self.previous_element = previous_element 

405 if self.previous_element is not None: 

406 self.previous_element.next_element = self 

407 

408 self.next_element = next_element 

409 if self.next_element is not None: 

410 self.next_element.previous_element = self 

411 

412 self.next_sibling = next_sibling 

413 if self.next_sibling is not None: 

414 self.next_sibling.previous_sibling = self 

415 

416 if ( 

417 previous_sibling is None 

418 and self.parent is not None 

419 and self.parent.contents 

420 ): 

421 previous_sibling = self.parent.contents[-1] 

422 

423 self.previous_sibling = previous_sibling 

424 if self.previous_sibling is not None: 

425 self.previous_sibling.next_sibling = self 

426 

def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str:
    """Format the given string using the given formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters. If None, ``s`` is returned unchanged.
    :return: The result of the formatter's ``substitute`` method.
    """
    if formatter is None:
        return s
    resolved = (
        formatter
        if isinstance(formatter, Formatter)
        else self.formatter_for_name(formatter)
    )
    return resolved.substitute(s)

439 

def formatter_for_name(
    self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction]
) -> Formatter:
    """Look up or create a Formatter for the given identifier,
    if necessary.

    :param formatter_name: Can be a `Formatter` object (used as-is), a
        function (used as the entity substitution hook for an
        `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter`), or a string (used to look
        up an `bs4.formatter.XMLFormatter` or
        `bs4.formatter.HTMLFormatter` in the appropriate registry).
    """
    if isinstance(formatter_name, Formatter):
        return formatter_name

    # XML trees get the XML formatter family, everything else HTML.
    formatter_class: type[Formatter] = XMLFormatter if self._is_xml else HTMLFormatter

    if callable(formatter_name):
        return formatter_class(entity_substitution=formatter_name)

    registry: Mapping[Optional[str], Formatter] = formatter_class.REGISTRY
    return registry[formatter_name]

467 

@property
def _is_xml(self) -> bool:
    """Is this element part of an XML tree or an HTML tree?

    This is used in formatter_for_name, when deciding whether an
    XMLFormatter or HTMLFormatter is more appropriate. It can be
    inefficient, but it should be called very rarely.
    """
    known = self.known_xml
    if known is not None:
        # Most of the time we will have determined this when the
        # document is parsed.
        return known

    # Otherwise, it's likely that this element was created by
    # direct invocation of the constructor from within the user's
    # Python code.
    if self.parent is not None:
        # Defer to the parent.
        return self.parent._is_xml

    # This is the top-level object. It should have .known_xml set
    # from tree creation. If not, take a guess--BS is usually
    # used on HTML markup.
    return getattr(self, "is_xml", False)

490 

491 nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0") 

492 previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0") 

493 

def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
    """Deep-copy hook; this implementation always raises.

    :param memo: The memo dictionary for the copy operation.
    :param recursive: Unused here.
    :raise NotImplementedError: Always.
    """
    raise NotImplementedError()

496 

def __copy__(self) -> Self:
    """A copy of a PageElement can only be a deep copy, because
    only one PageElement can occupy a given place in a parse tree.

    :return: The result of ``__deepcopy__`` called with a fresh,
        empty memo dictionary.
    """
    fresh_memo: Dict[Any, Any] = {}
    return self.__deepcopy__(fresh_memo)

502 

#: Default value of the ``types`` argument of the string-gathering
#: methods: an empty tuple.
default: Iterable[type[NavigableString]] = ()  #: :meta private:

def _all_strings(
    self, strip: bool = False, types: Iterable[type[NavigableString]] = default
) -> Iterator[str]:
    """Yield all strings of certain classes, possibly stripping them.

    This is implemented differently in `Tag` and `NavigableString`.

    :param strip: Whether the strings should be stripped.
    :param types: The string classes to consider.
    :raise NotImplementedError: Always, in this implementation.
    """
    raise NotImplementedError()

513 

@property
def stripped_strings(self) -> Iterator[str]:
    """Yield all interesting strings in this PageElement, stripping them
    first.

    See `Tag` for information on which strings are considered
    interesting in a given context.
    """
    # The argument asks _all_strings to strip each string.
    yield from self._all_strings(True)

524 

def get_text(
    self,
    separator: str = "",
    strip: bool = False,
    types: Iterable[Type[NavigableString]] = default,
) -> str:
    """Get all child strings of this PageElement, concatenated using the
    given separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)

549 

550 getText = get_text 

551 text = property(get_text) 

552 

def replace_with(self, *args: _InsertableElement) -> Self:
    """Replace this `PageElement` with one or more other elements,
    objects, keeping the rest of the tree the same.

    :param args: The replacements, inserted in order at the position
        this element occupied.
    :return: This `PageElement`, no longer part of the tree.
    :raise ValueError: If this element has no parent, or if one of the
        replacements is this element's parent.
    """
    container = self.parent
    if container is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree."
        )
    if len(args) == 1 and args[0] is self:
        # Replacing an element with itself is a no-op.
        return self
    for candidate in args:
        if candidate is container:
            raise ValueError("Cannot replace a Tag with its parent.")

    # Keep a handle on the parent and the position before detaching
    # this element.
    position = container.index(self)
    self.extract(_self_index=position)
    for offset, replacement in enumerate(args):
        container.insert(position + offset, replacement)
    return self

575 

576 replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0") 

577 

def wrap(self, wrap_inside: Tag) -> Tag:
    """Wrap this `PageElement` inside a `Tag`.

    :param wrap_inside: The `Tag` that will contain this element.
    :return: ``wrap_inside``, occupying the position in the tree that used
        to be occupied by this object, and with this object now inside it.
    """
    # replace_with puts the wrapper where this element was and hands
    # this element back, which is then appended to the wrapper.
    detached = self.replace_with(wrap_inside)
    wrap_inside.append(detached)
    return wrap_inside

587 

def extract(self, _self_index: Optional[int] = None) -> Self:
    """Destructively rips this element out of the tree.

    :param _self_index: The location of this element in its parent's
       .contents, if known. Passing this in allows for a performance
       optimization.

    :return: this `PageElement`, no longer part of the tree.
    """
    container = self.parent
    if container is not None:
        position = container.index(self) if _self_index is None else _self_index
        del container.contents[position]

    # Find the two elements that would be next to each other if
    # this element (and any children) hadn't been parsed. Connect
    # the two.
    #
    # _last_descendant is called with its default accept_self=True,
    # so in the worst case it returns self rather than None. The
    # cast removes several mypy complaints about that.
    tail = cast("PageElement", self._last_descendant())
    following = tail.next_element
    preceding = self.previous_element

    if preceding is not following:
        if preceding is not None:
            preceding.next_element = following
        if following is not None:
            following.previous_element = preceding
    self.previous_element = None
    tail.next_element = None

    self.parent = None

    # Do the same for the sibling chain.
    before = self.previous_sibling
    after = self.next_sibling
    if before is not after:
        if before is not None:
            before.next_sibling = after
        if after is not None:
            after.previous_sibling = before
    self.previous_sibling = self.next_sibling = None
    return self

635 

def decompose(self) -> None:
    """Recursively destroys this `PageElement` and its children.

    The element will be removed from the tree and wiped out; so
    will everything beneath it.

    The behavior of a decomposed `PageElement` is undefined and you
    should never use one for anything, but if you need to *check*
    whether an element has been decomposed, you can use the
    `PageElement.decomposed` property.
    """
    self.extract()
    node: _AtMostOneElement = self
    while node is not None:
        # Read the successor first: clearing the instance dictionary
        # wipes the node's next_element.
        successor = node.next_element
        node.__dict__.clear()
        if isinstance(node, Tag):
            node.contents = []
        node._decomposed = True
        node = successor

657 

def _last_descendant(
    self, is_initialized: bool = True, accept_self: bool = True
) -> _AtMostOneElement:
    """Finds the last element beneath this object to be parsed.

    Special note to help you figure things out if your type
    checking is tripped up by the fact that this method returns
    _AtMostOneElement instead of PageElement: the only time
    this method returns None is if `accept_self` is False and the
    `PageElement` has no children--either it's a NavigableString
    or an empty Tag.

    :param is_initialized: Has `PageElement.setup` been called on
        this `PageElement` yet?

    :param accept_self: Is ``self`` an acceptable answer to the
        question?
    """
    candidate: _AtMostOneElement
    if is_initialized and self.next_sibling is not None:
        # Take whatever was parsed immediately before the next
        # sibling.
        candidate = self.next_sibling.previous_element
    else:
        # Otherwise keep descending into the last child for as long
        # as there is one.
        candidate = self
        while isinstance(candidate, Tag) and candidate.contents:
            candidate = candidate.contents[-1]

    if candidate is self and not accept_self:
        return None
    return candidate

685 

686 _lastRecursiveChild = _deprecated_alias( 

687 "_lastRecursiveChild", "_last_descendant", "4.0.0" 

688 ) 

689 

def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate predecessor of this one.

    All the elements will have the same `PageElement.parent` as
    this one, and the given elements will occur immediately before
    this one.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.
    :raise ValueError: If this element has no parent, or if it is one
        of the elements to insert.
    """
    container = self.parent
    if container is None:
        raise ValueError("Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")

    inserted: List[PageElement] = []
    for predecessor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        # This element's position is looked up again for every
        # insertion.
        position = container.index(self)
        inserted.extend(container.insert(position, predecessor))

    return inserted

716 

def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """Makes the given element(s) the immediate successor of this one.

    The elements will have the same `PageElement.parent` as this
    one, and the given elements will occur immediately after this
    one.

    :param args: One or more PageElements.

    :return: The list of PageElements that were inserted.
    :raise ValueError: If this element has no parent, or if it is one
        of the elements to insert.
    """
    # Do all error checking before modifying the tree.
    container = self.parent
    if container is None:
        raise ValueError("Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    inserted: List[PageElement] = []
    # `already_placed` counts the arguments handled so far, so each
    # one is inserted one slot further along than the one before it.
    for already_placed, successor in enumerate(args):
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        position = container.index(self)
        inserted.extend(container.insert(position + 1 + already_placed, successor))

    return inserted

747 

def find_next(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Find the first PageElement that matches the given criteria and
    appears later in the document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: Additional filters on attribute values.
    """
    # Delegate to _find_one, naming find_all_next as the search method.
    search = self.find_all_next
    return self._find_one(search, name, attrs, string, **kwargs)

767 

768 findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0") 

769 

def find_all_next(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Find all `PageElement` objects that match the given criteria and
    appear later in the document than this `PageElement`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.next_elements
    # Add one to the stack level for this method's own frame.
    depth = _stacklevel + 1
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=depth, **kwargs
    )

801 

802 findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0") 

803 

def find_next_sibling(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Find the closest sibling to this PageElement that matches the
    given criteria and appears later in the document.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    # Delegate to _find_one, naming find_next_siblings as the search
    # method.
    search = self.find_next_siblings
    return self._find_one(search, name, attrs, string, **kwargs)

823 

824 findNextSibling = _deprecated_function_alias( 

825 "findNextSibling", "find_next_sibling", "4.0.0" 

826 ) 

827 

def find_next_siblings(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Find all siblings of this `PageElement` that match the given criteria
    and appear later in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.next_siblings
    # Add one to the stack level for this method's own frame.
    depth = _stacklevel + 1
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=depth, **kwargs
    )

859 

860 findNextSiblings = _deprecated_function_alias( 

861 "findNextSiblings", "find_next_siblings", "4.0.0" 

862 ) 

863 fetchNextSiblings = _deprecated_function_alias( 

864 "fetchNextSiblings", "find_next_siblings", "3.0.0" 

865 ) 

866 

def find_previous(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Look backwards in the document from this `PageElement` and find the
    first `PageElement` that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    # Delegate to _find_one, naming find_all_previous as the search
    # method.
    search = self.find_all_previous
    return self._find_one(search, name, attrs, string, **kwargs)

886 

# Legacy spelling of find_previous, registered through
# _deprecated_function_alias (semantics defined in bs4._deprecation).
findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0")

888 

def find_all_previous(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Collect every match found by walking `previous_elements`, i.e. by
    looking backwards in the document from this `PageElement`.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages;
        forwarded to `_find_all` incremented by one.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.previous_elements
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )

920 

# Legacy spellings of find_all_previous, registered through
# _deprecated_function_alias (semantics defined in bs4._deprecation).
findAllPrevious = _deprecated_function_alias(
    "findAllPrevious", "find_all_previous", "4.0.0"
)
fetchAllPrevious = _deprecated_function_alias(
    "fetchAllPrevious", "find_all_previous", "3.0.0"
)

927 

def find_previous_sibling(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Return the closest earlier sibling of this `PageElement` that
    matches the given criteria, or None when nothing matches.

    The search itself is performed by `find_previous_siblings`, limited
    to a single result through `_find_one`.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    sibling_search = self.find_previous_siblings
    return self._find_one(sibling_search, name, attrs, string, **kwargs)

949 

# Legacy spelling of find_previous_sibling, registered through
# _deprecated_function_alias (semantics defined in bs4._deprecation).
findPreviousSibling = _deprecated_function_alias(
    "findPreviousSibling", "find_previous_sibling", "4.0.0"
)

953 

def find_previous_siblings(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Collect every match found by walking `previous_siblings`, i.e. the
    siblings of this `PageElement` that appear earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages;
        forwarded to `_find_all` incremented by one.
    :kwargs: Additional filters on attribute values.
    """
    candidates = self.previous_siblings
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )

985 

# Legacy spellings of find_previous_siblings, registered through
# _deprecated_function_alias (semantics defined in bs4._deprecation).
findPreviousSiblings = _deprecated_function_alias(
    "findPreviousSiblings", "find_previous_siblings", "4.0.0"
)
fetchPreviousSiblings = _deprecated_function_alias(
    "fetchPreviousSiblings", "find_previous_siblings", "3.0.0"
)

992 

def find_parent(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Return the closest parent of this PageElement that matches the
    given criteria, or None when no parent matches.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.
    """
    # NOTE: _find_one cannot be reused here. It passes a `string`
    # argument positionally, and find_parents has no such parameter.
    matches = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None

1020 

# Legacy spelling of find_parent, registered through
# _deprecated_function_alias (semantics defined in bs4._deprecation).
findParent = _deprecated_function_alias("findParent", "find_parent", "4.0.0")

1022 

def find_parents(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Collect every parent of this `PageElement` that matches the given
    criteria, by walking the `parents` iterator.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages;
        forwarded to `_find_all` incremented by one.
    :kwargs: Additional filters on attribute values.
    """
    # There is no `string` filter for parents, so None is passed in the
    # slot where _find_all expects it.
    return self._find_all(
        name,
        attrs,
        None,
        limit,
        self.parents,
        _stacklevel=_stacklevel + 1,
        **kwargs,
    )

1046 

# Legacy spellings of find_parents, registered through
# _deprecated_function_alias (semantics defined in bs4._deprecation).
findParents = _deprecated_function_alias("findParents", "find_parents", "4.0.0")
fetchParents = _deprecated_function_alias("fetchParents", "find_parents", "3.0.0")

1049 

@property
def next(self) -> _AtMostOneElement:
    """Shorthand for `next_element`: the `PageElement`, if any, that was
    parsed just after this one."""
    following = self.next_element
    return following

1054 

@property
def previous(self) -> _AtMostOneElement:
    """Shorthand for `previous_element`: the `PageElement`, if any, that
    was parsed just before this one."""
    preceding = self.previous_element
    return preceding

1059 

1060 # These methods do the real heavy lifting. 

1061 

def _find_one(
    self,
    # TODO-TYPING: "There is no syntax to indicate optional or
    # keyword arguments; such function types are rarely used
    # as callback types." - So, not sure how to get more
    # specific here.
    method: Callable,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Run a find_all-style ``method`` with a limit of one and return
    its first result, or None when it finds nothing.

    :param method: The find_all-style callable to invoke. It is called
        as ``method(name, attrs, string, 1, _stacklevel=4, **kwargs)``.
    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :kwargs: Additional filters on attribute values.
    """
    matches: _QueryResults = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
    if not matches:
        return None
    return matches[0]

1079 

def _find_all(
    self,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Scan the elements produced by ``generator`` and collect the ones
    that match the given criteria.

    :param name: A filter on tag name, or an `ElementFilter` to be used
        as the matcher directly.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The elements to be examined, in order.
    :param _stacklevel: The ``stacklevel`` given to `warnings.warn` for
        the warnings issued here.
    :kwargs: Additional filters on attribute values.
    """
    # The warnings are issued directly from this frame so that
    # _stacklevel keeps pointing at the intended caller.
    if string is None and "text" in kwargs:
        # Honor the old 'text' spelling of the 'string' argument.
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    if "_class" in kwargs:
        # '_class' is not rewritten; the caller is only warned about it.
        substitutions = {"original": "_class", "autocorrect": "class_"}
        warnings.warn(
            AttributeResemblesVariableWarning.MESSAGE % substitutions,
            AttributeResemblesVariableWarning,
            stacklevel=_stacklevel,
        )

    from bs4.filter import ElementFilter

    matcher = (
        name
        if isinstance(name, ElementFilter)
        else SoupStrainer(name, attrs, string, **kwargs)
    )

    result: Iterable[_OneElement]
    only_name_given = string is None and not limit and not attrs and not kwargs
    if only_name_given:
        if name is True or name is None:
            # Optimization to find all tags. The elements are filtered
            # lazily, as the ResultSet consumes them.
            result = (element for element in generator if isinstance(element, Tag))
            return ResultSet(matcher, result)
        if isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(":") == 1:
                # This is a name with a prefix. If this is a namespace-aware
                # document, we need to match the local name against
                # tag.name. If not, we need to match the fully-qualified
                # name against tag.name.
                prefix, _, local_name = name.partition(":")
            else:
                prefix, local_name = None, name
            result = [
                element
                for element in generator
                if isinstance(element, Tag)
                and (
                    element.name == name
                    or (
                        element.name == local_name
                        and (prefix is None or element.prefix == prefix)
                    )
                )
            ]
            return ResultSet(matcher, result)

    # General case: let the matcher do the filtering.
    return matcher.find_all(generator, limit)

1145 

1146 # These generators can be used to navigate starting from both 

1147 # NavigableStrings and Tags. 

@property
def next_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed after this one.

    :yield: A sequence of PageElements.
    """
    upcoming = self.next_element
    while upcoming is not None:
        # Read the follower before yielding, so iteration continues from
        # the link that existed at that moment even if the consumer
        # changes the yielded element.
        upcoming, current = upcoming.next_element, upcoming
        yield current

1156 

@property
def self_and_next_elements(self) -> Iterator[PageElement]:
    """This PageElement, followed by everything in `next_elements`."""
    followers = self.next_elements
    return self._self_and(followers)

1161 

@property
def next_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    later.

    :yield: A sequence of PageElements.
    """
    upcoming = self.next_sibling
    while upcoming is not None:
        # Read the follower before yielding, so iteration continues from
        # the link that existed at that moment even if the consumer
        # changes the yielded element.
        upcoming, current = upcoming.next_sibling, upcoming
        yield current

1172 

@property
def self_and_next_siblings(self) -> Iterator[PageElement]:
    """This PageElement, followed by everything in `next_siblings`."""
    followers = self.next_siblings
    return self._self_and(followers)

1177 

@property
def previous_elements(self) -> Iterator[PageElement]:
    """All PageElements that were parsed before this one, nearest first.

    :yield: A sequence of PageElements.
    """
    earlier = self.previous_element
    while earlier is not None:
        # Read the predecessor before yielding, so iteration continues
        # from the link that existed at that moment even if the consumer
        # changes the yielded element.
        earlier, current = earlier.previous_element, earlier
        yield current

1189 

@property
def self_and_previous_elements(self) -> Iterator[PageElement]:
    """This PageElement, followed by everything in
    `previous_elements`."""
    predecessors = self.previous_elements
    return self._self_and(predecessors)

1195 

@property
def previous_siblings(self) -> Iterator[PageElement]:
    """All PageElements that are siblings of this one but were parsed
    earlier, nearest first.

    :yield: A sequence of PageElements.
    """
    earlier = self.previous_sibling
    while earlier is not None:
        # Read the predecessor before yielding, so iteration continues
        # from the link that existed at that moment even if the consumer
        # changes the yielded element.
        earlier, current = earlier.previous_sibling, earlier
        yield current

1208 

@property
def self_and_previous_siblings(self) -> Iterator[PageElement]:
    """This PageElement, followed by everything in
    `previous_siblings`."""
    predecessors = self.previous_siblings
    return self._self_and(predecessors)

1214 

@property
def parents(self) -> Iterator[Tag]:
    """All elements that are parents of this PageElement, innermost
    first.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    ancestor = self.parent
    while ancestor is not None:
        # Read the next ancestor before yielding, so iteration continues
        # from the link that existed at that moment even if the consumer
        # changes the yielded element.
        ancestor, current = ancestor.parent, ancestor
        yield current

1226 

@property
def self_and_parents(self) -> Iterator[PageElement]:
    """This element, followed by everything in `parents`.

    :yield: A sequence of PageElements, ending with a BeautifulSoup object.
    """
    ancestors = self.parents
    return self._self_and(ancestors)

1234 

def _self_and(self, other_generator: Iterator[PageElement]) -> Iterator[PageElement]:
    """Prepend this element to another generator's output.

    This element is yielded first, unless it is hidden; then everything
    produced by ``other_generator`` is yielded in order.

    :param other_generator: The iterator to be extended.
    """
    include_self = not self.hidden
    if include_self:
        yield self
    for element in other_generator:
        yield element

1243 

@property
def decomposed(self) -> bool:
    """Check whether a PageElement has been decomposed.

    The answer is read from the ``_decomposed`` attribute; a missing or
    falsy attribute is reported as False.
    """
    flag = getattr(self, "_decomposed", False)
    return flag if flag else False

1248 

@_deprecated("next_elements", "4.0.0")
def nextGenerator(self) -> Iterator[PageElement]:
    """Old-style accessor that returns `next_elements`.

    :meta private:
    """
    elements = self.next_elements
    return elements

1253 

@_deprecated("next_siblings", "4.0.0")
def nextSiblingGenerator(self) -> Iterator[PageElement]:
    """Old-style accessor that returns `next_siblings`.

    :meta private:
    """
    siblings = self.next_siblings
    return siblings

1258 

@_deprecated("previous_elements", "4.0.0")
def previousGenerator(self) -> Iterator[PageElement]:
    """Old-style accessor that returns `previous_elements`.

    :meta private:
    """
    elements = self.previous_elements
    return elements

1263 

@_deprecated("previous_siblings", "4.0.0")
def previousSiblingGenerator(self) -> Iterator[PageElement]:
    """Old-style accessor that returns `previous_siblings`.

    :meta private:
    """
    siblings = self.previous_siblings
    return siblings

1268 

@_deprecated("parents", "4.0.0")
def parentGenerator(self) -> Iterator[PageElement]:
    """Old-style accessor that returns `parents`.

    :meta private:
    """
    ancestors = self.parents
    return ancestors

1273 

1274 

class NavigableString(str, PageElement):
    """A Python string that is part of a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    #: A string prepended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '<!--'
    #: in an HTML comment.
    PREFIX: str = ""

    #: A string appended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '-->'
    #: in an HTML comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.

        :param value: The text of the string. Anything that is not a
            `str` is decoded using DEFAULT_OUTPUT_ENCODING.
        """
        if not isinstance(value, str):
            instance = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        else:
            instance = str.__new__(cls, value)
        instance.hidden = False
        instance.setup()
        return instance

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """Copy this string into a new object of the same class.

        The copy has the same contents as the original, but it is built
        from scratch and so is not connected to the parse tree.

        :param memo: Unused.
        :param recursive: This parameter is ignored; it's only defined
            so that NavigableString.__deepcopy__ implements the same
            signature as Tag.__deepcopy__.
        """
        string_class = type(self)
        return string_class(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Supply the plain `str` form of this string as the sole
        argument to `__new__`."""
        plain = str(self)
        return (plain,)

    # TODO-TYPING This should be SupportsIndex|slice but SupportsIndex
    # is introduced in 3.8. This can be changed once 3.7 support is dropped.
    def __getitem__(self, key: Union[int|slice]) -> str:  # type:ignore
        """Index or slice the string like a normal `str`.

        :raises TypeError: If ``key`` is a string, which suggests the
            caller expected a `Tag` and was looking up an attribute.
        """
        if not isinstance(key, str):
            return super().__getitem__(key)
        raise TypeError("string indices must be integers, not '{0}'. Are you treating a NavigableString like a Tag?".format(key.__class__.__name__))

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: This property always returns the `NavigableString` it was
           called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Format the string for inclusion in an HTML or XML document.

        The string is passed through `format_string`, and the result is
        wrapped in this class's PREFIX and SUFFIX.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self) -> None:
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Prevent NavigableString.name from ever being set.

        :raises AttributeError: Always.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This makes it easy for NavigableString to implement methods
        like get_text() as conveniences, creating a consistent
        text-extraction API across all PageElements.

        :param strip: If True, all strings will be stripped before being
            yielded.

        :param types: A single NavigableString subclass or a collection
            of them. If this NavigableString isn't exactly one of those
            classes, the sequence will be empty. When left at the
            default, `Tag.MAIN_CONTENT_STRING_TYPES` is used.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # This is kept in Tag because it's full of subclasses of
            # this class, which aren't defined until later in the file.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        if types is not None:
            # Compare exact classes rather than using isinstance(self,
            # types): all of these classes subclass NavigableString, and
            # anyone using this feature probably wants generic
            # NavigableStrings but not other stuff.
            own_type = type(self)
            if isinstance(types, type):
                # Looking for a single type.
                wanted = own_type is types
            else:
                # Looking for one of a list of types.
                wanted = own_type in types
            if not wanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        This is defined the way it is for compatibility with
        `Tag.strings`. See `Tag` for information on which strings are
        interesting in a given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        own_strings = self._all_strings()
        return own_strings

1431 

1432 

class PreformattedString(NavigableString):
    """A `NavigableString` not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (`Comment`) and CDATA blocks (`CData`).
    """

    PREFIX: str = ""
    SUFFIX: str = ""

    def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
        """Wrap this string, unmodified, in the subclass-specific prefix
        and suffix.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters. The string will be passed into the
            `Formatter`, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
            suffix added on.
        """
        formatter_given = formatter is not None
        if formatter_given:
            # Called only for its side effects; the result is discarded.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX

1458 

1459 

class CData(PreformattedString):
    """A `CDATA section <https://dev.w3.org/html5/spec-LC/syntax.html#cdata-sections>`_.

    Output between ``<![CDATA[`` and ``]]>``.
    """

    PREFIX: str = "<![CDATA["
    SUFFIX: str = "]]>"

1465 

1466 

class ProcessingInstruction(PreformattedString):
    """A SGML processing instruction.

    Output between ``<?`` and ``>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = ">"

1472 

1473 

class XMLProcessingInstruction(ProcessingInstruction):
    """An `XML processing instruction <https://www.w3.org/TR/REC-xml/#sec-pi>`_.

    Output between ``<?`` and ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"

1479 

1480 

class Comment(PreformattedString):
    """An `HTML comment <https://dev.w3.org/html5/spec-LC/syntax.html#comments>`_ or `XML comment <https://www.w3.org/TR/REC-xml/#sec-comments>`_.

    Output between ``<!--`` and ``-->``.
    """

    PREFIX: str = "<!--"
    SUFFIX: str = "-->"

1486 

1487 

class Declaration(PreformattedString):
    """An `XML declaration <https://www.w3.org/TR/REC-xml/#sec-prolog-dtd>`_.

    Output between ``<?`` and ``?>``.
    """

    PREFIX: str = "<?"
    SUFFIX: str = "?>"

1493 

1494 

class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_."""

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Build a `Doctype` for a given root element name, public ID and
        system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        """
        declaration = cls._string_for_name_and_ids(name, pub_id, system_id)
        return Doctype(declaration)

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Generate a string to be used as the basis of a Doctype object.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :return: The name (or an empty string if it is falsy), followed
            by a ``PUBLIC`` or ``SYSTEM`` clause when the corresponding
            identifier is given.
        """
        declaration = name or ""
        if pub_id is None:
            # Without a public ID, a system ID gets the SYSTEM keyword.
            if system_id is not None:
                declaration += ' SYSTEM "%s"' % system_id
            return declaration

        declaration += ' PUBLIC "%s"' % pub_id
        if system_id is not None:
            # After a public ID, the system ID follows without a keyword.
            declaration += ' "%s"' % system_id
        return declaration

    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"

1533 

1534 

class Stylesheet(NavigableString):
    """A `NavigableString` holding the contents of a `<style> HTML
    tag <https://dev.w3.org/html5/spec-LC/Overview.html#the-style-element>`_
    (probably CSS).

    Having a separate class makes it possible to tell embedded
    stylesheets apart from textual content.
    """

1542 

1543 

class Script(NavigableString):
    """A `NavigableString` holding the contents of a `<script>
    HTML tag
    <https://dev.w3.org/html5/spec-LC/Overview.html#the-script-element>`_
    (probably Javascript).

    Having a separate class makes it possible to tell executable code
    apart from textual content.
    """

1552 

1553 

class TemplateString(NavigableString):
    """A `NavigableString` holding a string found inside an `HTML
    <template> tag <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>`_
    embedded in a larger document.

    Having a separate class makes it possible to tell such strings apart
    from the main body of the document.
    """

1561 

1562 

class RubyTextString(NavigableString):
    """A `NavigableString` holding the contents of an `<rt> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_.

    Having a separate class makes it possible to tell such strings apart
    from the strings they're annotating.
    """

1570 

1571 

class RubyParenthesisString(NavigableString):
    """A `NavigableString` holding the contents of an `<rp> HTML
    tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_.
    """

1576 

1577 

1578class Tag(PageElement): 

1579 """An HTML or XML tag that is part of a parse tree, along with its 

1580 attributes, contents, and relationships to other parts of the tree. 

1581 

1582 When Beautiful Soup parses the markup ``<b>penguin</b>``, it will 

1583 create a `Tag` object representing the ``<b>`` tag. You can 

1584 instantiate `Tag` objects directly, but it's not necessary unless 

1585 you're adding entirely new markup to a parsed document. Most of 

1586 the constructor arguments are intended for use by the `TreeBuilder` 

1587 that's parsing a document. 

1588 

1589 :param parser: A `BeautifulSoup` object representing the parse tree this 

1590 `Tag` will be part of. 

1591 :param builder: The `TreeBuilder` being used to build the tree. 

1592 :param name: The name of the tag. 

1593 :param namespace: The URI of this tag's XML namespace, if any. 

1594 :param prefix: The prefix for this tag's XML namespace, if any. 

1595 :param attrs: A dictionary of attribute values. 

1596 :param parent: The `Tag` to use as the parent of this `Tag`. May be 

1597 the `BeautifulSoup` object itself. 

1598 :param previous: The `PageElement` that was parsed immediately before 

1599 parsing this tag. 

1600 :param is_xml: If True, this is an XML tag. Otherwise, this is an 

1601 HTML tag. 

1602 :param sourceline: The line number where this tag was found in its 

1603 source document. 

1604 :param sourcepos: The character position within ``sourceline`` where this 

1605 tag was found. 

1606 :param can_be_empty_element: If True, this tag should be 

1607 represented as <tag/>. If False, this tag should be represented 

1608 as <tag></tag>. 

1609 :param cdata_list_attributes: A dictionary of attributes whose values should 

1610 be parsed as lists of strings if they ever show up on this tag. 

1611 :param preserve_whitespace_tags: Names of tags whose contents 

1612 should have their whitespace preserved if they are encountered inside 

1613 this tag. 

1614 :param interesting_string_types: When iterating over this tag's 

1615 string contents in methods like `Tag.strings` or 

1616 `PageElement.get_text`, these are the types of strings that are 

1617 interesting enough to be considered. By default, 

1618 `NavigableString` (normal strings) and `CData` (CDATA 

1619 sections) are the only interesting string subtypes. 

1620 :param namespaces: A dictionary mapping currently active 

1621 namespace prefixes to URIs, as of the point in the parsing process when 

1622 this tag was encountered. This can be used later to 

1623 construct CSS selectors. 

1624 

1625 """ 

1626 

def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Set up a new tag, taking its configuration from ``builder`` when
    one is given and from the explicit arguments otherwise.

    The meaning of each argument is described in the `Tag` class
    docstring.

    :raises ValueError: If ``name`` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # The source position is stored exactly as passed in, whether or
    # not a builder is present and whatever its store_line_numbers
    # setting is.
    self.sourceline = sourceline
    self.sourcepos = sourcepos

    # Choose the container classes for attributes and for multi-valued
    # attribute values.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        attr_dict_class = XMLAttributeDict if is_xml else HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    elif builder is not None and builder.cdata_list_attributes:
        self.attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs
        )
    else:
        self.attrs = attr_dict_class()
        # Make sure that the values of any multi-valued
        # attributes (e.g. when a Tag is copied) are stored in
        # new lists.
        for k, v in attrs.items():
            if isinstance(v, list):
                v = v.__class__(v)
            self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES

1747 

# Attribute declarations for the benefit of static type checkers. These
# lines assign nothing by themselves; the values are set elsewhere
# (the visible end of the constructor assigns known_xml, contents, hidden
# and the four builder-derived attributes below).
parser_class: Optional[type[BeautifulSoup]]
name: str
namespace: Optional[str]
prefix: Optional[str]
attrs: _AttributeValues
sourceline: Optional[int]
sourcepos: Optional[int]
known_xml: Optional[bool]
contents: List[PageElement]
hidden: bool
interesting_string_types: Optional[Set[Type[NavigableString]]]

# These come from the TreeBuilder when one is supplied to the
# constructor; otherwise from the constructor arguments, which may be None.
can_be_empty_element: Optional[bool]
cdata_list_attributes: Optional[Dict[str, Set[str]]]
preserve_whitespace_tags: Optional[Set[str]]

# Deprecated camelCase spelling of parser_class.
#: :meta private:
parserClass = _deprecated_alias("parserClass", "parser_class", "4.0.0")

1766 

def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = True) -> Self:
    """Build a detached copy of this Tag.

    :param memo: The memo dictionary of the deepcopy protocol; it is
        handed on unchanged to every descendant's ``__deepcopy__``.
    :param recursive: If False, only this tag is copied (via
        `copy_self`) and the copy has no contents.
    :return: The new Tag, unconnected to the parse tree.
    """
    clone = self.copy_self()
    if not recursive:
        return clone

    # Walk the descendants as a flat event stream so that deep trees
    # are copied without recursive function calls. The last entry of
    # open_clones is the copy currently receiving children.
    open_clones: List[Tag] = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # The matching copy is complete; stop appending to it.
            open_clones.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        open_clones[-1].append(copied)

        if event is Tag.START_ELEMENT_EVENT:
            # Children of this element must be appended to its copy.
            open_clones.append(cast(Tag, copied))
    return clone

1792 

def copy_self(self) -> Self:
    """Return a new object of the same class that mirrors this Tag but
    has no contents and belongs to no parse tree.

    `__deepcopy__` starts with this step; it can also be called
    directly to copy a Tag without its contents.
    """
    # No parser and no builder are passed, so the constructor uses the
    # keyword values given here.
    duplicate = type(self)(
        None,
        None,
        self.name,
        self.namespace,
        self.prefix,
        self.attrs,
        is_xml=self._is_xml,
        sourceline=self.sourceline,
        sourcepos=self.sourcepos,
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types,
        namespaces=self._namespaces,
    )
    # Carry over state explicitly, whatever the constructor chose.
    duplicate.can_be_empty_element = self.can_be_empty_element
    duplicate.hidden = self.hidden
    return duplicate

1820 

@property
def is_empty_element(self) -> bool:
    """Whether this tag is an empty-element (self-closing) tag.

    A tag with contents is never an empty-element tag. A tag without
    contents is one only if `can_be_empty_element` is exactly ``True``.
    That flag depends on the `TreeBuilder` used to create the tag:
    a builder with a designated list of empty-element tags (usual for
    HTML) allows only those names, while a builder without such a list
    (usual for XML) allows any tag that has no contents.
    """
    if len(self.contents) > 0:
        return False
    return self.can_be_empty_element is True

1839 

@_deprecated("is_empty_element", "4.0.0")
def isSelfClosing(self) -> bool:
    ": :meta private:"
    # Deprecated spelling; the answer comes from is_empty_element.
    answer = self.is_empty_element
    return answer

1844 

@property
def string(self) -> Optional[str]:
    """The single string inside this `Tag`, if there is exactly one.

    :return: If the only child is a `NavigableString`, that string.
       If the only child is a `Tag`, that child's `Tag.string`
       (applied recursively). ``None`` when there are no children or
       more than one child.

    An unexpected ``None`` usually means the `Tag` contains more than
    one thing.
    """
    children = self.contents
    if len(children) == 1:
        only_child = children[0]
        if isinstance(only_child, NavigableString):
            return only_child
        if isinstance(only_child, Tag):
            return only_child.string
    return None


@string.setter
def string(self, string: str) -> None:
    """Replace the `Tag.contents` of this `Tag` with a single string."""
    self.clear()
    # A NavigableString (or subclass) keeps its own class; a plain str
    # is wrapped in NavigableString.
    string_class = (
        string.__class__ if isinstance(string, NavigableString) else NavigableString
    )
    self.append(string_class(string))

1878 

#: :meta private:
MAIN_CONTENT_STRING_TYPES = {NavigableString, CData}


def _all_strings(
    self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
) -> Iterator[str]:
    """Yield the descendant strings of the selected classes.

    :param strip: If True, each string is stripped before being
       yielded, and strings that strip down to nothing are skipped.

    :param types: Either a single NavigableString subclass or a
       collection of them. Strings whose exact class is not selected
       are ignored. When left at the default, the classes in
       self.interesting_string_types are used; if that is None, the
       classes in MAIN_CONTENT_STRING_TYPES (NavigableString and
       CData) are used, which leaves out comments, processing
       instructions, etc.
    """
    if types is self.default:
        types = (
            self.MAIN_CONTENT_STRING_TYPES
            if self.interesting_string_types is None
            else self.interesting_string_types
        )

    single_type = isinstance(types, type)

    for descendant in self.descendants:
        if not isinstance(descendant, NavigableString):
            continue

        # Matching is on the exact class, not on isinstance.
        kind = type(descendant)
        if single_type:
            wanted = kind is types
        else:
            wanted = types is None or kind in types
        if not wanted:
            continue

        if not strip:
            yield descendant
            continue

        text = descendant.strip()
        if text:
            yield text


strings = property(_all_strings)

1924 

def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once.

    :param position: The numeric position that should be occupied
       in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    for new_child in new_children:
        placed = self._insert(position, new_child)
        inserted.extend(placed)
        # One argument can account for zero or several elements: a
        # BeautifulSoup object is spliced in as its children. Advance
        # by the number actually placed, so the next argument lands
        # right after them instead of in the middle of them (or one
        # slot too far when nothing was placed).
        position += len(placed)
    return inserted

1943 

def _insert(self, position: int, new_child: _InsertableElement) -> List[PageElement]:
    """Insert a single element at ``position`` and relink its neighbors.

    :param position: Target index in this tag's contents; values past
       the end are clamped to the end.
    :param new_child: The element to insert. A plain ``str`` is wrapped
       in a `NavigableString`; a `BeautifulSoup` object is replaced by
       its children.
    :return: A list of the inserted elements.
    :raise ValueError: If ``new_child`` is None or is this tag itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup

    if isinstance(new_child, BeautifulSoup):
        # One BeautifulSoup object must not end up inside another, so
        # insert its children instead and return those.
        return self.insert(position, *list(new_child.contents))

    position = min(position, len(self.contents))
    if hasattr(new_child, "parent") and new_child.parent is not None:
        if new_child.parent is self:
            # The element is already one of this tag's children.
            current_index = self.index(new_child)
            if current_index == position:
                # Already at the requested location: nothing to do.
                return [new_child]
            if current_index < position:
                # Extracting the element below shifts the target
                # index down by one.
                position -= 1
        new_child.extract()

    new_child.parent = self

    # Link to whatever precedes the new child.
    if position == 0:
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        left_neighbor = self.contents[position - 1]
        new_child.previous_sibling = left_neighbor
        left_neighbor.next_sibling = new_child
        new_child.previous_element = left_neighbor._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    # accept_self=True means the result is never None: in the worst
    # case it is new_child itself. The cast quiets mypy below.
    tail = cast(
        PageElement,
        new_child._last_descendant(is_initialized=False, accept_self=True),
    )

    # Link to whatever follows the new child.
    if position >= len(self.contents):
        new_child.next_sibling = None

        # Climb the ancestors until one has a next sibling; that
        # sibling is the next element in the document. If none has
        # one, the new child's last element ends the document and
        # ``following`` stays None.
        ancestor: Optional[Tag] = self
        following = None
        while following is None and ancestor is not None:
            following = ancestor.next_sibling
            ancestor = ancestor.parent
        tail.next_element = following
    else:
        right_neighbor = self.contents[position]
        new_child.next_sibling = right_neighbor
        if right_neighbor is not None:
            right_neighbor.previous_sibling = new_child
        tail.next_element = right_neighbor

    if tail.next_element is not None:
        tail.next_element.previous_element = tail
    self.contents.insert(position, new_child)

    return [new_child]

2030 

def unwrap(self) -> Self:
    """Replace this `PageElement` with its contents.

    :return: This object, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )
    slot = parent.index(self)
    self.extract(_self_index=slot)
    # Insert a snapshot of the children back to front at the same
    # slot, so they end up in their original order.
    for child in self.contents[::-1]:
        parent.insert(slot, child)
    return self


replace_with_children = unwrap

2049 

@_deprecated("unwrap", "4.0.0")
def replaceWithChildren(self) -> _OneElement:
    ": :meta private:"
    # Deprecated spelling; the work is done by unwrap().
    unwrapped = self.unwrap()
    return unwrapped

2054 

def append(self, tag: _InsertableElement) -> PageElement:
    """
    Appends the given `PageElement` to the contents of this `Tag`.

    :param tag: A PageElement.

    :return: The newly appended PageElement.
    """
    end = len(self.contents)
    inserted = self.insert(end, tag)
    return inserted[0]

2064 

def extend(self, tags: Union[Iterable[_InsertableElement], Tag]) -> List[PageElement]:
    """Appends one or more objects to the contents of this `Tag`.

    :param tags: If a list of `PageElement` objects is provided,
        they will be appended to this tag's contents, one at a time.
        If a single `Tag` is provided, its `Tag.contents` will be
        used to extend this object's `Tag.contents`. A single non-Tag
        `PageElement` or ``str`` is accepted with a warning.

    :return: The list of PageElements that were appended.
    """
    tag_list: Iterable[_InsertableElement]

    if isinstance(tags, Tag):
        tag_list = list(tags.contents)
    elif isinstance(tags, (PageElement, str)):
        # append() is the right call for a single item, but make it
        # work anyway.
        warnings.warn(
            "A single non-Tag item was passed into Tag.extend. Use Tag.append instead.",
            UserWarning,
            stacklevel=2,
        )
        if isinstance(tags, str) and not isinstance(tags, PageElement):
            tags = NavigableString(tags)
        tag_list = [tags]
    elif isinstance(tags, Iterable):
        # Appending moves items around the tree, which may change the
        # original collection; iterate over a snapshot instead.
        tag_list = list(tags)

    return [self.append(tag) for tag in tag_list]

2101 

def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    :param decompose: If True, each child's `PageElement.decompose`
        (a more destructive method) is called; otherwise each child's
        `PageElement.extract` is called.
    """
    # Iterate over a snapshot: the loop must not depend on
    # self.contents staying unchanged while children are removed.
    snapshot = list(self.contents)
    for child in snapshot:
        remover = child.decompose if decompose else child.extract
        remover()

2115 

def smooth(self) -> None:
    """Consolidate consecutive strings among the children of this
    `Tag`, and do the same inside every child `Tag`.

    Calling this after many tree modifications can make
    pretty-printed output look more natural.
    """

    def _mergeable(node: PageElement) -> bool:
        # PreformattedString instances are never merged.
        return isinstance(node, NavigableString) and not isinstance(
            node, PreformattedString
        )

    # First pass: record the index of the first member of each adjacent
    # mergeable pair. Recording indexes avoids copying self.contents,
    # since usually very few strings are affected.
    merge_starts = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position + 1 >= len(self.contents):
            # The last child has no successor to merge with.
            continue
        following = self.contents[position + 1]
        if _mergeable(current) and _mergeable(following):
            merge_starts.append(position)

    # Second pass, back to front, so removing items from .contents
    # doesn't disturb the positions still to be processed.
    for position in reversed(merge_starts):
        first = cast(NavigableString, self.contents[position])
        second = cast(NavigableString, self.contents[position + 1])
        second.extract()
        first.replace_with(NavigableString(first + second))

2155 

def index(self, element: PageElement) -> int:
    """Find the index of a child of this `Tag` (by identity, not value).

    Comparing by identity avoids issues when a `Tag` contains two
    children that have string equality.

    :param element: Look for this `PageElement` in this object's contents.
    :raise ValueError: If ``element`` is not among the contents.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element), None
    )
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position

2168 

def get(
    self, key: str, default: Optional[_AttributeValue] = None
) -> Optional[_AttributeValue]:
    """Return the value of the ``key`` attribute of this tag, or
    ``default`` if the tag doesn't have that attribute.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    """
    attributes = self.attrs
    return attributes.get(key, default)

2181 

def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """The same as get(), but always returns a (possibly empty) list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: A list of strings, usually empty or containing only a single
        value.
    """
    value = self.get(key, default)
    if isinstance(value, list):
        # Already a list: hand it back as it is.
        return value

    list_class = self.attribute_value_list_class
    if value is None:
        return list_class()
    # Any other value becomes the only item of a new list.
    return list_class([cast(str, value)])

2204 

def has_attr(self, key: str) -> bool:
    """Report whether this `Tag` has an attribute named ``key``."""
    attributes = self.attrs
    return key in attributes

2208 

def __hash__(self) -> int:
    """Hash this Tag by hashing its string rendering."""
    return hash(str(self))

2211 

def __getitem__(self, key: str) -> _AttributeValue:
    """Return the value of the ``key`` attribute for ``tag[key]``.

    The lookup is a plain subscript of ``attrs``, so a missing key
    raises whatever that mapping raises.
    """
    attributes = self.attrs
    return attributes[key]

2216 

def __iter__(self) -> Iterator[PageElement]:
    """Iterating over a Tag iterates over its contents."""
    children = self.contents
    return iter(children)

2220 

def __len__(self) -> int:
    """The length of a Tag is the length of its list of contents."""
    children = self.contents
    return len(children)

2224 

def __contains__(self, x: Any) -> bool:
    """``x in tag`` tests membership in the tag's contents."""
    children = self.contents
    return x in children

2227 

def __bool__(self) -> bool:
    """A tag is always truthy, even if it has no contents."""
    # Defining this stops truth-testing from using the length of the
    # contents.
    return True

2231 

def __setitem__(self, key: str, value: _AttributeValue) -> None:
    """``tag[key] = value`` sets the value of the ``key`` attribute
    for the tag."""
    attributes = self.attrs
    attributes[key] = value

2236 

def __delitem__(self, key: str) -> None:
    """``del tag[key]`` removes the ``key`` attribute from the tag.

    Deleting an attribute the tag doesn't have is not an error.
    """
    attributes = self.attrs
    attributes.pop(key, None)

2240 

def __call__(
    self,
    name: Optional[_StrainableElement] = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Calling a Tag like a function is the same as calling its
    find_all() method. E.g. tag('a') returns whatever
    tag.find_all('a') returns."""
    # Every argument is forwarded untouched, in find_all's order.
    found = self.find_all(
        name, attrs, recursive, string, limit, _stacklevel, **kwargs
    )
    return found

2257 

def __getattr__(self, subtag: str) -> Optional[Tag]:
    """Calling tag.subtag is the same as calling tag.find(name="subtag")

    :raise AttributeError: For names that start with a double
        underscore and for the name "contents".
    """
    result: _AtMostOneElement
    if len(subtag) > 3 and subtag.endswith("Tag"):
        # BS3 spelling: soup.aTag -> soup.find("a")
        tag_name = subtag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")'
            % dict(name=tag_name),
            DeprecationWarning,
            stacklevel=2,
        )
        result = self.find(tag_name)
    elif subtag.startswith("__") or subtag == "contents":
        # "contents" is special-cased to avoid recursion.
        raise AttributeError(
            "'%s' object has no attribute '%s'" % (self.__class__, subtag)
        )
    else:
        result = self.find(subtag)
    return cast(Optional[Tag], result)

2280 

def __eq__(self, other: Any) -> bool:
    """Returns true iff this Tag has the same name, the same attributes,
    and the same contents (recursively) as `other`."""
    if self is other:
        return True
    if not isinstance(other, Tag):
        return False

    comparable = (
        hasattr(other, "name")
        and hasattr(other, "attrs")
        and hasattr(other, "contents")
    )
    if not comparable:
        return False
    if (
        self.name != other.name
        or self.attrs != other.attrs
        or len(self) != len(other)
    ):
        return False

    # The lengths match, so the children can be compared pairwise.
    return not any(
        mine != theirs for mine, theirs in zip(self.contents, other.contents)
    )

2301 

def __ne__(self, other: Any) -> bool:
    """Returns true iff this Tag is not identical to `other`,
    as defined in __eq__."""
    same = self == other
    return not same

2306 

def __repr__(self) -> str:
    """Renders this `Tag` as a string."""
    rendered = self.decode()
    return rendered


__str__ = __unicode__ = __repr__

2312 

def encode(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    indent_level: Optional[int] = None,
    formatter: _FormatterOrName = "minimal",
    errors: str = "xmlcharrefreplace",
) -> bytes:
    """Render this `Tag` and its contents as a bytestring.

    :param encoding: The encoding to use when converting to
       a bytestring. This may also affect the text of the document,
       specifically any encoding declarations within the document.
    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) This is used internally in recursive calls while
       pretty-printing.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        :py:meth:`str.encode` and its value should be one of the `error
        handling constants defined by Python's codecs module
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
    """
    # Render to a Unicode string first, then encode that string.
    text = self.decode(indent_level, encoding, formatter)
    return text.encode(encoding, errors)

2342 

def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) ``None`` turns pretty-printing off.
    :param eventual_encoding: The encoding you intend to use when
       converting the string to a bytestring. decode() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
       naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
       parse tree. This is only used by `Tag.decode_contents` and
       you probably won't need to use it.
    """
    # Resolve a formatter name once, up front, so the lookup isn't
    # repeated.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    if indent_level is True:
        indent_level = 0

    rendered = []

    # The tag that switched on string literal mode, if any. While it
    # is open, its children are emitted as they are, without
    # pretty-printing. The mode starts right after the tag opens and
    # ends right before it closes, so whitespace can still go before
    # and after the tag itself.
    literal_owner = None

    for event, element in self._event_stream(iterator):
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Default: whitespace on both sides outside string literal
        # mode, none on either side inside it. Tags such as <pre> and
        # <script> must not gain whitespace, because that would change
        # the meaning of their content.
        indent_before = indent_after = not literal_owner

        # The exceptions are the two tags that enter and leave string
        # literal mode.
        if (
            event is Tag.START_ELEMENT_EVENT
            and not literal_owner
            and not cast(Tag, element)._should_pretty_print()
        ):
            # Entering the mode: whitespace before this tag but not
            # after it. The mode lasts until this tag is closed.
            indent_before, indent_after = True, False
            literal_owner = element
        elif event is Tag.END_ELEMENT_EVENT and element is literal_owner:
            # Leaving the mode: whitespace after this tag but not
            # before it.
            indent_before, indent_after = False, True
            literal_owner = None

        # Apply the decision, but only when pretty-printing.
        if indent_level is not None:
            if indent_before or indent_after:
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            if event is Tag.START_ELEMENT_EVENT:
                indent_level += 1
        rendered.append(piece)
    return "".join(rendered)

2451 

class _TreeTraversalEvent:
    """Marker type for the events produced while traversing a parse
    tree. Instances carry no data; they are told apart by identity.

    :meta private:
    """


# Stand-ins for the different events yielded by _event_stream
START_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
END_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
EMPTY_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:
STRING_ELEMENT_EVENT = _TreeTraversalEvent()  #: :meta private:

2464 

def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
        the tree. Defaults to ``self.self_and_descendants``.
    :return: ``(event, element)`` pairs, where ``event`` is one of
        Tag.START_ELEMENT_EVENT, Tag.END_ELEMENT_EVENT,
        Tag.EMPTY_ELEMENT_EVENT or Tag.STRING_ELEMENT_EVENT.
    """
    tag_stack: List[Tag] = []

    iterator = iterator or self.self_and_descendants

    for c in iterator:
        # If the parent of the element we're about to yield is not
        # the tag currently on the stack, it means that the tag on
        # the stack closed before this element appeared.
        #
        # This is an identity test on purpose. `!=` would go through
        # Tag.__eq__, which compares two tags structurally (name,
        # attributes and, recursively, contents): extra work, and
        # recursion in a method that exists to avoid recursion.
        while tag_stack and c.parent is not tag_stack[-1]:
            now_closed_tag = tag_stack.pop()
            yield Tag.END_ELEMENT_EVENT, now_closed_tag

        if isinstance(c, Tag):
            if c.is_empty_element:
                yield Tag.EMPTY_ELEMENT_EVENT, c
            else:
                yield Tag.START_ELEMENT_EVENT, c
                # Stays open until an element with another parent
                # shows up, or the iterator runs out.
                tag_stack.append(c)
        else:
            yield Tag.STRING_ELEMENT_EVENT, c

    # Close whatever is still open, innermost first.
    while tag_stack:
        now_closed_tag = tag_stack.pop()
        yield Tag.END_ELEMENT_EVENT, now_closed_tag

2508 

def _indent_string(
    self,
    s: str,
    indent_level: int,
    formatter: Formatter,
    indent_before: bool,
    indent_after: bool,
) -> str:
    """Add indentation whitespace before and/or after a string.

    :param s: The string to amend with whitespace.
    :param indent_level: The indentation level; ``formatter.indent``
        is repeated this many times in front of the string.
    :param formatter: Supplies ``indent``, the whitespace for one level.
    :param indent_before: Whether or not to add whitespace
        before the string.
    :param indent_after: Whether or not to add whitespace
        (a newline) after the string.
    """
    # Level 0 is falsy, so it adds no leading whitespace.
    prefix = formatter.indent * indent_level if indent_before and indent_level else ""
    suffix = "\n" if indent_after else ""
    return prefix + s + suffix

2536 

def _format_tag(
    self, eventual_encoding: str, formatter: Formatter, opening: bool
) -> str:
    """Render the opening or closing markup of this tag.

    :param eventual_encoding: Encoding substituted into attribute
        values that are `AttributeValueWithCharsetSubstitution`
        objects; ``None`` skips the substitution.
    :param formatter: Supplies the attribute list, attribute value
        formatting and quoting, and the void-element close prefix.
    :param opening: True for the opening tag (attributes included),
        False for the closing tag.
    :return: The markup, or "" if this tag is hidden.
    """
    if self.hidden:
        # A hidden tag is invisible, although its contents are visible.
        return ""

    # "/" directly after "<" marks a closing tag.
    closing_slash = "" if opening else "/"

    # Optional namespace prefix.
    prefix = self.prefix + ":" if self.prefix else ""

    # Attributes appear only in an opening tag.
    attribute_string = ""
    if opening:
        rendered_attrs = []
        for key, val in formatter.attributes(self):
            if val is None:
                # No value at all: emit the bare attribute name.
                rendered_attrs.append(key)
                continue

            if isinstance(val, (list, tuple)):
                val = " ".join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.substitute_encoding(eventual_encoding)

            text = formatter.attribute_value(val)
            rendered_attrs.append(
                str(key) + "=" + formatter.quoted_attribute_value(text)
            )
        if rendered_attrs:
            attribute_string = " " + " ".join(rendered_attrs)

    # Optional closing slash (for a void element in an XML document).
    void_element_closing_slash = (
        (formatter.void_element_close_prefix or "") if self.is_empty_element else ""
    )

    return (
        "<"
        + closing_slash
        + prefix
        + self.name
        + attribute_string
        + void_element_closing_slash
        + ">"
    )

2598 

2599 def _should_pretty_print(self, indent_level: int = 1) -> bool: 

2600 """Should this tag be pretty-printed? 

2601 

2602 Most of them should, but some (such as <pre> in HTML 

2603 documents) should not. 

2604 """ 

2605 return indent_level is not None and ( 

2606 not self.preserve_whitespace_tags 

2607 or self.name not in self.preserve_whitespace_tags 

2608 ) 

2609 

@overload
def prettify(
    self,
    encoding: None = None,
    formatter: _FormatterOrName = "minimal",
) -> str:
    ...

@overload
def prettify(
    self,
    encoding: _Encoding,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    ...

def prettify(
    self,
    encoding: Optional[_Encoding] = None,
    formatter: _FormatterOrName = "minimal",
) -> Union[str, bytes]:
    """Return a pretty-printed rendering of this `Tag`.

    Rendering always starts at indentation level 0.

    :param encoding: Name of the encoding for a bytestring result;
        leave it as None to get a Unicode string instead.
    :param formatter: A Formatter object, or the name of one of the
        standard formatters.
    :return: A bytestring when ``encoding`` is given, otherwise a string.
    """
    if encoding is not None:
        return self.encode(encoding=encoding, indent_level=0, formatter=formatter)
    return self.decode(indent_level=0, formatter=formatter)

2643 

def decode_contents(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> str:
    """Render what is inside this tag as a Unicode string.

    This delegates to decode(), handing it this tag's descendants as
    the iterator to render.

    :param indent_level: How many levels each rendered line is
        indented. (What one 'level' looks like in spaces or other
        characters is up to the formatter.) Used internally by
        recursive calls during pretty-printing.

    :param eventual_encoding: The encoding the result is destined
        for. decode_contents() does *not* perform that encoding; the
        name is only needed so a real encoding can be substituted in
        when the document carries an encoding declaration (e.g. in a
        <meta> tag).

    :param formatter: A `Formatter` object, or the name of one of
        the standard Formatters.
    """
    below_this_tag = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter, iterator=below_this_tag
    )

2671 

def encode_contents(
    self,
    indent_level: Optional[int] = None,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
) -> bytes:
    """Render what is inside this PageElement as a bytestring.

    The contents are first rendered to a string by decode_contents()
    and that string is then encoded.

    :param indent_level: How many levels each rendered line is
        indented. (What one 'level' looks like in spaces or other
        characters is up to the ``formatter``.) Used internally by
        recursive calls during pretty-printing.
    :param encoding: The encoding of the returned bytestring. It is
        also passed to decode_contents() as the eventual encoding.
    :param formatter: Either a `Formatter` object, or the name of one
        of the standard formatters.
    """
    return self.decode_contents(indent_level, encoding, formatter).encode(encoding)

2691 

@_deprecated("encode_contents", "4.0.0")
def renderContents(
    self,
    encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    prettyPrint: bool = False,
    indentLevel: Optional[int] = 0,
) -> bytes:
    """Deprecated BS3-compatible spelling of encode_contents().

    ``indentLevel`` is only honoured when ``prettyPrint`` is true;
    otherwise encode_contents() is called with no indent level.

    :meta private:
    """
    effective_level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=effective_level, encoding=encoding)

2706 

2707 # Soup methods 

2708 

def find(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Look in the children of this PageElement and return the first
    PageElement that matches the given criteria, or None.

    This is find_all() with a limit of one result.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    """
    matches = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None

2737 

# Deprecated camelCase alias for find(), built by
# _deprecated_function_alias (old name, current name, and presumably
# the version in which the old name was deprecated).
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")

2739 

def find_all(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    limit: Optional[int] = None,
    _stacklevel: int = 2,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Look in the children of this `PageElement` and return every
    `PageElement` object that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :param limit: Stop looking after finding this many results.
    :param _stacklevel: Used internally to improve warning messages.
    :kwargs: Additional filters on attribute values.
    """
    # Search the whole subtree, or only the direct children.
    candidates = self.descendants if recursive else self.children
    return self._find_all(
        name, attrs, string, limit, candidates, _stacklevel=_stacklevel + 1, **kwargs
    )

2771 

# Deprecated camelCase aliases for find_all(), built by
# _deprecated_function_alias (old name, current name, and presumably
# the version in which the old name was deprecated).
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")

2774 

2775 # Generator methods 

@property
def children(self) -> Iterator[PageElement]:
    """Iterate over the direct children of this `PageElement`, i.e.
    the items of ``self.contents``, in order.
    """
    return (child for child in self.contents)

2780 

@property
def self_and_descendants(self) -> Iterator[PageElement]:
    """Iterate over this `Tag` and then everything beneath it.

    The descendants come from the `descendants` property, in the
    order that property yields them.
    """
    below_this_tag = self.descendants
    return self._self_and(below_this_tag)

2787 

@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`.

    The walk starts at the first child and follows each element's
    ``next_element`` link until it steps past this tag's last
    descendant. Nothing is yielded when the tag has no contents.

    NOTE(review): assuming ``next_element`` links elements in document
    order, nested elements are yielded right after their parent
    (a depth-first walk), not in a breadth-first sequence.
    """
    if not len(self.contents):
        # No children, so there is nothing to walk.
        return
    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, last_descendant will end up
    # as self.
    last_descendant = cast(PageElement, self._last_descendant(accept_self=True))
    # Whatever follows the last descendant marks the end of the walk.
    stopNode = last_descendant.next_element
    current: _AtMostOneElement = self.contents[0]
    while current is not stopNode and current is not None:
        # Read the link before yielding, so the walk resumes from the
        # successor recorded here even if the consumer changes
        # current.next_element while it is holding the element.
        successor = current.next_element
        yield current
        current = successor

2805 

2806 # CSS selector code 

def select_one(
    self, selector: str, namespaces: Optional[Dict[str, str]] = None, **kwargs: Any
) -> Optional[Tag]:
    """Run a CSS selector against the current element and return a
    single result.

    This is a shortcut for ``self.css.select_one(...)``.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)

2823 

def select(
    self,
    selector: str,
    namespaces: Optional[Dict[str, str]] = None,
    limit: int = 0,
    **kwargs: Any,
) -> ResultSet[Tag]:
    """Run a CSS selector against the current element and return the
    matches.

    This uses the SoupSieve library, and is a shortcut for
    ``self.css.select(...)``.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)

2848 

@property
def css(self) -> CSS:
    """Return an interface to the CSS selector API.

    A new `CSS` object wrapping this element is built on every access.
    """
    return CSS(self)

2853 

2854 # Old names for backwards compatibility 

@_deprecated("children", "4.0.0")
def childGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns what the `children` property
    returns.

    :meta private:
    """
    return self.children

2862 

@_deprecated("descendants", "4.0.0")
def recursiveChildGenerator(self) -> Iterator[PageElement]:
    """Deprecated generator; returns what the `descendants` property
    returns.

    :meta private:
    """
    return self.descendants

2870 

@_deprecated("has_attr", "4.0.0")
def has_key(self, key: str) -> bool:
    """Deprecated method; forwards to has_attr().

    The name was kind of misleading, because has_key() looked at
    attributes while the ``in`` operator (``__contains__``) looks at
    contents.

    has_key() is gone in Python 3, anyway.

    :param key: The attribute name to look for.

    :meta private:
    """
    return self.has_attr(key)

2881 

2882 

# Type variable for the kind of element a ResultSet holds; it is
# bounded by PageElement, so only PageElement and its subclasses fit.
_PageElementT = TypeVar("_PageElementT", bound=PageElement)

2884 

2885 

class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects, gathered as the result of
    matching an :py:class:`ElementFilter` against a parse tree.
    Basically, a list of search results.
    """

    # The filter handed to the constructor, or None.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Fill the list from ``result`` and remember ``source``.

        :param source: The filter associated with these results, if any.
        :param result: The elements to put in the list.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        Python only calls this for attributes that normal lookup did
        not find, so it always raises.

        :raises AttributeError: Always.
        """
        message = (
            f'ResultSet object has no attribute "{key}". '
            "You're probably treating a list of elements like a single element. "
            "Did you call find_all() when you meant to call find()?"
        )
        raise AttributeError(message)

2905 

2906 

2907# Now that all the classes used by SoupStrainer have been defined, 

2908# import SoupStrainer itself into this module to preserve the 

2909# backwards compatibility of anyone who imports 

2910# bs4.element.SoupStrainer. 

2911from bs4.filter import SoupStrainer # noqa: E402