Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/__init__.py: 64%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

419 statements  

1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". 

2 

3http://www.crummy.com/software/BeautifulSoup/ 

4 

5Beautiful Soup uses a pluggable XML or HTML parser to parse a 

6(possibly invalid) document into a tree representation. Beautiful Soup 

7provides methods and Pythonic idioms that make it easy to navigate, 

8search, and modify the parse tree. 

9 

10Beautiful Soup works with Python 3.7 and up. It works better if lxml 

11and/or html5lib is installed, but they are not required. 

12 

13For more than you ever wanted to know about Beautiful Soup, see the 

14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 

15""" 

16 

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.13.5"
__copyright__ = "Copyright (c) 2004-2025 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

# Public names exported by this package. Each name appears exactly once;
# "AttributeResemblesVariableWarning" used to be listed both at the top of
# the list and again under "Warnings".
__all__ = [
    "BeautifulSoup",
    "Comment",
    "Declaration",
    "ProcessingInstruction",
    "ResultSet",
    "CSS",
    "Script",
    "Stylesheet",
    "Tag",
    "TemplateString",
    "ElementFilter",
    "UnicodeDammit",
    "CData",
    "Doctype",

    # Exceptions
    "FeatureNotFound",
    "ParserRejectedMarkup",
    "StopParsing",

    # Warnings
    "AttributeResemblesVariableWarning",
    "GuessedAtParserWarning",
    "MarkupResemblesLocatorWarning",
    "UnusualUsageWarning",
    "XMLParsedAsHTMLWarning",
]

52 

53from collections import Counter 

54import io 

55import sys 

56import warnings 

57 

# The very first thing we do is give a useful error if someone is
# running this code under Python 2.
if sys.version_info.major < 3:
    raise ImportError(
        "You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3."
    )

64 

65from .builder import ( 

66 builder_registry, 

67 TreeBuilder, 

68) 

69from .builder._htmlparser import HTMLParserTreeBuilder 

70from .dammit import UnicodeDammit 

71from .css import CSS 

72from ._deprecation import ( 

73 _deprecated, 

74) 

75from .element import ( 

76 CData, 

77 Comment, 

78 DEFAULT_OUTPUT_ENCODING, 

79 Declaration, 

80 Doctype, 

81 NavigableString, 

82 PageElement, 

83 ProcessingInstruction, 

84 PYTHON_SPECIFIC_ENCODINGS, 

85 ResultSet, 

86 Script, 

87 Stylesheet, 

88 Tag, 

89 TemplateString, 

90) 

91from .formatter import Formatter 

92from .filter import ( 

93 ElementFilter, 

94 SoupStrainer, 

95) 

96from typing import ( 

97 Any, 

98 cast, 

99 Counter as CounterType, 

100 Dict, 

101 Iterator, 

102 List, 

103 Sequence, 

104 Sized, 

105 Optional, 

106 Type, 

107 Union, 

108) 

109 

110from bs4._typing import ( 

111 _Encoding, 

112 _Encodings, 

113 _IncomingMarkup, 

114 _InsertableElement, 

115 _RawAttributeValue, 

116 _RawAttributeValues, 

117 _RawMarkup, 

118) 

119 

120# Import all warnings and exceptions into the main package. 

121from bs4.exceptions import ( 

122 FeatureNotFound, 

123 ParserRejectedMarkup, 

124 StopParsing, 

125) 

126from bs4._warnings import ( 

127 AttributeResemblesVariableWarning, 

128 GuessedAtParserWarning, 

129 MarkupResemblesLocatorWarning, 

130 UnusualUsageWarning, 

131 XMLParsedAsHTMLWarning, 

132) 

133 

134 

135class BeautifulSoup(Tag): 

136 """A data structure representing a parsed HTML or XML document. 

137 

138 Most of the methods you'll call on a BeautifulSoup object are inherited from 

139 PageElement or Tag. 

140 

141 Internally, this class defines the basic interface called by the 

142 tree builders when converting an HTML/XML document into a data 

143 structure. The interface abstracts away the differences between 

144 parsers. To write a new tree builder, you'll need to understand 

145 these methods as a whole. 

146 

147 These methods will be called by the BeautifulSoup constructor: 

148 * reset() 

149 * feed(markup) 

150 

151 The tree builder may call these methods from its feed() implementation: 

152 * handle_starttag(name, attrs) # See note about return value 

153 * handle_endtag(name) 

154 * handle_data(data) # Appends to the current data node 

155 * endData(containerClass) # Ends the current data node 

156 

157 No matter how complicated the underlying parser is, you should be 

158 able to build a tree using 'start tag' events, 'end tag' events, 

159 'data' events, and "done with data" events. 

160 

161 If you encounter an empty-element tag (aka a self-closing tag, 

162 like HTML's <br> tag), call handle_starttag and then 

163 handle_endtag. 

164 """ 

165 

166 #: Since `BeautifulSoup` subclasses `Tag`, it's possible to treat it as 

167 #: a `Tag` with a `Tag.name`. However, this name makes it clear the

168 #: `BeautifulSoup` object isn't a real markup tag. 

169 ROOT_TAG_NAME: str = "[document]" 

170 

171 #: If the end-user gives no indication which tree builder they 

172 #: want, look for one with these features. 

173 DEFAULT_BUILDER_FEATURES: Sequence[str] = ["html", "fast"] 

174 

175 #: A string containing all ASCII whitespace characters, used

176 #: during parsing to detect data chunks that seem 'empty'. 

177 ASCII_SPACES: str = "\x20\x0a\x09\x0c\x0d" 

178 

179 # FUTURE PYTHON: 

180 element_classes: Dict[Type[PageElement], Type[PageElement]] #: :meta private: 

181 builder: TreeBuilder #: :meta private: 

182 is_xml: bool 

183 known_xml: Optional[bool] 

184 parse_only: Optional[SoupStrainer] #: :meta private: 

185 

186 # These members are only used while parsing markup. 

187 markup: Optional[_RawMarkup] #: :meta private: 

188 current_data: List[str] #: :meta private: 

189 currentTag: Optional[Tag] #: :meta private: 

190 tagStack: List[Tag] #: :meta private: 

191 open_tag_counter: CounterType[str] #: :meta private: 

192 preserve_whitespace_tag_stack: List[Tag] #: :meta private: 

193 string_container_stack: List[Tag] #: :meta private: 

194 _most_recent_element: Optional[PageElement] #: :meta private: 

195 

196 #: Beautiful Soup's best guess as to the character encoding of the 

197 #: original document. 

198 original_encoding: Optional[_Encoding] 

199 

200 #: The character encoding, if any, that was explicitly defined 

201 #: in the original document. This may or may not match 

202 #: `BeautifulSoup.original_encoding`. 

203 declared_html_encoding: Optional[_Encoding] 

204 

205 #: This is True if the markup that was parsed contains 

206 #: U+FFFD REPLACEMENT_CHARACTER characters which were not present 

207 #: in the original markup. These mark character sequences that 

208 #: could not be represented in Unicode. 

209 contains_replacement_characters: bool 

210 

def __init__(
    self,
    markup: _IncomingMarkup = "",
    features: Optional[Union[str, Sequence[str]]] = None,
    builder: Optional[Union[TreeBuilder, Type[TreeBuilder]]] = None,
    parse_only: Optional[SoupStrainer] = None,
    from_encoding: Optional[_Encoding] = None,
    exclude_encodings: Optional[_Encodings] = None,
    element_classes: Optional[Dict[Type[PageElement], Type[PageElement]]] = None,
    **kwargs: Any,
):
    """Constructor.

    :param markup: A string or a file-like object representing
     markup to be parsed.

    :param features: Desirable features of the parser to be
     used. This may be the name of a specific parser ("lxml",
     "lxml-xml", "html.parser", or "html5lib") or it may be the
     type of markup to be used ("html", "html5", "xml"). It's
     recommended that you name a specific parser, so that
     Beautiful Soup gives you the same results across platforms
     and virtual environments.

    :param builder: A TreeBuilder subclass to instantiate (or
     instance to use) instead of looking one up based on
     `features`. You only need to use this if you've implemented a
     custom TreeBuilder.

    :param parse_only: A SoupStrainer. Only parts of the document
     matching the SoupStrainer will be considered. This is useful
     when parsing part of a document that would otherwise be too
     large to fit into memory.

    :param from_encoding: A string indicating the encoding of the
     document to be parsed. Pass this in if Beautiful Soup is
     guessing wrongly about the document's encoding.

    :param exclude_encodings: A list of strings indicating
     encodings known to be wrong. Pass this in if you don't know
     the document's encoding but you know Beautiful Soup's guess is
     wrong.

    :param element_classes: A dictionary mapping BeautifulSoup
     classes like Tag and NavigableString, to other classes you'd
     like to be instantiated instead as the parse tree is
     built. This is useful for subclassing Tag or NavigableString
     to modify default behavior.

    :param kwargs: For backwards compatibility purposes, the
     constructor accepts certain keyword arguments used in
     Beautiful Soup 3. None of these arguments do anything in
     Beautiful Soup 4; they will result in a warning and then be
     ignored.

     Apart from this, any keyword arguments passed into the
     BeautifulSoup constructor are propagated to the TreeBuilder
     constructor. This makes it possible to configure a
     TreeBuilder by passing in arguments, not just by saying which
     one to use.

    :raises FeatureNotFound: If no registered tree builder offers the
     requested `features`.
    :raises TypeError: If `markup` is not a string, a bytestring, an
     object with a ``read`` method, or an object with ``__len__``.
    :raises ParserRejectedMarkup: If the tree builder rejected every
     candidate version of the markup offered by ``prepare_markup``.
    """
    # Beautiful Soup 3 arguments that do nothing in Beautiful Soup 4.
    # Each one is removed from kwargs (so it is never forwarded to the
    # TreeBuilder constructor) and reported with a warning.
    ignored_bs3_arguments = (
        (
            "convertEntities",
            "BS4 does not respect the convertEntities argument to the "
            "BeautifulSoup constructor. Entities are always converted "
            "to Unicode characters.",
        ),
        (
            "markupMassage",
            "BS4 does not respect the markupMassage argument to the "
            "BeautifulSoup constructor. The tree builder is responsible "
            "for any necessary markup massage.",
        ),
        (
            "smartQuotesTo",
            "BS4 does not respect the smartQuotesTo argument to the "
            "BeautifulSoup constructor. Smart quotes are always converted "
            "to Unicode characters.",
        ),
        (
            "selfClosingTags",
            "Beautiful Soup 4 does not respect the selfClosingTags argument to the "
            "BeautifulSoup constructor. The tree builder is responsible "
            "for understanding self-closing tags.",
        ),
        (
            "isHTML",
            "Beautiful Soup 4 does not respect the isHTML argument to the "
            "BeautifulSoup constructor. Suggest you use "
            "features='lxml' for HTML and features='lxml-xml' for "
            "XML.",
        ),
    )
    for bs3_name, message in ignored_bs3_arguments:
        if bs3_name in kwargs:
            del kwargs[bs3_name]
            warnings.warn(message)

    def deprecated_argument(old_name: str, new_name: str) -> Optional[Any]:
        """Pop a renamed argument out of kwargs, warning about the rename.

        Returns the popped value, or None if `old_name` was not passed.
        """
        if old_name in kwargs:
            warnings.warn(
                'The "%s" argument to the BeautifulSoup constructor '
                'was renamed to "%s" in Beautiful Soup 4.0.0'
                % (old_name, new_name),
                DeprecationWarning,
                stacklevel=3,
            )
            return kwargs.pop(old_name)
        return None

    # The old-style name is only consulted when the new-style argument
    # is falsy.
    parse_only = parse_only or deprecated_argument("parseOnlyThese", "parse_only")
    if parse_only is not None:
        # Issue a warning if we can tell in advance that
        # parse_only will exclude the entire tree.
        if parse_only.excludes_everything:
            warnings.warn(
                f"The given value for parse_only will exclude everything: {parse_only}",
                UserWarning,
                stacklevel=3,
            )

    from_encoding = from_encoding or deprecated_argument(
        "fromEncoding", "from_encoding"
    )

    if from_encoding and isinstance(markup, str):
        warnings.warn(
            "You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored."
        )
        from_encoding = None

    self.element_classes = element_classes or dict()

    # We need this information to track whether or not the builder
    # was specified well enough that we can omit the 'you need to
    # specify a parser' warning.
    original_builder = builder
    original_features = features

    builder_class: Optional[Type[TreeBuilder]] = None
    if isinstance(builder, type):
        # A builder class was passed in; it needs to be instantiated.
        builder_class = builder
        builder = None
    elif builder is None:
        if isinstance(features, str):
            features = [features]
        if features is None or len(features) == 0:
            features = self.DEFAULT_BUILDER_FEATURES
        possible_builder_class = builder_registry.lookup(*features)
        if possible_builder_class is None:
            raise FeatureNotFound(
                "Couldn't find a tree builder with the features you "
                "requested: %s. Do you need to install a parser library?"
                % ",".join(features)
            )
        builder_class = possible_builder_class

    # At this point either we have a TreeBuilder instance in
    # builder, or we have a builder_class that we can instantiate
    # with the remaining **kwargs.
    if builder is None:
        assert builder_class is not None
        builder = builder_class(**kwargs)
        if (
            not original_builder
            and not (
                original_features == builder.NAME
                or (
                    isinstance(original_features, str)
                    and original_features in builder.ALTERNATE_NAMES
                )
            )
            and markup
        ):
            # The user did not tell us which TreeBuilder to use,
            # and we had to guess. Issue a warning.
            if builder.is_xml:
                markup_type = "XML"
            else:
                markup_type = "HTML"

            # This code adapted from warnings.py so that we get the same line
            # of code as our warnings.warn() call gets, even if the answer is wrong
            # (as it may be in a multithreading situation).
            caller = None
            try:
                caller = sys._getframe(1)
            except ValueError:
                pass
            # Named caller_globals so the builtin globals() is not shadowed.
            if caller:
                caller_globals = caller.f_globals
                line_number = caller.f_lineno
            else:
                caller_globals = sys.__dict__
                line_number = 1
            filename = caller_globals.get("__file__")
            if filename and filename.lower().endswith((".pyc", ".pyo")):
                # Drop the trailing "c"/"o" to name the .py source file.
                filename = filename[:-1]
            if filename:
                # If there is no filename at all, the user is most likely in a REPL,
                # and the warning is not necessary.
                values = dict(
                    filename=filename,
                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type,
                )
                warnings.warn(
                    GuessedAtParserWarning.MESSAGE % values,
                    GuessedAtParserWarning,
                    stacklevel=2,
                )
    else:
        if kwargs:
            warnings.warn(
                "Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`."
            )

    self.builder = builder
    self.is_xml = builder.is_xml
    self.known_xml = self.is_xml
    self._namespaces = dict()
    self.parse_only = parse_only

    if hasattr(markup, "read"):  # It's a file-type object.
        markup = cast(io.IOBase, markup).read()
    elif not isinstance(markup, (bytes, str)) and not hasattr(markup, "__len__"):
        raise TypeError(
            f"Incoming markup is of an invalid type: {markup!r}. Markup must be a string, a bytestring, or an open filehandle."
        )
    elif isinstance(markup, Sized) and len(markup) <= 256 and (
        (isinstance(markup, bytes) and b"<" not in markup and b"\n" not in markup)
        or (isinstance(markup, str) and "<" not in markup and "\n" not in markup)
    ):
        # Issue warnings for a couple beginner problems
        # involving passing non-markup to Beautiful Soup.
        # Beautiful Soup will still parse the input as markup,
        # since that is sometimes the intended behavior.
        if not self._markup_is_url(markup):
            self._markup_resembles_filename(markup)

    # At this point we know markup is a string or bytestring. If
    # it was a file-type object, we've read from it.
    markup = cast(_RawMarkup, markup)

    # Try each candidate the builder offers until one parses. The loop
    # targets assign straight onto self so that _feed() sees them.
    rejections = []
    success = False
    for (
        self.markup,
        self.original_encoding,
        self.declared_html_encoding,
        self.contains_replacement_characters,
    ) in self.builder.prepare_markup(
        markup, from_encoding, exclude_encodings=exclude_encodings
    ):
        self.reset()
        self.builder.initialize_soup(self)
        try:
            self._feed()
            success = True
            break
        except ParserRejectedMarkup as e:
            # Remember why this candidate failed and try the next one.
            rejections.append(e)

    if not success:
        other_exceptions = [str(e) for e in rejections]
        raise ParserRejectedMarkup(
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n "
            + "\n ".join(other_exceptions)
        )

    # Clear out the markup and remove the builder's circular
    # reference to this object.
    self.markup = None
    self.builder.soup = None

494 

def copy_self(self) -> "BeautifulSoup":
    """Create a new BeautifulSoup object with the same TreeBuilder,
    but not associated with any markup.

    This is the first step of the deepcopy process.

    :return: A new object of the same class as this one, constructed
     from empty markup and this object's builder.
    """
    clone = type(self)("", None, self.builder)

    # Keep track of the encoding of the original document,
    # since we won't be parsing it again.
    clone.original_encoding = self.original_encoding
    return clone

507 

def __getstate__(self) -> Dict[str, Any]:
    """Build the state dictionary used to pickle this object.

    The parse tree is not stored directly: ``contents`` is emptied and
    the document is stored as a Unicode string under ``markup``.

    :return: A copy of ``__dict__`` adjusted as described above.
    """
    # Frequently a tree builder can't be pickled. In that case store
    # its class instead of the instance.
    state = dict(self.__dict__)
    if state.get("builder") is not None and not self.builder.picklable:
        state["builder"] = type(self.builder)
    # Store the contents as a Unicode string.
    state["contents"] = []
    state["markup"] = self.decode()

    # If _most_recent_element is present, it's a Tag object left
    # over from initial parse. It might not be picklable and we
    # don't need it.
    state.pop("_most_recent_element", None)
    return state

523 

def __setstate__(self, state: Dict[str, Any]) -> None:
    """Restore this object from a state dictionary.

    Adopts `state` as ``__dict__``, makes sure a TreeBuilder instance is
    available, then resets the object and re-parses the stored markup.

    :param state: A state dictionary; its ``builder`` entry may be a
     TreeBuilder instance, a TreeBuilder class, or a falsy value.
    """
    # If necessary, restore the TreeBuilder by looking it up.
    self.__dict__ = state
    if isinstance(self.builder, type):
        # Only the builder's class was stored; instantiate it.
        self.builder = self.builder()
    elif not self.builder:
        # We don't know which builder was used to build this
        # parse tree, so use a default we know is always available.
        self.builder = HTMLParserTreeBuilder()
    self.builder.soup = self
    self.reset()
    self._feed()

536 

@classmethod
@_deprecated(
    replaced_by="nothing (private method, will be removed)", version="4.13.0"
)
def _decode_markup(cls, markup: _RawMarkup) -> str:
    """Ensure `markup` is Unicode so it's safe to send into warnings.warn.

    warnings.warn had this problem back in 2010 but fortunately
    not anymore. This has not been used for a long time; I just
    noticed that fact while working on 4.13.0.

    :param markup: A string or bytestring.
    :return: `markup` unchanged if it is not a bytestring; otherwise
     the bytes decoded as UTF-8, with undecodable sequences replaced.
    """
    if isinstance(markup, bytes):
        return markup.decode("utf-8", "replace")
    return markup

553 

@classmethod
def _markup_is_url(cls, markup: _RawMarkup) -> bool:
    """Error-handling method to raise a warning if incoming markup looks
    like a URL.

    Markup "looks like a URL" when it starts with ``http:`` or
    ``https:`` and contains no space character.

    :param markup: A string of markup.
    :return: Whether or not the markup resembled a URL
     closely enough to justify issuing a warning.
    """
    looks_like_url: bool
    if isinstance(markup, bytes):
        looks_like_url = (
            markup.startswith((b"http:", b"https:")) and b" " not in markup
        )
    elif isinstance(markup, str):
        looks_like_url = (
            markup.startswith(("http:", "https:")) and " " not in markup
        )
    else:
        return False

    if not looks_like_url:
        return False
    warnings.warn(
        MarkupResemblesLocatorWarning.URL_MESSAGE % dict(what="URL"),
        MarkupResemblesLocatorWarning,
        stacklevel=3,
    )
    return True

585 

@classmethod
def _markup_resembles_filename(cls, markup: _RawMarkup) -> bool:
    """Error-handling method to issue a warning if incoming markup
    resembles a filename.

    :param markup: A string of markup.
    :return: Whether or not the markup resembled a filename
     closely enough to justify issuing a warning.
    """
    markup_b: bytes

    # We're only checking ASCII characters, so rather than write
    # the same tests twice, convert Unicode to a bytestring and
    # operate on the bytestring.
    if isinstance(markup, str):
        markup_b = markup.encode("utf8")
    else:
        markup_b = markup

    # Step 1: does it end with a common textual file extension?
    extensions = (b".html", b".htm", b".xml", b".xhtml", b".txt")
    if not markup_b.lower().endswith(extensions):
        return False

    # Step 2: it _might_ be a file, but there are a few things
    # we can look for that aren't very common in filenames.

    # Characters that have special meaning to Unix shells. (< was
    # excluded before this method was called.)
    #
    # Many of these are also reserved characters that cannot
    # appear in Windows filenames.
    for byte in markup_b:
        if byte in b"?*#&;>$|":
            return False

    # Two consecutive forward slashes (as seen in a URL) or two
    # consecutive spaces (as seen in fixed-width data).
    #
    # (Paths to Windows network shares contain consecutive
    # backslashes, so checking that doesn't seem as helpful.)
    if b"//" in markup_b:
        return False
    # This literal must hold TWO spaces; a single space is perfectly
    # normal in a filename and must not suppress the warning.
    if b"  " in markup_b:
        return False

    # A colon in any position other than position 1 (e.g. after a
    # Windows drive letter).
    if markup_b.startswith(b":"):
        return False
    colon_i = markup_b.rfind(b":")
    if colon_i not in (-1, 1):
        return False

    # Step 3: If it survived all of those checks, it's similar
    # enough to a file to justify issuing a warning.
    warnings.warn(
        MarkupResemblesLocatorWarning.FILENAME_MESSAGE % dict(what="filename"),
        MarkupResemblesLocatorWarning,
        stacklevel=3,
    )
    return True

652 

def _feed(self) -> None:
    """Internal method that parses previously set markup, creating a large
    number of Tag and NavigableString objects.
    """
    # Put the tree builder back into its initial state before parsing.
    self.builder.reset()

    if self.markup is not None:
        self.builder.feed(self.markup)
    # Close out any unfinished strings and close all the open tags.
    self.endData()
    while (
        self.currentTag is not None and self.currentTag.name != self.ROOT_TAG_NAME
    ):
        self.popTag()

668 

def reset(self) -> None:
    """Reset this object to a state as though it had never parsed any
    markup.

    Re-runs ``Tag.__init__`` for this object under the name
    `ROOT_TAG_NAME`, resets the builder, clears all parse-time
    bookkeeping, and pushes this object onto the (now empty) tag stack.
    """
    Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
    self.hidden = True
    self.builder.reset()
    self.current_data = []
    self.currentTag = None
    self.tagStack = []
    self.open_tag_counter = Counter()
    self.preserve_whitespace_tag_stack = []
    self.string_container_stack = []
    self._most_recent_element = None
    # currentTag is None here, so this only puts the soup on the stack
    # and makes it the current tag.
    self.pushTag(self)

684 

def new_tag(
    self,
    name: str,
    namespace: Optional[str] = None,
    nsprefix: Optional[str] = None,
    attrs: Optional[_RawAttributeValues] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    string: Optional[str] = None,
    **kwattrs: _RawAttributeValue,
) -> Tag:
    """Create a new Tag associated with this BeautifulSoup object.

    :param name: The name of the new Tag.
    :param namespace: The URI of the new Tag's XML namespace, if any.
    :param nsprefix: The prefix for the new Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values; can
     be used instead of ``kwattrs`` for attributes like 'class'
     that are reserved words in Python. Values given here are applied
     after ``kwattrs``.
    :param sourceline: The line number where this tag was
     (purportedly) found in its source document.
    :param sourcepos: The character position within ``sourceline`` where this
     tag was (purportedly) found.
    :param string: String content for the new Tag, if any.
    :param kwattrs: Keyword arguments for the new Tag's attribute values.
    :return: The newly created Tag (or the class registered for Tag in
     ``element_classes``).
    """
    attr_container = self.builder.attribute_dict_class(**kwattrs)
    if attrs is not None:
        attr_container.update(attrs)
    tag_class = self.element_classes.get(Tag, Tag)

    # Assume that this is either Tag or a subclass of Tag. If not,
    # the user brought type-unsafety upon themselves.
    tag_class = cast(Type[Tag], tag_class)
    tag = tag_class(
        None,
        self.builder,
        name,
        namespace,
        nsprefix,
        attr_container,
        sourceline=sourceline,
        sourcepos=sourcepos,
    )

    if string is not None:
        tag.string = string
    return tag

734 

def string_container(
    self, base_class: Optional[Type[NavigableString]] = None
) -> Type[NavigableString]:
    """Find the class that should be instantiated to hold a given kind of
    string.

    This may be a built-in Beautiful Soup class or a custom class passed
    in to the BeautifulSoup constructor.

    :param base_class: The class that would normally be used;
     NavigableString if omitted.
    :return: The class to instantiate.
    """
    container = base_class or NavigableString

    # The user may want us to use some other class (hopefully a
    # custom subclass) instead of the one we'd use normally.
    container = cast(
        Type[NavigableString], self.element_classes.get(container, container)
    )

    # On top of that, we may be inside a tag that needs a special
    # container class. This only applies when nothing more specific
    # than plain NavigableString has been chosen so far.
    if self.string_container_stack and container is NavigableString:
        container = self.builder.string_containers.get(
            self.string_container_stack[-1].name, container
        )
    return container

759 

def new_string(
    self, s: str, subclass: Optional[Type[NavigableString]] = None
) -> NavigableString:
    """Create a new `NavigableString` associated with this `BeautifulSoup`
    object.

    :param s: The string content of the `NavigableString`
    :param subclass: The subclass of `NavigableString`, if any, to
     use. If a document is being processed, an appropriate
     subclass for the current location in the document will
     be determined automatically.
    :return: An instance of the class chosen by `string_container`.
    """
    container = self.string_container(subclass)
    return container(s)

774 

def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
    it because there is nothing before or after it in the parse tree.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError(
        "BeautifulSoup objects don't support insert_before()."
    )

782 

def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
    it because there is nothing before or after it in the parse tree.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

788 

def popTag(self) -> Optional[Tag]:
    """Internal method called by _popToTag when a tag is closed.

    Removes the top of the tag stack and updates the per-name open tag
    counter and the whitespace/string-container stacks to match.

    :return: None if the tag stack was already empty; otherwise the
     value of ``currentTag`` after the pop.

    :meta private:
    """
    if not self.tagStack:
        # Nothing to pop. This shouldn't happen.
        return None
    tag = self.tagStack.pop()
    if tag.name in self.open_tag_counter:
        self.open_tag_counter[tag.name] -= 1
    if (
        self.preserve_whitespace_tag_stack
        and tag == self.preserve_whitespace_tag_stack[-1]
    ):
        self.preserve_whitespace_tag_stack.pop()
    if self.string_container_stack and tag == self.string_container_stack[-1]:
        self.string_container_stack.pop()
    # print("Pop", tag.name)
    # If the stack is now empty, currentTag is deliberately left as it was.
    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag

811 

def pushTag(self, tag: Tag) -> None:
    """Internal method called by handle_starttag when a tag is opened.

    Appends `tag` to the current tag's contents (if there is a current
    tag), makes it the new current tag, and updates the open tag counter
    and the whitespace/string-container stacks.

    :param tag: The tag being opened.

    :meta private:
    """
    # print("Push", tag.name)
    if self.currentTag is not None:
        self.currentTag.contents.append(tag)
    self.tagStack.append(tag)
    self.currentTag = self.tagStack[-1]
    # The root pseudo-tag is not counted as an open tag.
    if tag.name != self.ROOT_TAG_NAME:
        self.open_tag_counter[tag.name] += 1
    if tag.name in self.builder.preserve_whitespace_tags:
        self.preserve_whitespace_tag_stack.append(tag)
    if tag.name in self.builder.string_containers:
        self.string_container_stack.append(tag)

828 

def endData(self, containerClass: Optional[Type[NavigableString]] = None) -> None:
    """Method called by the TreeBuilder when the end of a data segment
    occurs.

    Does nothing if no data has been collected since the last call.

    :param containerClass: The class to use when incorporating the
     data segment into the parse tree.

    :meta private:
    """
    if not self.current_data:
        return

    current_data = "".join(self.current_data)
    # If whitespace is not preserved, and this string contains
    # nothing but ASCII spaces, replace it with a single space
    # or newline.
    if not self.preserve_whitespace_tag_stack and all(
        char in self.ASCII_SPACES for char in current_data
    ):
        current_data = "\n" if "\n" in current_data else " "

    # Reset the data collector.
    self.current_data = []

    # Should we add this string to the tree at all?
    if (
        self.parse_only
        and len(self.tagStack) <= 1
        and (not self.parse_only.allow_string_creation(current_data))
    ):
        return

    containerClass = self.string_container(containerClass)
    o = containerClass(current_data)
    self.object_was_parsed(o)

869 

def object_was_parsed(
    self,
    o: PageElement,
    parent: Optional[Tag] = None,
    most_recent_element: Optional[PageElement] = None,
) -> None:
    """Method called by the TreeBuilder to integrate an object into the
    parse tree.

    :param o: The element to add.
    :param parent: The tag to append `o` to; defaults to ``currentTag``.
    :param most_recent_element: The element to use as `o`'s previous
     element; defaults to the element most recently passed to this
     method.

    :meta private:
    """
    if parent is None:
        parent = self.currentTag
    assert parent is not None
    previous_element: Optional[PageElement]
    if most_recent_element is not None:
        previous_element = most_recent_element
    else:
        previous_element = self._most_recent_element

    # A Tag may already carry linkage of its own; keep it.
    next_element = previous_sibling = next_sibling = None
    if isinstance(o, Tag):
        next_element = o.next_element
        next_sibling = o.next_sibling
        previous_sibling = o.previous_sibling
        if previous_element is None:
            previous_element = o.previous_element

    # Must be computed before `o` is linked in and appended below.
    fix = parent.next_element is not None

    o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

    self._most_recent_element = o
    parent.contents.append(o)

    # Check if we are inserting into an already parsed node.
    if fix:
        self._linkage_fixer(parent)

908 

def _linkage_fixer(self, el: Tag) -> None:
    """Make sure the linkage around the last child of ``el`` is sound.

    `object_was_parsed` calls this after appending an object to a tag
    whose ``next_element`` was already set, i.e. a tag that had
    already been linked into the tree.

    :param el: The tag whose most recently appended child needs its
        element/sibling links repaired. It must have at least one
        child.
    """
    first = el.contents[0]
    child = el.contents[-1]
    descendant: PageElement = child

    if child is first and el.parent is not None:
        # Parent should be linked to first child
        el.next_element = child
        # We are no longer linked to whatever this element is
        prev_el = child.previous_element
        if prev_el is not None and prev_el is not el:
            prev_el.next_element = None
        # First child should be linked to the parent, and no previous siblings.
        child.previous_element = el
        child.previous_sibling = None

    # We have no sibling as we've been appended as the last.
    child.next_sibling = None

    # If the child is a tag with contents, the element that precedes
    # whatever comes after ``el`` is the child's last descendant, not
    # the child itself.
    if isinstance(child, Tag) and child.contents:
        # _last_descendant is typed as returning Optional[PageElement],
        # but the value can't be None here, because child is a Tag
        # which we know has contents.
        descendant = cast(PageElement, child._last_descendant(False))

    # As the final step, link the last descendant. It should be linked
    # to the parent's next sibling (if found), else walk up the chain
    # and find a parent with a sibling. It should have no next sibling.
    descendant.next_element = None
    descendant.next_sibling = None

    target: Optional[Tag] = el
    while target is not None:
        following = target.next_sibling
        if following is not None:
            descendant.next_element = following
            # The back-link must point at the same node whose
            # next_element was just set, so that next_element and
            # previous_element stay mirror images of each other.
            following.previous_element = descendant
            break
        target = target.parent

952 

def _popToTag(
    self, name: str, nsprefix: Optional[str] = None, inclusivePop: bool = True
) -> Optional[Tag]:
    """Pop the tag stack back to the most recent open tag with the
    given name and namespace prefix.

    If no tag with the given name is currently open, nothing is
    popped. The entry at the bottom of the stack is never popped.

    :param name: Pop up to the most recent tag with this name.
    :param nsprefix: The namespace prefix that goes with `name`.
    :param inclusivePop: If this is false, pop the tag stack up to
        but *not* including the most recent instance of the given
        tag.
    :return: The last tag that was popped, or None if nothing was
        popped.

    :meta private:
    """
    if name == self.ROOT_TAG_NAME:
        # The BeautifulSoup object itself can never be popped.
        return None

    last_popped = None
    depth = len(self.tagStack) - 1
    # Walk down from the top of the stack, stopping short of index 0
    # and giving up as soon as no tag called `name` remains open.
    while depth > 0 and self.open_tag_counter.get(name):
        candidate = self.tagStack[depth]
        found = name == candidate.name and nsprefix == candidate.prefix
        if not found or inclusivePop:
            last_popped = self.popTag()
        if found:
            break
        depth -= 1

    return last_popped

989 

def handle_starttag(
    self,
    name: str,
    namespace: Optional[str],
    nsprefix: Optional[str],
    attrs: _RawAttributeValues,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    namespaces: Optional[Dict[str, str]] = None,
) -> Optional[Tag]:
    """Create a tag in response to an opening tag reported by the
    tree builder, and make it the current tag.

    :param name: Name of the tag.
    :param namespace: Namespace passed through to the new tag object.
    :param nsprefix: Namespace prefix for the tag.
    :param attrs: A dictionary of attribute values. Note that
        attribute values are expected to be simple strings; processing
        of multi-valued attributes such as "class" comes later.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param namespaces: A dictionary of all namespace prefix mappings
        currently in scope in the document.

    If this method returns None, the tag was rejected by an active
    `ElementFilter`. You should proceed as if the tag had not occurred
    in the document. For instance, if this was a self-closing tag,
    don't call handle_endtag.

    :meta private:
    """
    # Flush any text collected before this tag.
    self.endData()

    # The filter is only consulted while nothing beyond the bottom of
    # the tag stack is open.
    if self.parse_only and len(self.tagStack) <= 1:
        if not self.parse_only.allow_tag_creation(nsprefix, name, attrs):
            return None

    # Assume that the registered class is either Tag or a subclass of
    # Tag. If not, the user brought type-unsafety upon themselves.
    tag_class = cast(Type[Tag], self.element_classes.get(Tag, Tag))
    tag = tag_class(
        self,
        self.builder,
        name,
        namespace,
        nsprefix,
        attrs,
        self.currentTag,
        self._most_recent_element,
        sourceline=sourceline,
        sourcepos=sourcepos,
        namespaces=namespaces,
    )
    if tag is None:
        return tag

    # Link the previously parsed element forward to the new tag, then
    # make the new tag both the most recent element and the current tag.
    previous = self._most_recent_element
    if previous is not None:
        previous.next_element = tag
    self._most_recent_element = tag
    self.pushTag(tag)
    return tag

1055 

def handle_endtag(self, name: str, nsprefix: Optional[str] = None) -> None:
    """Process a closing tag reported by the tree builder.

    Pending text is flushed with `endData`, then the tag stack is
    popped back to the most recent open tag matching ``name`` and
    ``nsprefix`` via `_popToTag`.

    :param name: Name of the tag.
    :param nsprefix: Namespace prefix for the tag.

    :meta private:
    """
    # Text seen so far belongs inside the tag being closed, so it has
    # to be flushed before the stack is popped.
    self.endData()
    self._popToTag(name, nsprefix)

1067 

def handle_data(self, data: str) -> None:
    """Buffer a chunk of textual data reported by the tree builder.

    The chunk is only collected here; `endData` later joins the
    collected chunks into a single string.

    :param data: The text that was encountered.

    :meta private:
    """
    self.current_data.append(data)

1075 

def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: Union[Formatter, str] = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
    **kwargs: Any,
) -> str:
    """Returns a string representation of the parse tree
    as a full HTML or XML document.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing. A bool is still accepted for backwards
        compatibility (True is treated as 0, False as None) but
        triggers a `DeprecationWarning`.
    :param eventual_encoding: The encoding of the final document.
        If this is None, the document will be a Unicode string.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :param kwargs: Only the deprecated ``pretty_print`` keyword is
        accepted; it is translated to ``indent_level`` and triggers a
        `DeprecationWarning`.
    :return: The rendered document, preceded by an XML declaration
        when this object represents an XML document.
    """
    if self.is_xml:
        # Print the XML declaration
        encoding_part = ""
        declared_encoding: Optional[str] = eventual_encoding
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            # This is a special Python encoding; it can't actually
            # go into an XML document because it means nothing
            # outside of Python.
            declared_encoding = None
        if declared_encoding is not None:
            encoding_part = ' encoding="%s"' % declared_encoding
        prefix = '<?xml version="1.0"%s?>\n' % encoding_part
    else:
        prefix = ""

    # Prior to 4.13.0, the first argument to this method was a
    # bool called pretty_print, which gave the method a different
    # signature from its superclass implementation, Tag.decode.
    #
    # The signatures of the two methods now match, but just in
    # case someone is still passing a boolean in as the first
    # argument to this method (or a keyword argument with the old
    # name), we can handle it and put out a DeprecationWarning.
    warning: Optional[str] = None
    if isinstance(indent_level, bool):
        indent_level = 0 if indent_level else None
        warning = f"As of 4.13.0, the first argument to BeautifulSoup.decode has been changed from bool to int, to match Tag.decode. Pass in a value of {indent_level} instead."
    else:
        pretty_print = kwargs.pop("pretty_print", None)
        assert not kwargs
        if pretty_print is not None:
            # Only the exact values True and False are translated;
            # any other value leaves indent_level untouched but
            # still triggers the warning.
            if pretty_print is True:
                indent_level = 0
            elif pretty_print is False:
                indent_level = None
            warning = f"As of 4.13.0, the pretty_print argument to BeautifulSoup.decode has been removed, to match Tag.decode. Pass in a value of indent_level={indent_level} instead."

    # Every legacy spelling has been normalized above and has set
    # `warning`, so no further fallback handling is needed here.
    if warning:
        warnings.warn(warning, DeprecationWarning, stacklevel=2)

    return prefix + super(BeautifulSoup, self).decode(
        indent_level, eventual_encoding, formatter, iterator
    )

1148 

1149 

# Short aliases that make it easier to get started quickly,
# e.g. 'from bs4 import _soup'.
_s = _soup = BeautifulSoup

1153 

1154 

class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser.

    Creating an instance forces ``features="xml"`` and emits a
    `DeprecationWarning`.
    """

    def __init__(self, *args: Any, **kwargs: Any):
        # Any caller-supplied "features" value is overridden.
        kwargs["features"] = "xml"
        message = (
            "The BeautifulStoneSoup class was deprecated in version 4.0.0. Instead of using "
            'it, pass features="xml" into the BeautifulSoup constructor.'
        )
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        super().__init__(*args, **kwargs)

1167 

1168 

# When this file is executed as a script, it acts as an HTML
# pretty-printer: markup is read from standard input and the
# prettified document is written to standard output.
if __name__ == "__main__":
    import sys

    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())

1174 print((soup.prettify()))