Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/__init__.py: 66%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

422 statements  

1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". 

2 

3http://www.crummy.com/software/BeautifulSoup/ 

4 

5Beautiful Soup uses a pluggable XML or HTML parser to parse a 

6(possibly invalid) document into a tree representation. Beautiful Soup 

7provides methods and Pythonic idioms that make it easy to navigate, 

8search, and modify the parse tree. 

9 

10Beautiful Soup works with Python 3.7 and up. It works better if lxml 

11and/or html5lib is installed, but they are not required. 

12 

13For more than you ever wanted to know about Beautiful Soup, see the 

14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 

15""" 

16 

17__author__ = "Leonard Richardson (leonardr@segfault.org)" 

18__version__ = "4.15.0" 

19__copyright__ = "Copyright (c) 2004-2026 Leonard Richardson" 

20# Use of this source code is governed by the MIT license. 

21__license__ = "MIT" 

22 

# Names exported by ``from bs4 import *``. Each name appears exactly once;
# warnings and exceptions are grouped under their own headings.
__all__ = [
    "BeautifulSoup",
    "Comment",
    "Declaration",
    "ProcessingInstruction",
    "ResultSet",
    "CSS",
    "Script",
    "Stylesheet",
    "Tag",
    "TemplateString",
    "ElementFilter",
    "UnicodeDammit",
    "CData",
    "Doctype",

    # Exceptions
    "FeatureNotFound",
    "ParserRejectedMarkup",
    "StopParsing",

    # Warnings
    "AttributeResemblesVariableWarning",
    "GuessedAtParserWarning",
    "MarkupResemblesLocatorWarning",
    "UnusualUsageWarning",
    "XMLParsedAsHTMLWarning",
]

52 

53from collections import Counter 

54import io 

55import sys 

56import warnings 

57 

# Before importing anything else from the package, fail fast with a
# helpful message if this code is being run under Python 2.
if sys.version_info < (3,):
    raise ImportError(
        "You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3."
    )

64 

65from .builder import ( 

66 builder_registry, 

67 TreeBuilder, 

68) 

69from .builder._htmlparser import HTMLParserTreeBuilder 

70from .dammit import UnicodeDammit 

71from .css import CSS 

72from ._deprecation import ( 

73 _deprecated, 

74) 

75from .element import ( 

76 CData, 

77 Comment, 

78 DEFAULT_OUTPUT_ENCODING, 

79 Declaration, 

80 Doctype, 

81 NavigableString, 

82 PageElement, 

83 ProcessingInstruction, 

84 PYTHON_SPECIFIC_ENCODINGS, 

85 ResultSet, 

86 Script, 

87 Stylesheet, 

88 Tag, 

89 TemplateString, 

90) 

91from .formatter import Formatter 

92from .filter import ( 

93 ElementFilter, 

94 SoupStrainer, 

95) 

96from typing import ( 

97 Any, 

98 cast, 

99 Counter as CounterType, 

100 Dict, 

101 Iterator, 

102 List, 

103 Sequence, 

104 Sized, 

105 Optional, 

106 Type, 

107 Union, 

108) 

109 

110from bs4._typing import ( 

111 _Encoding, 

112 _Encodings, 

113 _IncomingMarkup, 

114 _InsertableElement, 

115 _RawAttributeValue, 

116 _RawAttributeValues, 

117 _RawMarkup, 

118) 

119 

120# Import all warnings and exceptions into the main package. 

121from bs4.exceptions import ( 

122 FeatureNotFound, 

123 ParserRejectedMarkup, 

124 StopParsing, 

125) 

126from bs4._warnings import ( 

127 AttributeResemblesVariableWarning, 

128 GuessedAtParserWarning, 

129 MarkupResemblesLocatorWarning, 

130 UnusualUsageWarning, 

131 XMLParsedAsHTMLWarning, 

132) 

133 

134 

135class BeautifulSoup(Tag): 

136 """A data structure representing a parsed HTML or XML document. 

137 

138 Most of the methods you'll call on a BeautifulSoup object are inherited from 

139 PageElement or Tag. 

140 

141 Internally, this class defines the basic interface called by the 

142 tree builders when converting an HTML/XML document into a data 

143 structure. The interface abstracts away the differences between 

144 parsers. To write a new tree builder, you'll need to understand 

145 these methods as a whole. 

146 

147 These methods will be called by the BeautifulSoup constructor: 

148 * reset() 

149 * feed(markup) 

150 

151 The tree builder may call these methods from its feed() implementation: 

152 * handle_starttag(name, attrs) # See note about return value 

153 * handle_endtag(name) 

154 * handle_data(data) # Appends to the current data node 

155 * endData(containerClass) # Ends the current data node 

156 

157 No matter how complicated the underlying parser is, you should be 

158 able to build a tree using 'start tag' events, 'end tag' events, 

159 'data' events, and "done with data" events. 

160 

161 If you encounter an empty-element tag (aka a self-closing tag, 

162 like HTML's <br> tag), call handle_starttag and then 

163 handle_endtag. 

164 """ 

165 

166 #: Since `BeautifulSoup` subclasses `Tag`, it's possible to treat it as 

167 #: a `Tag` with a `Tag.name`. However, this name makes it clear the 

168 #: `BeautifulSoup` object isn't a real markup tag. 

169 ROOT_TAG_NAME: str = "[document]" 

170 

171 #: If the end-user gives no indication which tree builder they 

172 #: want, look for one with these features. 

173 DEFAULT_BUILDER_FEATURES: Sequence[str] = ["html", "fast"] 

174 

175 #: A string containing all ASCII whitespace characters, used 

176 #: during parsing to detect data chunks that seem 'empty'. 

177 ASCII_SPACES: str = "\x20\x0a\x09\x0c\x0d" 

178 

179 # FUTURE PYTHON: 

180 element_classes: Dict[Type[PageElement], Type[PageElement]] #: :meta private: 

181 builder: TreeBuilder #: :meta private: 

182 is_xml: bool 

183 known_xml: Optional[bool] 

184 parse_only: Optional[SoupStrainer] #: :meta private: 

185 

186 # These members are only used while parsing markup. 

187 markup: Optional[_RawMarkup] #: :meta private: 

188 current_data: List[str] #: :meta private: 

189 currentTag: Optional[Tag] #: :meta private: 

190 tagStack: List[Tag] #: :meta private: 

191 open_tag_counter: CounterType[str] #: :meta private: 

192 preserve_whitespace_tag_stack: List[Tag] #: :meta private: 

193 string_container_stack: List[Tag] #: :meta private: 

194 _most_recent_element: Optional[PageElement] #: :meta private: 

195 

196 #: Beautiful Soup's best guess as to the character encoding of the 

197 #: original document. 

198 original_encoding: Optional[_Encoding] 

199 

200 #: The character encoding, if any, that was explicitly defined 

201 #: in the original document. This may or may not match 

202 #: `BeautifulSoup.original_encoding`. 

203 declared_html_encoding: Optional[_Encoding] 

204 

205 #: This is True if the markup that was parsed contains 

206 #: U+FFFD REPLACEMENT_CHARACTER characters which were not present 

207 #: in the original markup. These mark character sequences that 

208 #: could not be represented in Unicode. 

209 contains_replacement_characters: bool 

210 

def __init__(
    self,
    markup: _IncomingMarkup = "",
    features: Optional[Union[str, Sequence[str]]] = None,
    builder: Optional[Union[TreeBuilder, Type[TreeBuilder]]] = None,
    parse_only: Optional[SoupStrainer] = None,
    from_encoding: Optional[_Encoding] = None,
    exclude_encodings: Optional[_Encodings] = None,
    element_classes: Optional[Dict[Type[PageElement], Type[PageElement]]] = None,
    **kwargs: Any,
):
    """Constructor.

    Selects (or instantiates) a TreeBuilder, normalizes the incoming
    markup to a string or bytestring, and then feeds each candidate
    produced by the builder's ``prepare_markup()`` to the parser until
    one of them is accepted.

    :param markup: A string or a file-like object representing
     markup to be parsed.

    :param features: Desirable features of the parser to be
     used. This may be the name of a specific parser ("lxml",
     "lxml-xml", "html.parser", or "html5lib") or it may be the
     type of markup to be used ("html", "html5", "xml"). It's
     recommended that you name a specific parser, so that
     Beautiful Soup gives you the same results across platforms
     and virtual environments.

    :param builder: A TreeBuilder subclass to instantiate (or
     instance to use) instead of looking one up based on
     `features`. You only need to use this if you've implemented a
     custom TreeBuilder.

    :param parse_only: A SoupStrainer. Only parts of the document
     matching the SoupStrainer will be considered. This is useful
     when parsing part of a document that would otherwise be too
     large to fit into memory.

    :param from_encoding: A string indicating the encoding of the
     document to be parsed. Pass this in if Beautiful Soup is
     guessing wrongly about the document's encoding.

    :param exclude_encodings: A list of strings indicating
     encodings known to be wrong. Pass this in if you don't know
     the document's encoding but you know Beautiful Soup's guess is
     wrong.

    :param element_classes: A dictionary mapping BeautifulSoup
     classes like Tag and NavigableString, to other classes you'd
     like to be instantiated instead as the parse tree is
     built. This is useful for subclassing Tag or NavigableString
     to modify default behavior.

    :param kwargs: For backwards compatibility purposes, the
     constructor accepts certain keyword arguments used in
     Beautiful Soup 3. None of these arguments do anything in
     Beautiful Soup 4; they will result in a warning and then be
     ignored.

     Apart from this, any keyword arguments passed into the
     BeautifulSoup constructor are propagated to the TreeBuilder
     constructor. This makes it possible to configure a
     TreeBuilder by passing in arguments, not just by saying which
     one to use.

    :raises FeatureNotFound: If no registered tree builder offers the
     requested features.
    :raises TypeError: If ``markup`` is not a string, a bytestring, a
     file-like object, or something with a length.
    :raises ParserRejectedMarkup: If the parser rejected every
     candidate the builder prepared from ``markup``.
    """
    # Beautiful Soup 3 arguments that no longer do anything: strip
    # each one from kwargs (so it never reaches the TreeBuilder
    # constructor) and warn about it.
    if "convertEntities" in kwargs:
        del kwargs["convertEntities"]
        warnings.warn(
            "BS4 does not respect the convertEntities argument to the "
            "BeautifulSoup constructor. Entities are always converted "
            "to Unicode characters."
        )

    if "markupMassage" in kwargs:
        del kwargs["markupMassage"]
        warnings.warn(
            "BS4 does not respect the markupMassage argument to the "
            "BeautifulSoup constructor. The tree builder is responsible "
            "for any necessary markup massage."
        )

    if "smartQuotesTo" in kwargs:
        del kwargs["smartQuotesTo"]
        warnings.warn(
            "BS4 does not respect the smartQuotesTo argument to the "
            "BeautifulSoup constructor. Smart quotes are always converted "
            "to Unicode characters."
        )

    if "selfClosingTags" in kwargs:
        del kwargs["selfClosingTags"]
        warnings.warn(
            "Beautiful Soup 4 does not respect the selfClosingTags argument to the "
            "BeautifulSoup constructor. The tree builder is responsible "
            "for understanding self-closing tags."
        )

    if "isHTML" in kwargs:
        del kwargs["isHTML"]
        warnings.warn(
            "Beautiful Soup 4 does not respect the isHTML argument to the "
            "BeautifulSoup constructor. Suggest you use "
            "features='lxml' for HTML and features='lxml-xml' for "
            "XML."
        )

    def deprecated_argument(old_name: str, new_name: str) -> Optional[Any]:
        # Pop a renamed Beautiful Soup 3 argument out of kwargs,
        # warning about the rename; returns None if it wasn't passed.
        if old_name in kwargs:
            warnings.warn(
                'The "%s" argument to the BeautifulSoup constructor '
                'was renamed to "%s" in Beautiful Soup 4.0.0'
                % (old_name, new_name),
                DeprecationWarning,
                stacklevel=3,
            )
            return kwargs.pop(old_name)
        return None

    # NOTE: because of the short-circuiting `or`, the legacy name is
    # only looked up (and removed from kwargs) when the new-style
    # argument is falsy.
    parse_only = parse_only or deprecated_argument("parseOnlyThese", "parse_only")
    if parse_only is not None:
        # Issue a warning if we can tell in advance that
        # parse_only will exclude the entire tree.
        if parse_only.excludes_everything:
            warnings.warn(
                f"The given value for parse_only will exclude everything: {parse_only}",
                UserWarning,
                stacklevel=3,
            )

    from_encoding = from_encoding or deprecated_argument(
        "fromEncoding", "from_encoding"
    )

    if from_encoding and isinstance(markup, str):
        warnings.warn(
            "You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored."
        )
        from_encoding = None

    self.element_classes = element_classes or dict()

    # We need this information to track whether or not the builder
    # was specified well enough that we can omit the 'you need to
    # specify a parser' warning.
    original_builder = builder
    original_features = features

    builder_class: Optional[Type[TreeBuilder]] = None
    if isinstance(builder, type):
        # A builder class was passed in; it needs to be instantiated.
        builder_class = builder
        builder = None
    elif builder is None:
        # No builder at all: look one up in the registry by feature.
        if isinstance(features, str):
            features = [features]
        if features is None or len(features) == 0:
            features = self.DEFAULT_BUILDER_FEATURES
        possible_builder_class = builder_registry.lookup(*features)
        if possible_builder_class is None:
            raise FeatureNotFound(
                "Couldn't find a tree builder with the features you "
                "requested: %s. Do you need to install a parser library?"
                % ",".join(features)
            )
        builder_class = possible_builder_class

    # At this point either we have a TreeBuilder instance in
    # builder, or we have a builder_class that we can instantiate
    # with the remaining **kwargs.
    if builder is None:
        assert builder_class is not None
        builder = builder_class(**kwargs)
        if (
            not original_builder
            and not (
                original_features == builder.NAME
                or (
                    isinstance(original_features, str)
                    and original_features in builder.ALTERNATE_NAMES
                )
            )
            and markup
        ):
            # The user did not tell us which TreeBuilder to use,
            # and we had to guess. Issue a warning.
            if builder.is_xml:
                markup_type = "XML"
            else:
                markup_type = "HTML"

            # This code adapted from warnings.py so that we get the same line
            # of code as our warnings.warn() call gets, even if the answer is wrong
            # (as it may be in a multithreading situation).
            caller = None
            try:
                caller = sys._getframe(1)
            except ValueError:
                pass
            if caller:
                globals = caller.f_globals
                line_number = caller.f_lineno
            else:
                globals = sys.__dict__
                line_number = 1
            filename = globals.get("__file__")
            if filename:
                fnl = filename.lower()
                if fnl.endswith((".pyc", ".pyo")):
                    # Drop the final character so the name ends in ".py".
                    filename = filename[:-1]
            if filename:
                # If there is no filename at all, the user is most likely in a REPL,
                # and the warning is not necessary.
                values = dict(
                    filename=filename,
                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type,
                )
                warnings.warn(
                    GuessedAtParserWarning.MESSAGE % values,
                    GuessedAtParserWarning,
                    stacklevel=2,
                )
    else:
        # A ready-made TreeBuilder instance was supplied, so there is
        # no constructor left to hand the extra kwargs to.
        if kwargs:
            warnings.warn(
                "Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`."
            )

    self.builder = builder
    self.is_xml = builder.is_xml
    self.known_xml = self.is_xml
    self._namespaces = dict()
    self.parse_only = parse_only

    if hasattr(markup, "read"):  # It's a file-type object.
        markup = cast(io.IOBase, markup).read()
    elif not isinstance(markup, (bytes, str)) and not hasattr(markup, "__len__"):
        raise TypeError(
            f"Incoming markup is of an invalid type: {markup!r}. Markup must be a string, a bytestring, or an open filehandle."
        )
    elif isinstance(markup, Sized) and len(markup) <= 256 and (
        (isinstance(markup, bytes) and b"<" not in markup and b"\n" not in markup)
        or (isinstance(markup, str) and "<" not in markup and "\n" not in markup)
    ):
        # Issue warnings for a couple beginner problems
        # involving passing non-markup to Beautiful Soup.
        # Beautiful Soup will still parse the input as markup,
        # since that is sometimes the intended behavior.
        if not self._markup_is_url(markup):
            self._markup_resembles_filename(markup)

    # At this point we know markup is a string or bytestring. If
    # it was a file-type object, we've read from it.
    markup = cast(_RawMarkup, markup)

    # Try each candidate yielded by prepare_markup() in turn; the
    # loop target assigns the yielded 4-tuple straight onto self.
    rejections = []
    success = False
    for (
        self.markup,
        self.original_encoding,
        self.declared_html_encoding,
        self.contains_replacement_characters,
    ) in self.builder.prepare_markup(
        markup, from_encoding, exclude_encodings=exclude_encodings
    ):
        self.reset()
        self.builder.initialize_soup(self)
        try:
            self._feed()
            success = True
            break
        except ParserRejectedMarkup as e:
            # Remember why this candidate failed and try the next one.
            rejections.append(e)
            pass

    if not success:
        other_exceptions = [str(e) for e in rejections]
        raise ParserRejectedMarkup(
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n "
            + "\n ".join(other_exceptions)
        )

    # Clear out the markup and remove the builder's circular
    # reference to this object.
    self.markup = None
    self.builder.soup = None

494 

def copy_self(self) -> "BeautifulSoup":
    """Return an empty object of the same class that shares this
    object's TreeBuilder.

    No markup is associated with the new object; this is the first
    step of the deepcopy process.
    """
    soup_class = type(self)
    duplicate = soup_class("", None, self.builder)

    # The duplicate never parses the original document, so carry the
    # detected encoding over by hand.
    duplicate.original_encoding = self.original_encoding
    return duplicate

507 

def __getstate__(self) -> Dict[str, Any]:
    """Build the dictionary that represents this object when pickled.

    The parse tree is stored as a Unicode string under ``markup``
    rather than as a list of contents.
    """
    state = dict(self.__dict__)

    # Frequently a tree builder can't be pickled; in that case keep
    # only its class.
    if state.get("builder") is not None and not self.builder.picklable:
        state["builder"] = type(self.builder)

    # Store the contents as a Unicode string.
    state["contents"] = []
    state["markup"] = self.decode()

    # If _most_recent_element is present, it's left over from the
    # initial parse. It might not be picklable and isn't needed.
    state.pop("_most_recent_element", None)
    return state

523 

def __setstate__(self, state: Dict[str, Any]) -> None:
    """Restore this object from a pickled state dictionary, then
    rebuild the parse tree by feeding the stored markup again.
    """
    self.__dict__ = state

    # If necessary, restore the TreeBuilder.
    restored = self.builder
    if isinstance(restored, type):
        # Only the class was stored; instantiate it.
        restored = restored()
    elif not restored:
        # We don't know which builder was used to build this
        # parse tree, so use a default we know is always available.
        restored = HTMLParserTreeBuilder()
    self.builder = restored

    restored.soup = self
    self.reset()
    self._feed()

536 

@property
def _is_root(self) -> bool:
    """Yes, a BeautifulSoup object is the root of its parse tree. Used by the _root_object internal property.

    :return: Always True.
    """
    return True

541 

@classmethod
@_deprecated(
    replaced_by="nothing (private method, will be removed)", version="4.13.0"
)
def _decode_markup(cls, markup: _RawMarkup) -> str:
    """Return `markup` as a Unicode string.

    Bytestrings are decoded as UTF-8 with undecodable bytes replaced;
    anything else is returned untouched. This existed to make markup
    safe to pass into warnings.warn, a problem warnings.warn had back
    in 2010 but not anymore; it has been unused for a long time.
    """
    return markup.decode("utf-8", "replace") if isinstance(markup, bytes) else markup

558 

@classmethod
def _markup_is_url(cls, markup: _RawMarkup) -> bool:
    """Warn if the incoming markup looks like a URL rather than markup.

    Markup counts as URL-like when it starts with ``http:`` or
    ``https:`` and contains no space.

    :param markup: A string of markup.
    :return: Whether or not the markup resembled a URL
        closely enough to justify issuing a warning.
    """
    if isinstance(markup, bytes):
        looks_like_url = (
            markup.startswith((b"http:", b"https:")) and b" " not in markup
        )
    elif isinstance(markup, str):
        looks_like_url = (
            markup.startswith(("http:", "https:")) and " " not in markup
        )
    else:
        # Neither a string nor a bytestring: nothing to check.
        return False

    if looks_like_url:
        warnings.warn(
            MarkupResemblesLocatorWarning.URL_MESSAGE % dict(what="URL"),
            MarkupResemblesLocatorWarning,
            stacklevel=3,
        )
        return True
    return False

590 

@classmethod
def _markup_resembles_filename(cls, markup: _RawMarkup) -> bool:
    """Error-handling method to issue a warning if incoming markup
    resembles a filename.

    The markup is treated as filename-like when it ends with a common
    textual file extension and contains none of the features that are
    unusual in filenames (shell metacharacters, ``//``, two consecutive
    spaces, or a colon anywhere other than position 1).

    :param markup: A string of markup.
    :return: Whether or not the markup resembled a filename
        closely enough to justify issuing a warning.
    """
    markup_b: bytes

    # We're only checking ASCII characters, so rather than write
    # the same tests twice, convert Unicode to a bytestring and
    # operate on the bytestring.
    if isinstance(markup, str):
        markup_b = markup.encode("utf8")
    else:
        markup_b = markup

    # Step 1: does it end with a common textual file extension?
    extensions = (b".html", b".htm", b".xml", b".xhtml", b".txt")
    if not markup_b.lower().endswith(extensions):
        return False

    # Step 2: it _might_ be a file, but there are a few things
    # we can look for that aren't very common in filenames.

    # Characters that have special meaning to Unix shells. (< was
    # excluded before this method was called.)
    #
    # Many of these are also reserved characters that cannot
    # appear in Windows filenames.
    if any(byte in b"?*#&;>$|" for byte in markup_b):
        return False

    # Two consecutive forward slashes (as seen in a URL) or two
    # consecutive spaces (as seen in fixed-width data). A single
    # space is perfectly ordinary in a filename, so it must not
    # disqualify the markup on its own.
    #
    # (Paths to Windows network shares contain consecutive
    # backslashes, so checking that doesn't seem as helpful.)
    if b"//" in markup_b:
        return False
    if b"  " in markup_b:
        return False

    # A colon in any position other than position 1 (e.g. after a
    # Windows drive letter).
    if markup_b.startswith(b":"):
        return False
    colon_i = markup_b.rfind(b":")
    if colon_i not in (-1, 1):
        return False

    # Step 3: If it survived all of those checks, it's similar
    # enough to a file to justify issuing a warning.
    warnings.warn(
        MarkupResemblesLocatorWarning.FILENAME_MESSAGE % dict(what="filename"),
        MarkupResemblesLocatorWarning,
        stacklevel=3,
    )
    return True

657 

def _feed(self) -> None:
    """Run the tree builder over the previously stored markup, which
    creates a large number of Tag and NavigableString objects.
    """
    tree_builder = self.builder
    tree_builder.reset()

    document = self.markup
    if document is not None:
        tree_builder.feed(document)

    # Finish whatever string was in progress, then close every tag
    # that is still open, stopping at the root.
    self.endData()
    while True:
        open_tag = self.currentTag
        if open_tag is None or open_tag.name == self.ROOT_TAG_NAME:
            break
        self.popTag()

673 

def reset(self) -> None:
    """Reset this object to a state as though it had never parsed any
    markup.
    """
    # Re-run the Tag constructor on this object, using the root tag name.
    Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
    self.hidden = True
    self.builder.reset()
    # Clear all of the bookkeeping that is only used while parsing.
    self.current_data = []
    self.currentTag = None
    self.tagStack = []
    self.open_tag_counter = Counter()
    self.preserve_whitespace_tag_stack = []
    self.string_container_stack = []
    self._most_recent_element = None
    # The object itself goes on the bottom of the tag stack.
    self.pushTag(self)

689 

def new_tag(
    self,
    name: str,
    namespace: Optional[str] = None,
    nsprefix: Optional[str] = None,
    attrs: Optional[_RawAttributeValues] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    string: Optional[str] = None,
    **kwattrs: _RawAttributeValue,
) -> Tag:
    """Create a new Tag associated with this BeautifulSoup object.

    :param name: The name of the new Tag.
    :param namespace: The URI of the new Tag's XML namespace, if any.
    :param nsprefix: The prefix for the new Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values; can
        be used instead of ``kwattrs`` for attributes like 'class'
        that are reserved words in Python. Values given here are
        applied after ``kwattrs``.
    :param sourceline: The line number where this tag was
        (purportedly) found in its source document.
    :param sourcepos: The character position within ``sourceline`` where this
        tag was (purportedly) found.
    :param string: String content for the new Tag, if any.
    :param kwattrs: Keyword arguments for the new Tag's attribute values.
    """
    attributes = self.builder.attribute_dict_class(**kwattrs)
    if attrs is not None:
        attributes.update(attrs)

    # The user may have registered a replacement class for Tag. Assume
    # that it is either Tag or a subclass of Tag; if not, the user
    # brought type-unsafety upon themselves.
    tag_factory = cast(Type[Tag], self.element_classes.get(Tag, Tag))

    created = tag_factory(
        None,
        self.builder,
        name,
        namespace,
        nsprefix,
        attributes,
        sourceline=sourceline,
        sourcepos=sourcepos,
    )
    if string is not None:
        created.string = string
    return created

739 

def string_container(
    self, base_class: Optional[Type[NavigableString]] = None
) -> Type[NavigableString]:
    """Work out which class should be instantiated to hold a given
    kind of string.

    The result may be a built-in Beautiful Soup class or a custom
    class passed in to the BeautifulSoup constructor.
    """
    requested = base_class or NavigableString

    # The user may want us to use some other class (hopefully a
    # custom subclass) instead of the one we'd use normally.
    chosen = cast(
        Type[NavigableString], self.element_classes.get(requested, requested)
    )

    # A plain NavigableString may still need upgrading to a special
    # container class if we are currently inside a tag that calls
    # for one.
    if chosen is NavigableString and self.string_container_stack:
        enclosing_tag = self.string_container_stack[-1]
        chosen = self.builder.string_containers.get(enclosing_tag.name, chosen)
    return chosen

764 

def new_string(
    self, s: str, subclass: Optional[Type[NavigableString]] = None
) -> NavigableString:
    """Create a new `NavigableString` associated with this `BeautifulSoup`
    object.

    :param s: The string content of the `NavigableString`
    :param subclass: The subclass of `NavigableString`, if any, to
        use. If a document is being processed, an appropriate
        subclass for the current location in the document will
        be determined automatically.
    """
    return self.string_container(subclass)(s)

779 

def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """Always raises NotImplementedError.

    This method is part of the PageElement API, but `BeautifulSoup`
    doesn't implement it because there is nothing before or after it
    in the parse tree.
    """
    message = "BeautifulSoup objects don't support insert_before()."
    raise NotImplementedError(message)

787 

def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """Always raises NotImplementedError.

    This method is part of the PageElement API, but `BeautifulSoup`
    doesn't implement it because there is nothing before or after it
    in the parse tree.
    """
    message = "BeautifulSoup objects don't support insert_after()."
    raise NotImplementedError(message)

793 

def popTag(self) -> Optional[Tag]:
    """Internal method called by _popToTag when a tag is closed.

    Removes the innermost open tag from the tag stack and updates the
    parser's bookkeeping for it.

    :return: The tag that is current after the pop, or None if the
        tag stack was already empty.

    :meta private:
    """
    open_tags = self.tagStack
    if not open_tags:
        # Nothing to pop. This shouldn't happen.
        return None

    closed = open_tags.pop()
    counts = self.open_tag_counter
    if closed.name in counts:
        counts[closed.name] -= 1

    # If the closed tag is on top of either tracking stack, it leaves
    # that stack as well.
    for tracking_stack in (
        self.preserve_whitespace_tag_stack,
        self.string_container_stack,
    ):
        if tracking_stack and closed == tracking_stack[-1]:
            tracking_stack.pop()

    if open_tags:
        self.currentTag = open_tags[-1]
    return self.currentTag

816 

def pushTag(self, tag: Tag) -> None:
    """Internal method called by handle_starttag when a tag is opened.

    Appends the tag to the contents of the current tag (if any), makes
    it the current tag, and updates the parser's bookkeeping.

    :meta private:
    """
    enclosing = self.currentTag
    if enclosing is not None:
        enclosing.contents.append(tag)

    self.tagStack.append(tag)
    self.currentTag = tag

    tag_name = tag.name
    if tag_name != self.ROOT_TAG_NAME:
        self.open_tag_counter[tag_name] += 1

    tree_builder = self.builder
    if tag_name in tree_builder.preserve_whitespace_tags:
        self.preserve_whitespace_tag_stack.append(tag)
    if tag_name in tree_builder.string_containers:
        self.string_container_stack.append(tag)

833 

def endData(self, containerClass: Optional[Type[NavigableString]] = None) -> None:
    """Method called by the TreeBuilder when the end of a data segment
    occurs.

    Does nothing if no data has been collected since the last call.

    :param containerClass: The class to use when incorporating the
        data segment into the parse tree.

    :meta private:
    """
    if not self.current_data:
        return

    text = "".join(self.current_data)

    # If whitespace is not preserved, and this string contains
    # nothing but ASCII spaces, replace it with a single space
    # or newline.
    if not self.preserve_whitespace_tag_stack and all(
        char in self.ASCII_SPACES for char in text
    ):
        text = "\n" if "\n" in text else " "

    # Reset the data collector.
    self.current_data = []

    # Should we add this string to the tree at all?
    strainer = self.parse_only
    if (
        strainer
        and len(self.tagStack) <= 1
        and not strainer.allow_string_creation(text)
    ):
        return

    string_class = self.string_container(containerClass)
    self.object_was_parsed(string_class(text))

874 

def object_was_parsed(
    self,
    o: PageElement,
    parent: Optional[Tag] = None,
    most_recent_element: Optional[PageElement] = None,
) -> None:
    """Method called by the TreeBuilder to integrate an object into the
    parse tree.

    :param o: The element to add to the tree.
    :param parent: The tag to append ``o`` to; defaults to the
        current tag.
    :param most_recent_element: The element to use as ``o``'s previous
        element; defaults to the most recently parsed element.

    :meta private:
    """
    if parent is None:
        parent = self.currentTag
    assert parent is not None

    previous_element: Optional[PageElement] = (
        most_recent_element
        if most_recent_element is not None
        else self._most_recent_element
    )

    if isinstance(o, Tag):
        # A Tag may already carry linkage of its own; keep it.
        next_element = o.next_element
        next_sibling = o.next_sibling
        previous_sibling = o.previous_sibling
        if previous_element is None:
            previous_element = o.previous_element
    else:
        next_element = previous_sibling = next_sibling = None

    # This has to be decided before the new object is linked in.
    parent_already_linked = parent.next_element is not None

    o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

    self._most_recent_element = o
    parent.contents.append(o)

    # Check if we are inserting into an already parsed node.
    if parent_already_linked:
        self._linkage_fixer(parent)

913 

def _linkage_fixer(self, el: Tag) -> None:
    """Make sure linkage of this fragment is sound.

    Repairs the element and sibling links around the last child of
    ``el`` (the object that was just appended to it).

    :param el: A tag whose ``contents`` is non-empty; its last entry is
        treated as the newly appended child.
    """
    first = el.contents[0]
    child = el.contents[-1]
    descendant: PageElement = child

    if child is first and el.parent is not None:
        # Parent should be linked to first child
        el.next_element = child
        # Whatever used to precede the child no longer leads to it.
        prev_el = child.previous_element
        if prev_el is not None and prev_el is not el:
            prev_el.next_element = None
        # First child should be linked to the parent, and no previous siblings.
        child.previous_element = el
        child.previous_sibling = None

    # We have no sibling as we've been appended as the last.
    child.next_sibling = None

    # If the child is a tag with contents, the element that ends this
    # fragment is the child's last descendant rather than the child itself.
    if isinstance(child, Tag) and child.contents:
        # _last_descendant is typed as returning Optional[PageElement],
        # but the value can't be None here, because child is a Tag
        # which we know has contents.
        descendant = cast(PageElement, child._last_descendant(False))

    # As the final step, link the last descendant. It should lead to the
    # next sibling of el, or of the nearest ancestor that has one. It has
    # no next sibling of its own.
    descendant.next_element = None
    descendant.next_sibling = None

    target: Optional[Tag] = el
    while target is not None:
        following = target.next_sibling
        if following is not None:
            descendant.next_element = following
            # Keep the two directions consistent: the element right
            # before `following` is the last descendant, which is not
            # the same object as `child` when `child` has contents.
            following.previous_element = descendant
            break
        target = target.parent

957 

def _popToTag(
    self, name: str, nsprefix: Optional[str] = None, inclusivePop: bool = True
) -> Optional[Tag]:
    """Pop the tag stack back to the most recent tag with the given name.

    Nothing is popped when ``name`` is the root tag name, or when
    ``self.open_tag_counter`` has no non-zero entry for ``name``.

    :param name: Pop up to the most recent tag with this name.
    :param nsprefix: The namespace prefix that goes with `name`.
    :param inclusivePop: If this is false, pop the tag stack up to but
        *not* including the most recent instance of the given tag.
    :return: The last tag that was popped, or None if nothing was popped.

    :meta private:
    """
    if name == self.ROOT_TAG_NAME:
        # The BeautifulSoup object itself can never be popped.
        return None

    last_popped = None

    # Walk from the top of the stack down to, but not including, slot 0,
    # which holds the root object.
    for depth in reversed(range(1, len(self.tagStack))):
        if not self.open_tag_counter.get(name):
            break
        candidate = self.tagStack[depth]
        is_match = name == candidate.name and nsprefix == candidate.prefix
        if is_match and not inclusivePop:
            # Stop just above the matching tag and leave it open.
            break
        last_popped = self.popTag()
        if is_match:
            break

    return last_popped

994 

def handle_starttag(
    self,
    name: str,
    namespace: Optional[str],
    nsprefix: Optional[str],
    attrs: _RawAttributeValues,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    namespaces: Optional[Dict[str, str]] = None,
) -> Optional[Tag]:
    """Create a tag for a start tag reported by the tree builder.

    Any buffered text is flushed first. The new tag is then linked after
    the most recently parsed element and pushed onto the tag stack.

    :param name: Name of the tag.
    :param namespace: Namespace of the tag, handed to the tag constructor.
    :param nsprefix: Namespace prefix for the tag.
    :param attrs: A dictionary of attribute values. Note that
        attribute values are expected to be simple strings; processing
        of multi-valued attributes such as "class" comes later.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param namespaces: A dictionary of all namespace prefix mappings
        currently in scope in the document.

    If this method returns None, the tag was rejected by an active
    `ElementFilter`. You should proceed as if the tag had not occurred
    in the document. For instance, if this was a self-closing tag,
    don't call handle_endtag.

    :meta private:
    """
    self.endData()

    # The filter is only consulted while at most the root is on the stack.
    if self.parse_only and len(self.tagStack) <= 1:
        if not self.parse_only.allow_tag_creation(nsprefix, name, attrs):
            return None

    # Assume that this is either Tag or a subclass of Tag. If not,
    # the user brought type-unsafety upon themselves.
    element_class = cast(Type[Tag], self.element_classes.get(Tag, Tag))
    new_tag = element_class(
        self,
        self.builder,
        name,
        namespace,
        nsprefix,
        attrs,
        self.currentTag,
        self._most_recent_element,
        sourceline=sourceline,
        sourcepos=sourcepos,
        namespaces=namespaces,
    )
    if new_tag is None:
        return None

    predecessor = self._most_recent_element
    if predecessor is not None:
        predecessor.next_element = new_tag
    self._most_recent_element = new_tag
    self.pushTag(new_tag)
    return new_tag

1060 

def handle_endtag(self, name: str, nsprefix: Optional[str] = None) -> None:
    """React to an end tag reported by the tree builder.

    :param name: Name of the tag being closed.
    :param nsprefix: Namespace prefix for the tag.

    :meta private:
    """
    # Flush buffered text before the stack changes, so the text is
    # handled while the tag being closed is still open.
    self.endData()

    # Close everything up to and including the most recent matching tag.
    self._popToTag(name, nsprefix)

1072 

def handle_data(self, data: str) -> None:
    """Buffer a chunk of text reported by the tree builder.

    The chunk is only queued on ``self.current_data``; nothing is added
    to the tree here.

    :param data: The text that was encountered.

    :meta private:
    """
    pending = self.current_data
    pending.append(data)

1080 

def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: Union[Formatter, str] = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
    **kwargs: Any,
) -> str:
    """Returns a string representation of the parse tree
    as a full HTML or XML document.

    For an XML document the result starts with an XML declaration. The
    declaration names ``eventual_encoding`` unless that value is None or
    one of ``PYTHON_SPECIFIC_ENCODINGS``.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The ``formatter`` decides what a
        'level' means, in terms of spaces or other characters
        output.) This is used internally in recursive calls while
        pretty-printing. A bool is still accepted for backward
        compatibility (True means 0, False means None) but triggers
        a DeprecationWarning.
    :param eventual_encoding: The encoding of the final document.
        If this is None, the document will be a Unicode string.
    :param formatter: Either a `Formatter` object, or a string naming one of
        the standard formatters.
    :param iterator: The iterator to use when navigating over the
        parse tree. This is only used by `Tag.decode_contents` and
        you probably won't need to use it.
    :param kwargs: Only the deprecated ``pretty_print`` keyword is
        accepted, and it triggers a DeprecationWarning.
    :raises TypeError: If any keyword argument other than
        ``pretty_print`` is passed while ``indent_level`` is not a bool.
    """
    if self.is_xml:
        # Print the XML declaration
        encoding_part = ""
        declared_encoding: Optional[str] = eventual_encoding
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            # This is a special Python encoding; it can't actually
            # go into an XML document because it means nothing
            # outside of Python.
            declared_encoding = None
        if declared_encoding is not None:
            encoding_part = ' encoding="%s"' % declared_encoding
        prefix = '<?xml version="1.0"%s?>\n' % encoding_part
    else:
        prefix = ""

    # Prior to 4.13.0, the first argument to this method was a
    # bool called pretty_print, which gave the method a different
    # signature from its superclass implementation, Tag.decode.
    #
    # The signatures of the two methods now match, but just in
    # case someone is still passing a boolean in as the first
    # argument to this method (or a keyword argument with the old
    # name), we can handle it and put out a DeprecationWarning.
    warning: Optional[str] = None
    if isinstance(indent_level, bool):
        indent_level = 0 if indent_level else None
        warning = f"As of 4.13.0, the first argument to BeautifulSoup.decode has been changed from bool to int, to match Tag.decode. Pass in a value of {indent_level} instead."
    else:
        pretty_print = kwargs.pop("pretty_print", None)
        if kwargs:
            # An explicit check rather than an assert, so that stray
            # keyword arguments are still rejected under `python -O`.
            raise TypeError(
                "decode() got unexpected keyword argument(s): %s"
                % ", ".join(sorted(kwargs))
            )
        if pretty_print is not None:
            if pretty_print is True:
                indent_level = 0
            elif pretty_print is False:
                indent_level = None
            warning = f"As of 4.13.0, the pretty_print argument to BeautifulSoup.decode has been removed, to match Tag.decode. Pass in a value of indent_level={indent_level} instead."

    if warning:
        warnings.warn(warning, DeprecationWarning, stacklevel=2)
    return prefix + super(BeautifulSoup, self).decode(
        indent_level, eventual_encoding, formatter, iterator
    )

1153 

1154 

# Short aliases for getting started quickly, e.g. 'from bs4 import _soup'.
_s = _soup = BeautifulSoup

1158 

1159 

class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated way of getting an XML-parsing `BeautifulSoup` object."""

    def __init__(self, *args: Any, **kwargs: Any):
        """Warn about the deprecation, then build the object with features="xml".

        All arguments are forwarded to the `BeautifulSoup` constructor;
        any ``features`` value supplied by the caller is overridden.
        """
        # Emitted directly from __init__ so that stacklevel=2 points at
        # the code that instantiated the class.
        warnings.warn(
            "The BeautifulStoneSoup class was deprecated in version 4.0.0. Instead of using "
            'it, pass features="xml" into the BeautifulSoup constructor.',
            DeprecationWarning,
            stacklevel=2,
        )
        kwargs.update(features="xml")
        super().__init__(*args, **kwargs)

1172 

1173 

# When executed directly, read a document from standard input and print
# the prettified result.
if __name__ == "__main__":
    import sys

    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())