Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/__init__.py: 16%

354 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". 

2 

3http://www.crummy.com/software/BeautifulSoup/ 

4 

5Beautiful Soup uses a pluggable XML or HTML parser to parse a 

6(possibly invalid) document into a tree representation. Beautiful Soup 

7provides methods and Pythonic idioms that make it easy to navigate, 

8search, and modify the parse tree. 

9 

10Beautiful Soup works with Python 3.6 and up. It works better if lxml 

11and/or html5lib is installed. 

12 

13For more than you ever wanted to know about Beautiful Soup, see the 

14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 

15""" 

16 

# Package metadata.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.12.2"
__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

# A star-import of this module exposes only the BeautifulSoup class.
__all__ = ['BeautifulSoup']

24 

25from collections import Counter 

26import os 

27import re 

28import sys 

29import traceback 

30import warnings 

31 

# Before importing anything else from the package, fail with an
# actionable message if the interpreter is Python 2.
if sys.version_info[0] < 3:
    raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')

36 

37from .builder import ( 

38 builder_registry, 

39 ParserRejectedMarkup, 

40 XMLParsedAsHTMLWarning, 

41 HTMLParserTreeBuilder 

42) 

43from .dammit import UnicodeDammit 

44from .element import ( 

45 CData, 

46 Comment, 

47 CSS, 

48 DEFAULT_OUTPUT_ENCODING, 

49 Declaration, 

50 Doctype, 

51 NavigableString, 

52 PageElement, 

53 ProcessingInstruction, 

54 PYTHON_SPECIFIC_ENCODINGS, 

55 ResultSet, 

56 Script, 

57 Stylesheet, 

58 SoupStrainer, 

59 Tag, 

60 TemplateString, 

61 ) 

62 

63# Define some custom warnings. 

class GuessedAtParserWarning(UserWarning):
    """Warning category used when BeautifulSoup had to choose a parser
    itself, typically because the constructor was not told which one
    to use.
    """

68 

class MarkupResemblesLocatorWarning(UserWarning):
    """Warning category used when the 'markup' handed to BeautifulSoup
    looks like a resource locator (a URL, or a path to a file on disk)
    rather than an actual document.
    """

74 

75 

class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

    Most of the methods you'll call on a BeautifulSoup object are inherited from
    PageElement or Tag.

    Internally, this class defines the basic interface called by the
    tree builders when converting an HTML/XML document into a data
    structure. The interface abstracts away the differences between
    parsers. To write a new tree builder, you'll need to understand
    these methods as a whole.

    These methods will be called by the BeautifulSoup constructor:
      * reset()
      * feed(markup)

    The tree builder may call these methods from its feed() implementation:
      * handle_starttag(name, attrs) # See note about return value
      * handle_endtag(name)
      * handle_data(data) # Appends to the current data node
      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
    # a Tag with a .name. This name makes it clear the BeautifulSoup
    # object isn't a real markup tag. The tag-stack methods compare
    # against this name to recognize the document root.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # A string containing all ASCII whitespace characters (space,
    # newline, tab, form feed, carriage return), used in endData() to
    # detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    # Template for the GuessedAtParserWarning message. The constructor
    # fills in markup_type, parser, line_number and filename.
    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

121 

122 def __init__(self, markup="", features=None, builder=None, 

123 parse_only=None, from_encoding=None, exclude_encodings=None, 

124 element_classes=None, **kwargs): 

125 """Constructor. 

126 

127 :param markup: A string or a file-like object representing 

128 markup to be parsed. 

129 

130 :param features: Desirable features of the parser to be 

131 used. This may be the name of a specific parser ("lxml", 

132 "lxml-xml", "html.parser", or "html5lib") or it may be the 

133 type of markup to be used ("html", "html5", "xml"). It's 

134 recommended that you name a specific parser, so that 

135 Beautiful Soup gives you the same results across platforms 

136 and virtual environments. 

137 

138 :param builder: A TreeBuilder subclass to instantiate (or 

139 instance to use) instead of looking one up based on 

140 `features`. You only need to use this if you've implemented a 

141 custom TreeBuilder. 

142 

143 :param parse_only: A SoupStrainer. Only parts of the document 

144 matching the SoupStrainer will be considered. This is useful 

145 when parsing part of a document that would otherwise be too 

146 large to fit into memory. 

147 

148 :param from_encoding: A string indicating the encoding of the 

149 document to be parsed. Pass this in if Beautiful Soup is 

150 guessing wrongly about the document's encoding. 

151 

152 :param exclude_encodings: A list of strings indicating 

153 encodings known to be wrong. Pass this in if you don't know 

154 the document's encoding but you know Beautiful Soup's guess is 

155 wrong. 

156 

157 :param element_classes: A dictionary mapping BeautifulSoup 

158 classes like Tag and NavigableString, to other classes you'd 

159 like to be instantiated instead as the parse tree is 

160 built. This is useful for subclassing Tag or NavigableString 

161 to modify default behavior. 

162 

163 :param kwargs: For backwards compatibility purposes, the 

164 constructor accepts certain keyword arguments used in 

165 Beautiful Soup 3. None of these arguments do anything in 

166 Beautiful Soup 4; they will result in a warning and then be 

167 ignored. 

168  

169 Apart from this, any keyword arguments passed into the 

170 BeautifulSoup constructor are propagated to the TreeBuilder 

171 constructor. This makes it possible to configure a 

172 TreeBuilder by passing in arguments, not just by saying which 

173 one to use. 

174 """ 

175 if 'convertEntities' in kwargs: 

176 del kwargs['convertEntities'] 

177 warnings.warn( 

178 "BS4 does not respect the convertEntities argument to the " 

179 "BeautifulSoup constructor. Entities are always converted " 

180 "to Unicode characters.") 

181 

182 if 'markupMassage' in kwargs: 

183 del kwargs['markupMassage'] 

184 warnings.warn( 

185 "BS4 does not respect the markupMassage argument to the " 

186 "BeautifulSoup constructor. The tree builder is responsible " 

187 "for any necessary markup massage.") 

188 

189 if 'smartQuotesTo' in kwargs: 

190 del kwargs['smartQuotesTo'] 

191 warnings.warn( 

192 "BS4 does not respect the smartQuotesTo argument to the " 

193 "BeautifulSoup constructor. Smart quotes are always converted " 

194 "to Unicode characters.") 

195 

196 if 'selfClosingTags' in kwargs: 

197 del kwargs['selfClosingTags'] 

198 warnings.warn( 

199 "BS4 does not respect the selfClosingTags argument to the " 

200 "BeautifulSoup constructor. The tree builder is responsible " 

201 "for understanding self-closing tags.") 

202 

203 if 'isHTML' in kwargs: 

204 del kwargs['isHTML'] 

205 warnings.warn( 

206 "BS4 does not respect the isHTML argument to the " 

207 "BeautifulSoup constructor. Suggest you use " 

208 "features='lxml' for HTML and features='lxml-xml' for " 

209 "XML.") 

210 

211 def deprecated_argument(old_name, new_name): 

212 if old_name in kwargs: 

213 warnings.warn( 

214 'The "%s" argument to the BeautifulSoup constructor ' 

215 'has been renamed to "%s."' % (old_name, new_name), 

216 DeprecationWarning, stacklevel=3 

217 ) 

218 return kwargs.pop(old_name) 

219 return None 

220 

221 parse_only = parse_only or deprecated_argument( 

222 "parseOnlyThese", "parse_only") 

223 

224 from_encoding = from_encoding or deprecated_argument( 

225 "fromEncoding", "from_encoding") 

226 

227 if from_encoding and isinstance(markup, str): 

228 warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") 

229 from_encoding = None 

230 

231 self.element_classes = element_classes or dict() 

232 

233 # We need this information to track whether or not the builder 

234 # was specified well enough that we can omit the 'you need to 

235 # specify a parser' warning. 

236 original_builder = builder 

237 original_features = features 

238 

239 if isinstance(builder, type): 

240 # A builder class was passed in; it needs to be instantiated. 

241 builder_class = builder 

242 builder = None 

243 elif builder is None: 

244 if isinstance(features, str): 

245 features = [features] 

246 if features is None or len(features) == 0: 

247 features = self.DEFAULT_BUILDER_FEATURES 

248 builder_class = builder_registry.lookup(*features) 

249 if builder_class is None: 

250 raise FeatureNotFound( 

251 "Couldn't find a tree builder with the features you " 

252 "requested: %s. Do you need to install a parser library?" 

253 % ",".join(features)) 

254 

255 # At this point either we have a TreeBuilder instance in 

256 # builder, or we have a builder_class that we can instantiate 

257 # with the remaining **kwargs. 

258 if builder is None: 

259 builder = builder_class(**kwargs) 

260 if not original_builder and not ( 

261 original_features == builder.NAME or 

262 original_features in builder.ALTERNATE_NAMES 

263 ) and markup: 

264 # The user did not tell us which TreeBuilder to use, 

265 # and we had to guess. Issue a warning. 

266 if builder.is_xml: 

267 markup_type = "XML" 

268 else: 

269 markup_type = "HTML" 

270 

271 # This code adapted from warnings.py so that we get the same line 

272 # of code as our warnings.warn() call gets, even if the answer is wrong 

273 # (as it may be in a multithreading situation). 

274 caller = None 

275 try: 

276 caller = sys._getframe(1) 

277 except ValueError: 

278 pass 

279 if caller: 

280 globals = caller.f_globals 

281 line_number = caller.f_lineno 

282 else: 

283 globals = sys.__dict__ 

284 line_number= 1 

285 filename = globals.get('__file__') 

286 if filename: 

287 fnl = filename.lower() 

288 if fnl.endswith((".pyc", ".pyo")): 

289 filename = filename[:-1] 

290 if filename: 

291 # If there is no filename at all, the user is most likely in a REPL, 

292 # and the warning is not necessary. 

293 values = dict( 

294 filename=filename, 

295 line_number=line_number, 

296 parser=builder.NAME, 

297 markup_type=markup_type 

298 ) 

299 warnings.warn( 

300 self.NO_PARSER_SPECIFIED_WARNING % values, 

301 GuessedAtParserWarning, stacklevel=2 

302 ) 

303 else: 

304 if kwargs: 

305 warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") 

306 

307 self.builder = builder 

308 self.is_xml = builder.is_xml 

309 self.known_xml = self.is_xml 

310 self._namespaces = dict() 

311 self.parse_only = parse_only 

312 

313 if hasattr(markup, 'read'): # It's a file-type object. 

314 markup = markup.read() 

315 elif len(markup) <= 256 and ( 

316 (isinstance(markup, bytes) and not b'<' in markup) 

317 or (isinstance(markup, str) and not '<' in markup) 

318 ): 

319 # Issue warnings for a couple beginner problems 

320 # involving passing non-markup to Beautiful Soup. 

321 # Beautiful Soup will still parse the input as markup, 

322 # since that is sometimes the intended behavior. 

323 if not self._markup_is_url(markup): 

324 self._markup_resembles_filename(markup) 

325 

326 rejections = [] 

327 success = False 

328 for (self.markup, self.original_encoding, self.declared_html_encoding, 

329 self.contains_replacement_characters) in ( 

330 self.builder.prepare_markup( 

331 markup, from_encoding, exclude_encodings=exclude_encodings)): 

332 self.reset() 

333 self.builder.initialize_soup(self) 

334 try: 

335 self._feed() 

336 success = True 

337 break 

338 except ParserRejectedMarkup as e: 

339 rejections.append(e) 

340 pass 

341 

342 if not success: 

343 other_exceptions = [str(e) for e in rejections] 

344 raise ParserRejectedMarkup( 

345 "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) 

346 ) 

347 

348 # Clear out the markup and remove the builder's circular 

349 # reference to this object. 

350 self.markup = None 

351 self.builder.soup = None 

352 

353 def _clone(self): 

354 """Create a new BeautifulSoup object with the same TreeBuilder, 

355 but not associated with any markup. 

356 

357 This is the first step of the deepcopy process. 

358 """ 

359 clone = type(self)("", None, self.builder) 

360 

361 # Keep track of the encoding of the original document, 

362 # since we won't be parsing it again. 

363 clone.original_encoding = self.original_encoding 

364 return clone 

365 

366 def __getstate__(self): 

367 # Frequently a tree builder can't be pickled. 

368 d = dict(self.__dict__) 

369 if 'builder' in d and d['builder'] is not None and not self.builder.picklable: 

370 d['builder'] = type(self.builder) 

371 # Store the contents as a Unicode string. 

372 d['contents'] = [] 

373 d['markup'] = self.decode() 

374 

375 # If _most_recent_element is present, it's a Tag object left 

376 # over from initial parse. It might not be picklable and we 

377 # don't need it. 

378 if '_most_recent_element' in d: 

379 del d['_most_recent_element'] 

380 return d 

381 

382 def __setstate__(self, state): 

383 # If necessary, restore the TreeBuilder by looking it up. 

384 self.__dict__ = state 

385 if isinstance(self.builder, type): 

386 self.builder = self.builder() 

387 elif not self.builder: 

388 # We don't know which builder was used to build this 

389 # parse tree, so use a default we know is always available. 

390 self.builder = HTMLParserTreeBuilder() 

391 self.builder.soup = self 

392 self.reset() 

393 self._feed() 

394 return state 

395 

396 

397 @classmethod 

398 def _decode_markup(cls, markup): 

399 """Ensure `markup` is bytes so it's safe to send into warnings.warn. 

400 

401 TODO: warnings.warn had this problem back in 2010 but it might not 

402 anymore. 

403 """ 

404 if isinstance(markup, bytes): 

405 decoded = markup.decode('utf-8', 'replace') 

406 else: 

407 decoded = markup 

408 return decoded 

409 

410 @classmethod 

411 def _markup_is_url(cls, markup): 

412 """Error-handling method to raise a warning if incoming markup looks 

413 like a URL. 

414 

415 :param markup: A string. 

416 :return: Whether or not the markup resembles a URL 

417 closely enough to justify a warning. 

418 """ 

419 if isinstance(markup, bytes): 

420 space = b' ' 

421 cant_start_with = (b"http:", b"https:") 

422 elif isinstance(markup, str): 

423 space = ' ' 

424 cant_start_with = ("http:", "https:") 

425 else: 

426 return False 

427 

428 if any(markup.startswith(prefix) for prefix in cant_start_with): 

429 if not space in markup: 

430 warnings.warn( 

431 'The input looks more like a URL than markup. You may want to use' 

432 ' an HTTP client like requests to get the document behind' 

433 ' the URL, and feed that document to Beautiful Soup.', 

434 MarkupResemblesLocatorWarning, 

435 stacklevel=3 

436 ) 

437 return True 

438 return False 

439 

440 @classmethod 

441 def _markup_resembles_filename(cls, markup): 

442 """Error-handling method to raise a warning if incoming markup 

443 resembles a filename. 

444 

445 :param markup: A bytestring or string. 

446 :return: Whether or not the markup resembles a filename 

447 closely enough to justify a warning. 

448 """ 

449 path_characters = '/\\' 

450 extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt'] 

451 if isinstance(markup, bytes): 

452 path_characters = path_characters.encode("utf8") 

453 extensions = [x.encode('utf8') for x in extensions] 

454 filelike = False 

455 if any(x in markup for x in path_characters): 

456 filelike = True 

457 else: 

458 lower = markup.lower() 

459 if any(lower.endswith(ext) for ext in extensions): 

460 filelike = True 

461 if filelike: 

462 warnings.warn( 

463 'The input looks more like a filename than markup. You may' 

464 ' want to open this file and pass the filehandle into' 

465 ' Beautiful Soup.', 

466 MarkupResemblesLocatorWarning, stacklevel=3 

467 ) 

468 return True 

469 return False 

470 

471 def _feed(self): 

472 """Internal method that parses previously set markup, creating a large 

473 number of Tag and NavigableString objects. 

474 """ 

475 # Convert the document to Unicode. 

476 self.builder.reset() 

477 

478 self.builder.feed(self.markup) 

479 # Close out any unfinished strings and close all the open tags. 

480 self.endData() 

481 while self.currentTag.name != self.ROOT_TAG_NAME: 

482 self.popTag() 

483 

484 def reset(self): 

485 """Reset this object to a state as though it had never parsed any 

486 markup. 

487 """ 

488 Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) 

489 self.hidden = 1 

490 self.builder.reset() 

491 self.current_data = [] 

492 self.currentTag = None 

493 self.tagStack = [] 

494 self.open_tag_counter = Counter() 

495 self.preserve_whitespace_tag_stack = [] 

496 self.string_container_stack = [] 

497 self._most_recent_element = None 

498 self.pushTag(self) 

499 

500 def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, 

501 sourceline=None, sourcepos=None, **kwattrs): 

502 """Create a new Tag associated with this BeautifulSoup object. 

503 

504 :param name: The name of the new Tag. 

505 :param namespace: The URI of the new Tag's XML namespace, if any. 

506 :param prefix: The prefix for the new Tag's XML namespace, if any. 

507 :param attrs: A dictionary of this Tag's attribute values; can 

508 be used instead of `kwattrs` for attributes like 'class' 

509 that are reserved words in Python. 

510 :param sourceline: The line number where this tag was 

511 (purportedly) found in its source document. 

512 :param sourcepos: The character position within `sourceline` where this 

513 tag was (purportedly) found. 

514 :param kwattrs: Keyword arguments for the new Tag's attribute values. 

515 

516 """ 

517 kwattrs.update(attrs) 

518 return self.element_classes.get(Tag, Tag)( 

519 None, self.builder, name, namespace, nsprefix, kwattrs, 

520 sourceline=sourceline, sourcepos=sourcepos 

521 ) 

522 

523 def string_container(self, base_class=None): 

524 container = base_class or NavigableString 

525 

526 # There may be a general override of NavigableString. 

527 container = self.element_classes.get( 

528 container, container 

529 ) 

530 

531 # On top of that, we may be inside a tag that needs a special 

532 # container class. 

533 if self.string_container_stack and container is NavigableString: 

534 container = self.builder.string_containers.get( 

535 self.string_container_stack[-1].name, container 

536 ) 

537 return container 

538 

539 def new_string(self, s, subclass=None): 

540 """Create a new NavigableString associated with this BeautifulSoup 

541 object. 

542 """ 

543 container = self.string_container(subclass) 

544 return container(s) 

545 

546 def insert_before(self, *args): 

547 """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement 

548 it because there is nothing before or after it in the parse tree. 

549 """ 

550 raise NotImplementedError("BeautifulSoup objects don't support insert_before().") 

551 

552 def insert_after(self, *args): 

553 """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement 

554 it because there is nothing before or after it in the parse tree. 

555 """ 

556 raise NotImplementedError("BeautifulSoup objects don't support insert_after().") 

557 

558 def popTag(self): 

559 """Internal method called by _popToTag when a tag is closed.""" 

560 tag = self.tagStack.pop() 

561 if tag.name in self.open_tag_counter: 

562 self.open_tag_counter[tag.name] -= 1 

563 if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: 

564 self.preserve_whitespace_tag_stack.pop() 

565 if self.string_container_stack and tag == self.string_container_stack[-1]: 

566 self.string_container_stack.pop() 

567 #print("Pop", tag.name) 

568 if self.tagStack: 

569 self.currentTag = self.tagStack[-1] 

570 return self.currentTag 

571 

572 def pushTag(self, tag): 

573 """Internal method called by handle_starttag when a tag is opened.""" 

574 #print("Push", tag.name) 

575 if self.currentTag is not None: 

576 self.currentTag.contents.append(tag) 

577 self.tagStack.append(tag) 

578 self.currentTag = self.tagStack[-1] 

579 if tag.name != self.ROOT_TAG_NAME: 

580 self.open_tag_counter[tag.name] += 1 

581 if tag.name in self.builder.preserve_whitespace_tags: 

582 self.preserve_whitespace_tag_stack.append(tag) 

583 if tag.name in self.builder.string_containers: 

584 self.string_container_stack.append(tag) 

585 

586 def endData(self, containerClass=None): 

587 """Method called by the TreeBuilder when the end of a data segment 

588 occurs. 

589 """ 

590 if self.current_data: 

591 current_data = ''.join(self.current_data) 

592 # If whitespace is not preserved, and this string contains 

593 # nothing but ASCII spaces, replace it with a single space 

594 # or newline. 

595 if not self.preserve_whitespace_tag_stack: 

596 strippable = True 

597 for i in current_data: 

598 if i not in self.ASCII_SPACES: 

599 strippable = False 

600 break 

601 if strippable: 

602 if '\n' in current_data: 

603 current_data = '\n' 

604 else: 

605 current_data = ' ' 

606 

607 # Reset the data collector. 

608 self.current_data = [] 

609 

610 # Should we add this string to the tree at all? 

611 if self.parse_only and len(self.tagStack) <= 1 and \ 

612 (not self.parse_only.text or \ 

613 not self.parse_only.search(current_data)): 

614 return 

615 

616 containerClass = self.string_container(containerClass) 

617 o = containerClass(current_data) 

618 self.object_was_parsed(o) 

619 

620 def object_was_parsed(self, o, parent=None, most_recent_element=None): 

621 """Method called by the TreeBuilder to integrate an object into the parse tree.""" 

622 if parent is None: 

623 parent = self.currentTag 

624 if most_recent_element is not None: 

625 previous_element = most_recent_element 

626 else: 

627 previous_element = self._most_recent_element 

628 

629 next_element = previous_sibling = next_sibling = None 

630 if isinstance(o, Tag): 

631 next_element = o.next_element 

632 next_sibling = o.next_sibling 

633 previous_sibling = o.previous_sibling 

634 if previous_element is None: 

635 previous_element = o.previous_element 

636 

637 fix = parent.next_element is not None 

638 

639 o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) 

640 

641 self._most_recent_element = o 

642 parent.contents.append(o) 

643 

644 # Check if we are inserting into an already parsed node. 

645 if fix: 

646 self._linkage_fixer(parent) 

647 

648 def _linkage_fixer(self, el): 

649 """Make sure linkage of this fragment is sound.""" 

650 

651 first = el.contents[0] 

652 child = el.contents[-1] 

653 descendant = child 

654 

655 if child is first and el.parent is not None: 

656 # Parent should be linked to first child 

657 el.next_element = child 

658 # We are no longer linked to whatever this element is 

659 prev_el = child.previous_element 

660 if prev_el is not None and prev_el is not el: 

661 prev_el.next_element = None 

662 # First child should be linked to the parent, and no previous siblings. 

663 child.previous_element = el 

664 child.previous_sibling = None 

665 

666 # We have no sibling as we've been appended as the last. 

667 child.next_sibling = None 

668 

669 # This index is a tag, dig deeper for a "last descendant" 

670 if isinstance(child, Tag) and child.contents: 

671 descendant = child._last_descendant(False) 

672 

673 # As the final step, link last descendant. It should be linked 

674 # to the parent's next sibling (if found), else walk up the chain 

675 # and find a parent with a sibling. It should have no next sibling. 

676 descendant.next_element = None 

677 descendant.next_sibling = None 

678 target = el 

679 while True: 

680 if target is None: 

681 break 

682 elif target.next_sibling is not None: 

683 descendant.next_element = target.next_sibling 

684 target.next_sibling.previous_element = child 

685 break 

686 target = target.parent 

687 

688 def _popToTag(self, name, nsprefix=None, inclusivePop=True): 

689 """Pops the tag stack up to and including the most recent 

690 instance of the given tag. 

691 

692 If there are no open tags with the given name, nothing will be 

693 popped. 

694 

695 :param name: Pop up to the most recent tag with this name. 

696 :param nsprefix: The namespace prefix that goes with `name`. 

697 :param inclusivePop: It this is false, pops the tag stack up 

698 to but *not* including the most recent instqance of the 

699 given tag. 

700 

701 """ 

702 #print("Popping to %s" % name) 

703 if name == self.ROOT_TAG_NAME: 

704 # The BeautifulSoup object itself can never be popped. 

705 return 

706 

707 most_recently_popped = None 

708 

709 stack_size = len(self.tagStack) 

710 for i in range(stack_size - 1, 0, -1): 

711 if not self.open_tag_counter.get(name): 

712 break 

713 t = self.tagStack[i] 

714 if (name == t.name and nsprefix == t.prefix): 

715 if inclusivePop: 

716 most_recently_popped = self.popTag() 

717 break 

718 most_recently_popped = self.popTag() 

719 

720 return most_recently_popped 

721 

722 def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, 

723 sourcepos=None, namespaces=None): 

724 """Called by the tree builder when a new tag is encountered. 

725 

726 :param name: Name of the tag. 

727 :param nsprefix: Namespace prefix for the tag. 

728 :param attrs: A dictionary of attribute values. 

729 :param sourceline: The line number where this tag was found in its 

730 source document. 

731 :param sourcepos: The character position within `sourceline` where this 

732 tag was found. 

733 :param namespaces: A dictionary of all namespace prefix mappings  

734 currently in scope in the document. 

735 

736 If this method returns None, the tag was rejected by an active 

737 SoupStrainer. You should proceed as if the tag had not occurred 

738 in the document. For instance, if this was a self-closing tag, 

739 don't call handle_endtag. 

740 """ 

741 # print("Start tag %s: %s" % (name, attrs)) 

742 self.endData() 

743 

744 if (self.parse_only and len(self.tagStack) <= 1 

745 and (self.parse_only.text 

746 or not self.parse_only.search_tag(name, attrs))): 

747 return None 

748 

749 tag = self.element_classes.get(Tag, Tag)( 

750 self, self.builder, name, namespace, nsprefix, attrs, 

751 self.currentTag, self._most_recent_element, 

752 sourceline=sourceline, sourcepos=sourcepos, 

753 namespaces=namespaces 

754 ) 

755 if tag is None: 

756 return tag 

757 if self._most_recent_element is not None: 

758 self._most_recent_element.next_element = tag 

759 self._most_recent_element = tag 

760 self.pushTag(tag) 

761 return tag 

762 

763 def handle_endtag(self, name, nsprefix=None): 

764 """Called by the tree builder when an ending tag is encountered. 

765 

766 :param name: Name of the tag. 

767 :param nsprefix: Namespace prefix for the tag. 

768 """ 

769 #print("End tag: " + name) 

770 self.endData() 

771 self._popToTag(name, nsprefix) 

772 

773 def handle_data(self, data): 

774 """Called by the tree builder when a chunk of textual data is encountered.""" 

775 self.current_data.append(data) 

776 

777 def decode(self, pretty_print=False, 

778 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 

779 formatter="minimal", iterator=None): 

780 """Returns a string or Unicode representation of the parse tree 

781 as an HTML or XML document. 

782 

783 :param pretty_print: If this is True, indentation will be used to 

784 make the document more readable. 

785 :param eventual_encoding: The encoding of the final document. 

786 If this is None, the document will be a Unicode string. 

787 """ 

788 if self.is_xml: 

789 # Print the XML declaration 

790 encoding_part = '' 

791 if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS: 

792 # This is a special Python encoding; it can't actually 

793 # go into an XML document because it means nothing 

794 # outside of Python. 

795 eventual_encoding = None 

796 if eventual_encoding != None: 

797 encoding_part = ' encoding="%s"' % eventual_encoding 

798 prefix = '<?xml version="1.0"%s?>\n' % encoding_part 

799 else: 

800 prefix = '' 

801 if not pretty_print: 

802 indent_level = None 

803 else: 

804 indent_level = 0 

805 return prefix + super(BeautifulSoup, self).decode( 

806 indent_level, eventual_encoding, formatter, iterator) 

807 

# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
# Both names refer to the BeautifulSoup class itself.
_s = BeautifulSoup
_soup = BeautifulSoup

811 

class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force features='xml', warn that this class is deprecated,
        then defer to the BeautifulSoup constructor.
        """
        kwargs['features'] = 'xml'
        message = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.'
        )
        # stacklevel=2 attributes the warning to the code that
        # instantiated this class.
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        super().__init__(*args, **kwargs)

823 

824 

class StopParsing(Exception):
    """Raised by a TreeBuilder that is unable to continue parsing."""

828 

class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no parser offering
    the requested features can be found.
    """

834 

835 

# If this file is run as a script, act as an HTML pretty-printer:
# parse the markup on standard input and print its prettified form.
# `sys` is already imported at the top of the module.
if __name__ == '__main__':
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())

840 print((soup.prettify()))