Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/__init__.py: 73%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

339 statements  

1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". 

2 

3http://www.crummy.com/software/BeautifulSoup/ 

4 

5Beautiful Soup uses a pluggable XML or HTML parser to parse a 

6(possibly invalid) document into a tree representation. Beautiful Soup 

7provides methods and Pythonic idioms that make it easy to navigate, 

8search, and modify the parse tree. 

9 

10Beautiful Soup works with Python 3.5 and up. It works better if lxml 

11and/or html5lib is installed. 

12 

13For more than you ever wanted to know about Beautiful Soup, see the 

14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 

15""" 

16 

17__author__ = "Leonard Richardson (leonardr@segfault.org)" 

18__version__ = "4.11.1" 

19__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson" 

20# Use of this source code is governed by the MIT license. 

21__license__ = "MIT" 

22 

23__all__ = ['BeautifulSoup'] 

24 

25from collections import Counter 

26import os 

27import re 

28import sys 

29import traceback 

30import warnings 

31 

# Before importing anything else from this package, fail fast with a
# helpful message if the interpreter is Python 2.
if sys.version_info[0] < 3:
    raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')

36 

37from .builder import ( 

38 builder_registry, 

39 ParserRejectedMarkup, 

40 XMLParsedAsHTMLWarning, 

41) 

42from .dammit import UnicodeDammit 

43from .element import ( 

44 CData, 

45 Comment, 

46 DEFAULT_OUTPUT_ENCODING, 

47 Declaration, 

48 Doctype, 

49 NavigableString, 

50 PageElement, 

51 ProcessingInstruction, 

52 PYTHON_SPECIFIC_ENCODINGS, 

53 ResultSet, 

54 Script, 

55 Stylesheet, 

56 SoupStrainer, 

57 Tag, 

58 TemplateString, 

59 ) 

60 

61# Define some custom warnings. 

class GuessedAtParserWarning(UserWarning):
    """Warning category used when BeautifulSoup had to pick a parser itself.

    This usually means no parser was named in the constructor call.
    """

66 

class MarkupResemblesLocatorWarning(UserWarning):
    """Warning category used when the 'markup' handed to BeautifulSoup
    looks like a resource locator (a URL, or a path to a file on disk)
    rather than like actual markup.
    """

72 

73 

class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

    Most of the methods you'll call on a BeautifulSoup object are inherited from
    PageElement or Tag.

    Internally, this class defines the basic interface called by the
    tree builders when converting an HTML/XML document into a data
    structure. The interface abstracts away the differences between
    parsers. To write a new tree builder, you'll need to understand
    these methods as a whole.

    These methods will be called by the BeautifulSoup constructor:
      * reset()
      * feed(markup)

    The tree builder may call these methods from its feed() implementation:
      * handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      * handle_endtag(name)
      * handle_data(data) # Appends to the current data node
      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
    # a Tag with a .name. This name makes it clear the BeautifulSoup
    # object isn't a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # A string containing all ASCII whitespace characters, used in
    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

    # Beautiful Soup 3 constructor arguments that are accepted only for
    # backwards compatibility. Each one is removed from **kwargs and
    # answered with the paired warning message; the order here is the
    # order in which the warnings are issued.
    _IGNORED_BS3_ARGUMENTS = (
        ('convertEntities',
         "BS4 does not respect the convertEntities argument to the "
         "BeautifulSoup constructor. Entities are always converted "
         "to Unicode characters."),
        ('markupMassage',
         "BS4 does not respect the markupMassage argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for any necessary markup massage."),
        ('smartQuotesTo',
         "BS4 does not respect the smartQuotesTo argument to the "
         "BeautifulSoup constructor. Smart quotes are always converted "
         "to Unicode characters."),
        ('selfClosingTags',
         "BS4 does not respect the selfClosingTags argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for understanding self-closing tags."),
        ('isHTML',
         "BS4 does not respect the isHTML argument to the "
         "BeautifulSoup constructor. Suggest you use "
         "features='lxml' for HTML and features='lxml-xml' for "
         "XML."),
    )

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
         markup to be parsed.

        :param features: Desirable features of the parser to be
         used. This may be the name of a specific parser ("lxml",
         "lxml-xml", "html.parser", or "html5lib") or it may be the
         type of markup to be used ("html", "html5", "xml"). It's
         recommended that you name a specific parser, so that
         Beautiful Soup gives you the same results across platforms
         and virtual environments.

        :param builder: A TreeBuilder subclass to instantiate (or
         instance to use) instead of looking one up based on
         `features`. You only need to use this if you've implemented a
         custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
         matching the SoupStrainer will be considered. This is useful
         when parsing part of a document that would otherwise be too
         large to fit into memory.

        :param from_encoding: A string indicating the encoding of the
         document to be parsed. Pass this in if Beautiful Soup is
         guessing wrongly about the document's encoding.

        :param exclude_encodings: A list of strings indicating
         encodings known to be wrong. Pass this in if you don't know
         the document's encoding but you know Beautiful Soup's guess is
         wrong.

        :param element_classes: A dictionary mapping BeautifulSoup
         classes like Tag and NavigableString, to other classes you'd
         like to be instantiated instead as the parse tree is
         built. This is useful for subclassing Tag or NavigableString
         to modify default behavior.

        :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
         Beautiful Soup 4; they will result in a warning and then be
         ignored.

         Apart from this, any keyword arguments passed into the
         BeautifulSoup constructor are propagated to the TreeBuilder
         constructor. This makes it possible to configure a
         TreeBuilder by passing in arguments, not just by saying which
         one to use.

        :raise FeatureNotFound: If no tree builder with the requested
         features is registered.
        :raise ParserRejectedMarkup: If every markup/encoding candidate
         offered by the builder was rejected by the parser.
        """
        # Strip out the Beautiful Soup 3 arguments that no longer do
        # anything, warning about each one that was supplied.
        for old_name, message in self._IGNORED_BS3_ARGUMENTS:
            if old_name in kwargs:
                del kwargs[old_name]
                warnings.warn(message)

        def deprecated_argument(old_name, new_name):
            """Pop a renamed BS3 argument out of kwargs, warning if present."""
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name),
                    DeprecationWarning
                )
                return kwargs.pop(old_name)
            return None

        # The `or` short-circuits: the old spelling is only consulted
        # (and popped) when the new spelling was not given.
        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, str):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        self.element_classes = element_classes or dict()

        # We need this information to track whether or not the builder
        # was specified well enough that we can omit the 'you need to
        # specify a parser' warning.
        original_builder = builder
        original_features = features

        if isinstance(builder, type):
            # A builder class was passed in; it needs to be instantiated.
            builder_class = builder
            builder = None
        elif builder is None:
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))

        # At this point either we have a TreeBuilder instance in
        # builder, or we have a builder_class that we can instantiate
        # with the remaining **kwargs.
        if builder is None:
            builder = builder_class(**kwargs)
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
            ) and markup:
                # The user did not tell us which TreeBuilder to use,
                # and we had to guess. Issue a warning.
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"

                # This code adapted from warnings.py so that we get the same line
                # of code as our warnings.warn() call gets, even if the answer is wrong
                # (as it may be in a multithreading situation).
                caller = None
                try:
                    caller = sys._getframe(1)
                except ValueError:
                    pass
                if caller is not None:
                    caller_globals = caller.f_globals
                    line_number = caller.f_lineno
                else:
                    caller_globals = sys.__dict__
                    line_number = 1
                filename = caller_globals.get('__file__')
                if filename:
                    fnl = filename.lower()
                    if fnl.endswith((".pyc", ".pyo")):
                        # Report the source file rather than the
                        # compiled file: drop the trailing 'c'/'o'.
                        filename = filename[:-1]
                if filename:
                    # If there is no filename at all, the user is most likely in a REPL,
                    # and the warning is not necessary.
                    values = dict(
                        filename=filename,
                        line_number=line_number,
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
                    warnings.warn(
                        self.NO_PARSER_SPECIFIED_WARNING % values,
                        GuessedAtParserWarning, stacklevel=2
                    )
        else:
            if kwargs:
                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self._namespaces = dict()
        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and b'<' not in markup)
                or (isinstance(markup, str) and '<' not in markup)
        ):
            # Issue warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # since that is sometimes the intended behavior.
            if not self._markup_is_url(markup):
                self._markup_resembles_filename(markup)

        # Try each candidate the builder offers until the parser
        # accepts one. The loop assigns straight onto the instance
        # attributes that _feed() reads.
        rejections = []
        success = False
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            self.builder.initialize_soup(self)
            try:
                self._feed()
                success = True
                break
            except ParserRejectedMarkup as e:
                rejections.append(e)

        if not success:
            other_exceptions = [str(e) for e in rejections]
            raise ParserRejectedMarkup(
                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
            )

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def __copy__(self):
        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
        copy = type(self)(
            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
        )

        # Although we encoded the tree to UTF-8, that may not have
        # been the encoding of the original markup. Set the copy's
        # .original_encoding to reflect the original object's
        # .original_encoding.
        copy.original_encoding = self.original_encoding
        return copy

    def __getstate__(self):
        """Return a copy of the instance dictionary suitable for pickling.

        The 'builder' entry is replaced with None when the builder
        reports that it is not picklable.
        """
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if d.get('builder') is not None and not self.builder.picklable:
            d['builder'] = None
        return d

    @classmethod
    def _decode_markup(cls, markup):
        """Ensure `markup` is a Unicode string so it's safe to send into
        warnings.warn.

        Bytestrings are decoded as UTF-8, with undecodable bytes
        replaced; anything else is returned unchanged.

        TODO: warnings.warn had this problem back in 2010 but it might not
        anymore.

        :param markup: A bytestring or string.
        :return: The decoded markup.
        """
        if isinstance(markup, bytes):
            return markup.decode('utf-8', 'replace')
        return markup

    @classmethod
    def _markup_is_url(cls, markup):
        """Error-handling method to raise a warning if incoming markup looks
        like a URL.

        :param markup: A bytestring or string. Any other type is never
         considered to be a URL.
        :return: Whether or not the markup resembles a URL
         closely enough to justify a warning.
        """
        if isinstance(markup, bytes):
            space = b' '
            cant_start_with = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            cant_start_with = ("http:", "https:")
        else:
            return False

        # startswith() accepts a tuple of candidate prefixes. Input
        # containing a space is not treated as a URL.
        if markup.startswith(cant_start_with) and space not in markup:
            warnings.warn(
                'The input looks more like a URL than markup. You may want to use'
                ' an HTTP client like requests to get the document behind'
                ' the URL, and feed that document to Beautiful Soup.',
                MarkupResemblesLocatorWarning
            )
            return True
        return False

    @classmethod
    def _markup_resembles_filename(cls, markup):
        """Error-handling method to raise a warning if incoming markup
        resembles a filename.

        :param markup: A bytestring or string.
        :return: Whether or not the markup resembles a filename
         closely enough to justify a warning.
        """
        path_characters = '/\\'
        extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
        if isinstance(markup, bytes):
            # Compare bytes against bytes.
            path_characters = path_characters.encode("utf8")
            extensions = [x.encode('utf8') for x in extensions]
        filelike = False
        if any(x in markup for x in path_characters):
            filelike = True
        else:
            lower = markup.lower()
            if any(lower.endswith(ext) for ext in extensions):
                filelike = True
        if filelike:
            warnings.warn(
                'The input looks more like a filename than markup. You may'
                ' want to open this file and pass the filehandle into'
                ' Beautiful Soup.',
                MarkupResemblesLocatorWarning
            )
            return True
        return False

    def _feed(self):
        """Internal method that parses previously set markup, creating a large
        number of Tag and NavigableString objects.
        """
        # Put the builder back into its initial state, then hand it
        # the markup.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reset this object to a state as though it had never parsed any
        markup.
        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.open_tag_counter = Counter()
        self.preserve_whitespace_tag_stack = []
        self.string_container_stack = []
        # The BeautifulSoup object itself is the bottom of the tag stack.
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
                sourceline=None, sourcepos=None, **kwattrs):
        """Create a new Tag associated with this BeautifulSoup object.

        :param name: The name of the new Tag.
        :param namespace: The URI of the new Tag's XML namespace, if any.
        :param nsprefix: The prefix for the new Tag's XML namespace, if any.
        :param attrs: A dictionary of this Tag's attribute values; can
         be used instead of `kwattrs` for attributes like 'class'
         that are reserved words in Python. When the same attribute
         appears in both, the value from `attrs` wins.
        :param sourceline: The line number where this tag was
         (purportedly) found in its source document.
        :param sourcepos: The character position within `sourceline` where this
         tag was (purportedly) found.
        :param kwattrs: Keyword arguments for the new Tag's attribute values.

        """
        # `attrs` is only read here, never mutated, so the shared
        # default dictionary is not modified.
        kwattrs.update(attrs)
        return self.element_classes.get(Tag, Tag)(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )

    def string_container(self, base_class=None):
        """Choose the class to instantiate for a string.

        :param base_class: The string class requested by the caller;
         NavigableString is used when this is not given.
        :return: The class to use, after applying any override from
         `element_classes` and any tag-specific container registered
         with the builder.
        """
        container = base_class or NavigableString

        # There may be a general override of NavigableString.
        container = self.element_classes.get(
            container, container
        )

        # On top of that, we may be inside a tag that needs a special
        # container class.
        if self.string_container_stack and container is NavigableString:
            container = self.builder.string_containers.get(
                self.string_container_stack[-1].name, container
            )
        return container

    def new_string(self, s, subclass=None):
        """Create a new NavigableString associated with this BeautifulSoup
        object.

        :param s: The content of the new string.
        :param subclass: The string class to request from
         string_container(); see that method for how the final class
         is chosen.
        """
        container = self.string_container(subclass)
        return container(s)

    def insert_before(self, *args):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, *args):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Internal method called by _popToTag when a tag is closed.

        :return: The tag that is current once the pop is done.
        """
        tag = self.tagStack.pop()
        if tag.name in self.open_tag_counter:
            self.open_tag_counter[tag.name] -= 1
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.string_container_stack and tag == self.string_container_stack[-1]:
            self.string_container_stack.pop()
        #print("Pop", tag.name)
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Internal method called by handle_starttag when a tag is opened."""
        #print("Push", tag.name)
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        # The root pseudo-tag is not counted as an open tag.
        if tag.name != self.ROOT_TAG_NAME:
            self.open_tag_counter[tag.name] += 1
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)
        if tag.name in self.builder.string_containers:
            self.string_container_stack.append(tag)

    def endData(self, containerClass=None):
        """Method called by the TreeBuilder when the end of a data segment
        occurs.

        :param containerClass: The string class to request from
         string_container() for the finished data node.
        """
        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                ascii_spaces = self.ASCII_SPACES
                if all(ch in ascii_spaces for ch in current_data):
                    current_data = '\n' if '\n' in current_data else ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            containerClass = self.string_container(containerClass)
            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Method called by the TreeBuilder to integrate an object into the parse tree.

        :param o: The object to add to the tree.
        :param parent: The element to append `o` to; defaults to the
         current tag.
        :param most_recent_element: The element to use as `o`'s
         previous element; defaults to the most recently parsed one.
        """
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
            previous_element = most_recent_element
        else:
            previous_element = self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # Carry over whatever links the tag already has.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if previous_element is None:
                previous_element = o.previous_element

        # Must be evaluated before `o` is attached to `parent`.
        fix = parent.next_element is not None

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        # Check if we are inserting into an already parsed node.
        if fix:
            self._linkage_fixer(parent)

    def _linkage_fixer(self, el):
        """Make sure linkage of this fragment is sound.

        :param el: The element whose last child was just appended.
        """

        first = el.contents[0]
        child = el.contents[-1]
        descendant = child

        if child is first and el.parent is not None:
            # Parent should be linked to first child
            el.next_element = child
            # We are no longer linked to whatever this element is
            prev_el = child.previous_element
            if prev_el is not None and prev_el is not el:
                prev_el.next_element = None
            # First child should be linked to the parent, and no previous siblings.
            child.previous_element = el
            child.previous_sibling = None

        # We have no sibling as we've been appended as the last.
        child.next_sibling = None

        # This index is a tag, dig deeper for a "last descendant"
        if isinstance(child, Tag) and child.contents:
            descendant = child._last_descendant(False)

        # As the final step, link last descendant. It should be linked
        # to the parent's next sibling (if found), else walk up the chain
        # and find a parent with a sibling. It should have no next sibling.
        descendant.next_element = None
        descendant.next_sibling = None
        target = el
        while target is not None:
            if target.next_sibling is not None:
                descendant.next_element = target.next_sibling
                target.next_sibling.previous_element = child
                break
            target = target.parent

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag.

        If there are no open tags with the given name, nothing will be
        popped.

        :param name: Pop up to the most recent tag with this name.
        :param nsprefix: The namespace prefix that goes with `name`.
        :param inclusivePop: If this is false, pops the tag stack up
          to but *not* including the most recent instance of the
          given tag.
        :return: The value returned by the last popTag() call made, or
          None if nothing was popped.
        """
        #print("Popping to %s" % name)
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk down from the top of the stack; index 0 (the root) is
        # never visited.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            if not self.open_tag_counter.get(name):
                # No tag with this name is open any more.
                break
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                        sourcepos=None, namespaces=None):
        """Called by the tree builder when a new tag is encountered.

        :param name: Name of the tag.
        :param namespace: The namespace of the tag; passed to the Tag
            constructor.
        :param nsprefix: Namespace prefix for the tag.
        :param attrs: A dictionary of attribute values.
        :param sourceline: The line number where this tag was found in its
            source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was found.
        :param namespaces: A dictionary of all namespace prefix mappings
            currently in scope in the document.

        If this method returns None, the tag was rejected by an active
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        # print("Start tag %s: %s" % (name, attrs))
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = self.element_classes.get(Tag, Tag)(
            self, self.builder, name, namespace, nsprefix, attrs,
            self.currentTag, self._most_recent_element,
            sourceline=sourceline, sourcepos=sourcepos,
            namespaces=namespaces
        )
        if tag is None:
            return tag
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Called by the tree builder when an ending tag is encountered.

        :param name: Name of the tag.
        :param nsprefix: Namespace prefix for the tag.
        """
        #print("End tag: " + name)
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Called by the tree builder when a chunk of textual data is encountered."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of the parse tree
            as an HTML or XML document.

        :param pretty_print: If this is True, indentation will be used to
            make the document more readable.
        :param eventual_encoding: The encoding of the final document.
            If this is None, the document will be a Unicode string.
        :param formatter: Passed through unchanged to Tag.decode().
        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
                # This is a special Python encoding; it can't actually
                # go into an XML document because it means nothing
                # outside of Python.
                eventual_encoding = None
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)

779 

# Short aliases for quick interactive use, e.g. 'from bs4 import _soup'.
_s = _soup = BeautifulSoup

783 

class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force features='xml', issue a DeprecationWarning, then
        delegate to the BeautifulSoup constructor.
        """
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.'
        )
        # Any 'features' value supplied by the caller is overwritten.
        kwargs['features'] = 'xml'
        warnings.warn(deprecation_notice, DeprecationWarning)
        super().__init__(*args, **kwargs)

795 

796 

class StopParsing(Exception):
    """Raised by a TreeBuilder when it cannot continue parsing."""

800 

class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no parser offering
    the requested features can be found.
    """

806 

807 

# When executed as a script, behave as an HTML pretty-printer:
# read markup from standard input and print the prettified document.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())

812 print((soup.prettify()))