Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/builder/__init__.py: 48%

205 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

3 

from collections import defaultdict
import itertools
import re
import sys
import warnings

from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    RubyParenthesisString,
    RubyTextString,
    Stylesheet,
    Script,
    TemplateString,
    nonwhitespace_re,
)

19 

# Names exported by `from bs4.builder import *`.
__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
]

26 

# Some useful features for a TreeBuilder to have. A TreeBuilder
# advertises these strings in its `features` list.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'

34 

class XMLParsedAsHTMLWarning(UserWarning):
    """The warning issued when an HTML parser is used to parse
    XML that is not XHTML.
    """

    # The text shown to the user when this warning is issued.
    MESSAGE = """It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor."""

40 

41 

class TreeBuilderRegistry(object):
    """A way of looking up TreeBuilder subclasses by their name or by desired
    features.
    """

    def __init__(self):
        # Maps a feature string to the builders advertising it, most
        # recently registered first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder, most recently registered first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features.

        :param treebuilder_class: A subclass of TreeBuilder. Its .features
           attribute should list its features.
        """
        # Insert at the front so that later registrations take
        # precedence over earlier ones.
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Look up a TreeBuilder subclass with the desired features.

        :param features: A list of features to look for. If none are
            provided, the most recently registered TreeBuilder subclass
            will be used. A feature that no registered builder
            advertises is ignored rather than ruling every builder out.
        :return: A TreeBuilder subclass, or None if there's no
            registered subclass with all the recognized features.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        candidates = None
        candidate_set = None
        for feature in features:
            # Use .get() so that asking about an unknown feature does
            # not create an empty entry in the defaultdict.
            with_feature = self.builders_for_feature.get(feature, [])
            if not with_feature:
                continue
            if candidates is None:
                # The first recognized feature determines the priority
                # order of the candidates.
                candidates = with_feature
                candidate_set = set(with_feature)
            else:
                # Eliminate any candidates that don't have this feature.
                candidate_set &= set(with_feature)

        if candidate_set is None:
            # None of the requested features was recognized.
            return None

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None

106 

# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()

110 

class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False

    # When this is None, a tag will be considered an empty-element
    # tag when and only when it has no contents.
    empty_element_tags = None

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)

    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()

    # The textual contents of tags with these names should be
    # instantiated with some class other than NavigableString.
    DEFAULT_STRING_CONTAINERS = {}

    # Sentinel meaning "the caller did not pass this argument". It is
    # needed because None is a meaningful value for some arguments.
    USE_DEFAULT = object()

    # Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS = False

    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
                 store_line_numbers=USE_DEFAULT,
                 string_containers=USE_DEFAULT,
    ):
        """Constructor.

        :param multi_valued_attributes: If this is set to None, the
         TreeBuilder will not turn any values for attributes like
         'class' into lists. Setting this to a dictionary will
         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
         for an example.

         Internally, these are called "CDATA list attributes", but that
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.

        :param preserve_whitespace_tags: A list of tags to treat
         the way <pre> tags are treated in HTML. Tags in this list
         are immune from pretty-printing; their contents will always be
         output as-is.

        :param string_containers: A dictionary mapping tag names to
         the classes that should be instantiated to contain the textual
         contents of those tags. The default is to use NavigableString
         for every tag, no matter what the name. You can override the
         default by changing DEFAULT_STRING_CONTAINERS.

        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't
         keep track of this information, then setting store_line_numbers=True
         will do nothing.
        """
        self.soup = None

        # The sentinel is always compared by identity, so that an
        # argument whose __eq__ is permissive (or does not return a
        # plain bool) is never mistaken for "not provided".
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes

        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags

        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers

        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers

    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self):
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        :return: True if `empty_element_tags` is None or contains
         `tag_name`; False otherwise.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        This method is not implemented in TreeBuilder; it must be
        implemented in subclasses.

        :return: None.
        :raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding. NOTE: This argument is not used by the
            calling code and can probably be removed.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.
        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.

        :param fragment: A string -- fragment of HTML.
        :return: A string -- a full HTML document. This base
         implementation returns the fragment unchanged.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
           Any appropriate attribute values will be modified in place.
        :return: The same `attrs` object that was passed in.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            # Attributes that are multi-valued on every tag, and those
            # that are multi-valued only on this particular tag.
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            for attr, value in list(attrs.items()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    if isinstance(value, str):
                        values = nonwhitespace_re.findall(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list.  If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs

332 

class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    This is not currently used for anything, but it demonstrates
    how a simple TreeBuilder would work.
    """

    def feed(self, markup):
        """Not implemented; always raises NotImplementedError."""
        raise NotImplementedError()

    def close(self):
        """Do nothing."""
        pass

    def startElement(self, name, attrs):
        """Forward the start of an element to the soup object.

        :param name: The name of the element.
        :param attrs: A mapping of attribute names to values. A key
            that is a tuple is reduced to its second item (for
            namespace-aware SAX, presumably the local name of a
            (namespace, name) pair); any other key is used as-is.
        """
        # Only tuple keys are indexed. Indexing a plain string name
        # would silently keep just its second character.
        attrs = {
            (key[1] if isinstance(key, tuple) else key): value
            for key, value in attrs.items()
        }
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        """Forward the end of an element to the soup object.

        :param name: The name of the element.
        """
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Handle the start of a namespaced element like a plain one."""
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Handle the end of a namespaced element like a plain one."""
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Do nothing."""
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        """Do nothing."""
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Forward character data to the soup object.

        :param content: The character data.
        """
        self.soup.handle_data(content)

    def startDocument(self):
        """Do nothing."""
        pass

    def endDocument(self):
        """Do nothing."""
        pass

381 

382 

class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    empty_element_tags = {
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',

        # These are from earlier versions of HTML and are removed in HTML5.
        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer',
    }

    # The HTML standard defines these as block-level elements. Beautiful
    # Soup does not treat these elements differently from other elements,
    # but it may do so eventually, and this information is available if
    # you need to use it.
    block_elements = {"address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"}

    # These HTML tags need special treatment so they can be
    # represented by a string class other than NavigableString.
    #
    # For some of these tags, it's because the HTML standard defines
    # an unusual content model for them. I made this list by going
    # through the HTML spec
    # (https://html.spec.whatwg.org/#metadata-content) and looking for
    # "metadata content" elements that can contain strings.
    #
    # The Ruby tags (<rt> and <rp>) are here despite being normal
    # "phrasing content" tags, because the content they contain is
    # qualitatively different from other text in the document, and it
    # can be useful to be able to distinguish it.
    #
    # TODO: Arguably <noscript> could go here but it seems
    # qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS = {
        'rt' : RubyTextString,
        'rp' : RubyParenthesisString,
        'style': Stylesheet,
        'script': Script,
        'template': TemplateString,
    }

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
    }

    DEFAULT_PRESERVE_WHITESPACE_TAGS = {'pre', 'textarea'}

    def set_up_substitutions(self, tag):
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup as one
        encoding, but exit in a different encoding, and the <meta> tag
        needs to be changed to reflect this.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            return True

        if (content is not None and http_equiv is not None
                and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
            # This branch performs a substitution too, so report it.
            return True

        return False

495 

class DetectsXMLParsedAsHTML(object):
    """A mixin class for any class (a TreeBuilder, or some class used by a
    TreeBuilder) that's in a position to detect whether an XML
    document is being incorrectly parsed as HTML, and issue an
    appropriate warning.

    This requires being able to observe an incoming processing
    instruction that might be an XML declaration, and also able to
    observe tags as they're opened. If you can't do that for a given
    TreeBuilder, there's a less reliable implementation based on
    examining the raw markup.
    """

    # Regular expression for seeing if markup has an <html> tag. At
    # most one character other than a space or '+' may sit between
    # '<' and 'html'; making that character optional is what lets a
    # plain "<html" match.
    LOOKS_LIKE_HTML = re.compile(r"<[^ +]?html", re.I)
    LOOKS_LIKE_HTML_B = re.compile(rb"<[^ +]?html", re.I)

    XML_PREFIX = '<?xml'
    XML_PREFIX_B = b'<?xml'

    @classmethod
    def warn_if_markup_looks_like_xml(cls, markup):
        """Perform a check on some markup to see if it looks like XML
        that's not XHTML. If so, issue a warning.

        This is much less reliable than doing the check while parsing,
        but some of the tree builders can't do that.

        :param markup: A string or bytestring of markup, or None.
        :return: True if the markup looks like non-XHTML XML, False
         otherwise.
        """
        if markup is None:
            return False

        # Use the constants that match the type of the markup.
        if isinstance(markup, bytes):
            prefix = cls.XML_PREFIX_B
            looks_like_html = cls.LOOKS_LIKE_HTML_B
        else:
            prefix = cls.XML_PREFIX
            looks_like_html = cls.LOOKS_LIKE_HTML

        # Only the first 500 characters are searched for an <html> tag.
        if (markup.startswith(prefix)
                and not looks_like_html.search(markup[:500])):
            cls._warn()
            return True
        return False

    @classmethod
    def _warn(cls):
        """Issue a warning about XML being parsed as HTML."""
        warnings.warn(
            XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning
        )

    def _initialize_xml_detector(self):
        """Call this method before parsing a document."""
        self._first_processing_instruction = None
        self._root_tag = None

    def _document_might_be_xml(self, processing_instruction):
        """Call this method when encountering an XML declaration, or a
        "processing instruction" that might be an XML declaration.

        :param processing_instruction: The text of the processing
         instruction.
        """
        if (self._first_processing_instruction is not None
                or self._root_tag is not None):
            # The document has already started. Don't bother checking
            # anymore.
            return

        self._first_processing_instruction = processing_instruction

        # We won't know until we encounter the first tag whether or
        # not this is actually a problem.

    def _root_tag_encountered(self, name):
        """Call this when you encounter the document's root tag.

        This is where we actually check whether an XML document is
        being incorrectly parsed as HTML, and issue the warning.

        :param name: The name of the root tag.
        """
        if self._root_tag is not None:
            # This method was incorrectly called multiple times. Do
            # nothing.
            return

        self._root_tag = name
        if (name != 'html' and self._first_processing_instruction is not None
                and self._first_processing_instruction.lower().startswith('xml ')):
            # We encountered an XML declaration and then a tag other
            # than 'html'. This is a reliable indicator that a
            # non-XHTML document is being parsed as HTML.
            self._warn()

587 

588 

def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in `module.__all__` that refers to a TreeBuilder
    subclass is set as an attribute of this module, appended to this
    module's `__all__`, and registered with `builder_registry`.

    :param module: A module that defines `__all__`.
    """
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when its first argument is not
        # a class, so skip exported names that aren't classes.
        if isinstance(obj, type) and issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)

600 

class ParserRejectedMarkup(Exception):
    """An Exception to be raised when the underlying parser simply
    refuses to parse the given markup.
    """

    def __init__(self, message_or_exception):
        """Explain why the parser rejected the given markup, either
        with a textual explanation or another exception.

        :param message_or_exception: A message, or an Exception, which
            is converted to a message of the form
            "ExceptionClassName: exception text".
        """
        if isinstance(message_or_exception, Exception):
            e = message_or_exception
            message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
        super(ParserRejectedMarkup, self).__init__(message_or_exception)

613 

# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass