Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/builder/__init__.py: 79%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

232 statements  

1from __future__ import annotations 

2 

3# Use of this source code is governed by the MIT license. 

4__license__ = "MIT" 

5 

6from collections import defaultdict 

7import re 

8from types import ModuleType 

9from typing import ( 

10 Any, 

11 cast, 

12 Dict, 

13 Iterable, 

14 List, 

15 Optional, 

16 Pattern, 

17 Set, 

18 Tuple, 

19 Type, 

20 TYPE_CHECKING, 

21) 

22import warnings 

23import sys 

24from bs4.element import ( 

25 AttributeDict, 

26 AttributeValueList, 

27 CharsetMetaAttributeValue, 

28 ContentMetaAttributeValue, 

29 RubyParenthesisString, 

30 RubyTextString, 

31 Stylesheet, 

32 Script, 

33 TemplateString, 

34 nonwhitespace_re, 

35) 

36 

37# Exceptions were moved to their own module in 4.13. Import here for 

38# backwards compatibility. 

39from bs4.exceptions import ParserRejectedMarkup 

40 

41from bs4._typing import ( 

42 _AttributeValues, 

43 _RawAttributeValue, 

44) 

45 

46from bs4._warnings import XMLParsedAsHTMLWarning 

47 

48if TYPE_CHECKING: 

49 from bs4 import BeautifulSoup 

50 from bs4.element import ( 

51 NavigableString, 

52 Tag, 

53 ) 

54 from bs4._typing import ( 

55 _AttributeValue, 

56 _Encoding, 

57 _Encodings, 

58 _RawOrProcessedAttributeValues, 

59 _RawMarkup, 

60 ) 

61 

# Some useful features for a TreeBuilder to have.
FAST = "fast"
PERMISSIVE = "permissive"
STRICT = "strict"
XML = "xml"
HTML = "html"
HTML_5 = "html5"

# Public names exported by this module. This is the single, effective
# definition: an earlier assignment to __all__ was unconditionally
# overwritten by this one and has been removed as dead code.
# register_treebuilders_from() appends further names to this list at
# import time.
__all__ = [
    "TreeBuilderRegistry",
    "TreeBuilder",
    "HTMLTreeBuilder",
    "DetectsXMLParsedAsHTML",
    "ParserRejectedMarkup",  # backwards compatibility only as of 4.13.0
]

85 

class TreeBuilderRegistry(object):
    """An index of TreeBuilder subclasses, searchable by the features
    they advertise.

    Both the flat list of builders and each per-feature list are kept
    newest-first: every registration is inserted at position 0.
    """

    builders_for_feature: Dict[str, List[Type[TreeBuilder]]]
    builders: List[Type[TreeBuilder]]

    def __init__(self) -> None:
        self.builders_for_feature = defaultdict(list)
        self.builders = []

    def register(self, treebuilder_class: type[TreeBuilder]) -> None:
        """Add a treebuilder class to the registry.

        :param treebuilder_class: A subclass of `TreeBuilder`. Each entry
         of its `TreeBuilder.features` attribute becomes a key under
         which the class can later be looked up.
        """
        for feature_name in treebuilder_class.features:
            per_feature = self.builders_for_feature[feature_name]
            per_feature.insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features: str) -> Optional[Type[TreeBuilder]]:
        """Find the best registered TreeBuilder subclass for some features.

        :param features: Feature names to look for, in order of
         importance. With no features, the most recently registered
         builder is returned.
        :return: A TreeBuilder subclass, or None if nothing suitable is
         registered.
        """
        if not self.builders:
            # Nothing has been registered at all.
            return None

        if not features:
            # No preference expressed: newest registration wins.
            return self.builders[0]

        # `ranked` is the newest-first builder list for the first feature
        # that any builder advertises; `surviving` is the set of builders
        # that advertise every recognized feature seen so far.
        ranked = None
        surviving = None
        for wanted in features:
            providers = self.builders_for_feature.get(wanted, [])
            if not providers:
                # No registered builder advertises this feature, so it
                # plays no part in narrowing the candidates.
                continue
            if ranked is None:
                ranked = providers
                surviving = set(providers)
            else:
                surviving = surviving & set(providers)

        if ranked is None or surviving is None:
            return None

        # Preserve registration priority: take the first ranked builder
        # that survived every intersection.
        return next(
            (builder for builder in ranked if builder in surviving),
            None,
        )


#: The `BeautifulSoup` constructor will take a list of features
#: and use it to look up `TreeBuilder` classes in this registry.
builder_registry: TreeBuilderRegistry = TreeBuilderRegistry()

157 

158 

class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree.

    This is an abstract superclass which smooths out the behavior of
    different parser libraries into a single, unified interface.

    :param multi_valued_attributes: If this is set to None, the
     TreeBuilder will not turn any values for attributes like
     'class' into lists. Setting this to a dictionary will
     customize this behavior; look at :py:attr:`bs4.builder.HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`
     for an example.

     Internally, these are called "CDATA list attributes", but that
     probably doesn't make sense to an end-user, so the argument name
     is ``multi_valued_attributes``.

    :param preserve_whitespace_tags: A set of tags to treat
     the way <pre> tags are treated in HTML. Tags in this set
     are immune from pretty-printing; their contents will always be
     output as-is.

    :param store_line_numbers: If the parser keeps track of the line
     numbers and positions of the original markup, that information
     will, by default, be stored in each corresponding
     :py:class:`bs4.element.Tag` object. You can turn this off by
     passing store_line_numbers=False; then Tag.sourcepos and
     Tag.sourceline will always be None. If the parser you're using
     doesn't keep track of this information, then store_line_numbers
     is irrelevant.

    :param string_containers: A dictionary mapping tag names to
     the classes that should be instantiated to contain the textual
     contents of those tags. The default is to use NavigableString
     for every tag, no matter what the name. You can override the
     default by changing :py:attr:`DEFAULT_STRING_CONTAINERS`.

    :param empty_element_tags: The set of tag names that
     `can_be_empty_element` will accept. If omitted,
     :py:attr:`DEFAULT_EMPTY_ELEMENT_TAGS` is used; when the resulting
     value is None, every tag name is accepted.

    :param attribute_dict_class: Stored on the builder as
     ``attribute_dict_class``. NOTE(review): presumably the class
     instantiated to hold a tag's attributes -- confirm against the
     code that consumes it.

    :param attribute_value_list_class: The value of a multi-valued
     attribute (such as HTML's 'class') will be stored in an instance
     of this class. The default is Beautiful Soup's built-in
     `AttributeValueList`, which is a normal Python list, and you
     will probably never need to change it.
    """

    # Sentinel meaning "argument not supplied". It is distinct from None,
    # which is a meaningful value for several constructor arguments.
    USE_DEFAULT: Any = object()  #: :meta private:

    def __init__(
        self,
        multi_valued_attributes: Dict[str, Set[str]] = USE_DEFAULT,
        preserve_whitespace_tags: Set[str] = USE_DEFAULT,
        store_line_numbers: bool = USE_DEFAULT,
        string_containers: Dict[str, Type[NavigableString]] = USE_DEFAULT,
        empty_element_tags: Set[str] = USE_DEFAULT,
        attribute_dict_class: Type[AttributeDict] = AttributeDict,
        attribute_value_list_class: Type[AttributeValueList] = AttributeValueList,
    ):
        self.soup = None

        # Every sentinel check below uses identity (`is`). An equality
        # test would let an argument with a permissive __eq__ be mistaken
        # for "not supplied" and silently replaced by the class default.
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes

        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags

        if empty_element_tags is self.USE_DEFAULT:
            self.empty_element_tags = self.DEFAULT_EMPTY_ELEMENT_TAGS
        else:
            self.empty_element_tags = empty_element_tags

        # TODO: store_line_numbers is probably irrelevant now that
        # the behavior of sourceline and sourcepos has been made consistent
        # everywhere.
        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers

        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers

        self.attribute_dict_class = attribute_dict_class
        self.attribute_value_list_class = attribute_value_list_class

    NAME: str = "[Unknown tree builder]"
    ALTERNATE_NAMES: Iterable[str] = []
    features: Iterable[str] = []

    is_xml: bool = False
    picklable: bool = False

    soup: Optional[BeautifulSoup]  #: :meta private:

    #: A tag will be considered an empty-element
    #: tag when and only when it has no contents.
    empty_element_tags: Optional[Set[str]] = None  #: :meta private:
    cdata_list_attributes: Dict[str, Set[str]]  #: :meta private:
    preserve_whitespace_tags: Set[str]  #: :meta private:
    string_containers: Dict[str, Type[NavigableString]]  #: :meta private:
    tracks_line_numbers: bool  #: :meta private:

    #: A value for these tag/attribute combinations is a space- or
    #: comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = defaultdict(set)

    #: Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set()

    #: The textual contents of tags with these names should be
    #: instantiated with some class other than `bs4.element.NavigableString`.
    DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {}

    #: By default, tags are treated as empty-element tags if they have
    #: no contents--that is, using XML rules. HTMLTreeBuilder
    #: defines a different set of DEFAULT_EMPTY_ELEMENT_TAGS based on the
    #: HTML 4 and HTML5 standards.
    DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = None

    #: Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS: bool = False

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self) -> None:
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name: str) -> bool:
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        :return: True if `empty_element_tags` is None or contains
         ``tag_name``; False otherwise.
        """
        if self.empty_element_tags is None:
            # No restriction configured: any tag may be an empty element.
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup: _RawMarkup) -> None:
        """Run incoming markup through some parsing process.

        :raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: The markup that's about to be parsed.
        :param user_specified_encoding: The user asked to try this encoding
           to convert the markup into a Unicode string.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding. NOTE: This argument is not used by the
            calling code and can probably be removed.
        :param exclude_encodings: The user asked *not* to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

            Each 4-tuple represents a strategy that the parser can try
            to convert the document to Unicode and parse it. Each
            strategy will be tried in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.

        :meta private:

        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment: str) -> str:
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of unit tests.

        :param fragment: A fragment of HTML.
        :return: A full HTML document. This base implementation returns
         the fragment unchanged.
        :meta private:
        """
        return fragment

    def set_up_substitutions(self, tag: Tag) -> bool:
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :return: Whether or not a substitution was performed.
        :meta private:
        """
        return False

    def _replace_cdata_list_attribute_values(
        self, tag_name: str, attrs: _RawOrProcessedAttributeValues
    ) -> _AttributeValues:
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
           Any appropriate attribute values will be modified in place.
        :return: The modified dictionary that was originally passed in.
        """

        # First, cast the attrs dict to _AttributeValues. This might
        # not be accurate yet, but it will be by the time this method
        # returns.
        modified_attrs = cast(_AttributeValues, attrs)
        if not modified_attrs or not self.cdata_list_attributes:
            # Nothing to do.
            return modified_attrs

        # There is at least a possibility that we need to modify one of
        # the attribute values. The "*" entry applies to every tag; the
        # tag-specific entry is looked up by lowercased tag name.
        universal: Set[str] = self.cdata_list_attributes.get("*", set())
        tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None)

        # Iterate over a snapshot of the keys, since values are
        # reassigned inside the loop.
        for attr in list(modified_attrs.keys()):
            modified_value: _AttributeValue
            if attr in universal or (tag_specific and attr in tag_specific):
                # We have a "class"-type attribute whose string
                # value is a whitespace-separated list of
                # values. Split it into a list.
                original_value: _AttributeValue = modified_attrs[attr]
                if isinstance(original_value, _RawAttributeValue):
                    # This is a _RawAttributeValue (a string) that
                    # needs to be split and converted to a
                    # AttributeValueList so it can be an
                    # _AttributeValue.
                    modified_value = self.attribute_value_list_class(
                        nonwhitespace_re.findall(original_value)
                    )
                else:
                    # html5lib calls setAttributes twice for the
                    # same tag when rearranging the parse tree. On
                    # the second call the attribute value here is
                    # already a list. This can also happen when a
                    # Tag object is cloned. If this happens, leave
                    # the value alone rather than trying to split
                    # it again.
                    modified_value = original_value
                modified_attrs[attr] = modified_value
        return modified_attrs

437 

438 

class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    This is not currently used for anything, and it will be removed
    soon. It was a good idea, but it wasn't properly integrated into the
    rest of Beautiful Soup, so there have been long stretches where it
    hasn't worked properly.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Emit a DeprecationWarning, then defer to `TreeBuilder.__init__`.

        All positional and keyword arguments are passed through unchanged.
        """
        warnings.warn(
            "The SAXTreeBuilder class was deprecated in 4.13.0 and will be removed soon thereafter. It is completely untested and probably doesn't work; do not use it.",
            DeprecationWarning,
            stacklevel=2,
        )
        super(SAXTreeBuilder, self).__init__(*args, **kwargs)

    def feed(self, markup: _RawMarkup) -> None:
        """Not implemented for this builder.

        :raise NotImplementedError: Always.
        """
        raise NotImplementedError()

    def close(self) -> None:
        """Do nothing."""
        pass

    def startElement(self, name: str, attrs: Dict[str, str]) -> None:
        """Forward an element-start event to the soup.

        :param name: The element name.
        :param attrs: The element's attributes. Keys may be plain
         attribute names, or 2-tuples as forwarded by `startElementNS`,
         in which case only the second item is kept as the name.
        """
        # Namespace-aware events arrive with tuple keys; plain events
        # arrive with string keys. Indexing a string key with [1] would
        # yield its second character (or raise IndexError on a
        # one-character name), so only tuples are unpacked.
        converted = AttributeDict(
            (key[1] if isinstance(key, tuple) else key, value)
            for key, value in attrs.items()
        )
        assert self.soup is not None
        self.soup.handle_starttag(name, None, None, converted)

    def endElement(self, name: str) -> None:
        """Forward an element-end event to the soup."""
        assert self.soup is not None
        self.soup.handle_endtag(name)

    def startElementNS(
        self, nsTuple: Tuple[str, str], nodeName: str, attrs: Dict[str, str]
    ) -> None:
        """Handle a namespaced element-start event by delegating to
        `startElement` with ``nodeName``."""
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple: Tuple[str, str], nodeName: str) -> None:
        """Handle a namespaced element-end event by delegating to
        `endElement` with ``nodeName``."""
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix: str, nodeValue: str) -> None:
        """Do nothing; prefix mappings are ignored for now."""
        pass

    def endPrefixMapping(self, prefix: str) -> None:
        """Do nothing; prefix mappings are ignored for now."""
        pass

    def characters(self, content: str) -> None:
        """Forward character data to the soup."""
        assert self.soup is not None
        self.soup.handle_data(content)

    def startDocument(self) -> None:
        """Do nothing."""
        pass

    def endDocument(self) -> None:
        """Do nothing."""
        pass

502 

503 

class HTMLTreeBuilder(TreeBuilder):
    """A TreeBuilder that carries HTML-specific knowledge: which tags are
    empty elements, which attributes are multi-valued, which tags
    preserve whitespace, and how <meta> encoding declarations are handled.
    """

    #: Some HTML tags are defined as having no contents. Beautiful Soup
    #: treats these specially.
    DEFAULT_EMPTY_ELEMENT_TAGS: Set[str] = {
        # These are from HTML5.
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "keygen",
        "link",
        "menuitem",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
        # These are from earlier versions of HTML and are removed in HTML5.
        "basefont",
        "bgsound",
        "command",
        "frame",
        "image",
        "isindex",
        "nextid",
        "spacer",
    }

    #: The HTML standard defines these tags as block-level elements. Beautiful
    #: Soup does not treat these elements differently from other elements,
    #: but it may do so eventually, and this information is available if
    #: you need to use it.
    DEFAULT_BLOCK_ELEMENTS: Set[str] = {
        "address",
        "article",
        "aside",
        "blockquote",
        "canvas",
        "dd",
        "div",
        "dl",
        "dt",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hr",
        "li",
        "main",
        "nav",
        "noscript",
        "ol",
        "output",
        "p",
        "pre",
        "section",
        "table",
        "tfoot",
        "ul",
        "video",
    }

    #: These HTML tags need special treatment so they can be
    #: represented by a string class other than `bs4.element.NavigableString`.
    #:
    #: For some of these tags, it's because the HTML standard defines
    #: an unusual content model for them. I made this list by going
    #: through the HTML spec
    #: (https://html.spec.whatwg.org/#metadata-content) and looking for
    #: "metadata content" elements that can contain strings.
    #:
    #: The Ruby tags (<rt> and <rp>) are here despite being normal
    #: "phrasing content" tags, because the content they contain is
    #: qualitatively different from other text in the document, and it
    #: can be useful to be able to distinguish it.
    #:
    #: TODO: Arguably <noscript> could go here but it seems
    #: qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {
        "rt": RubyTextString,
        "rp": RubyParenthesisString,
        "style": Stylesheet,
        "script": Script,
        "template": TemplateString,
    }

    #: The HTML standard defines these attributes as containing a
    #: space-separated list of values, not a single value. That is,
    #: class="foo bar" means that the 'class' attribute has two values,
    #: 'foo' and 'bar', not the single value 'foo bar'.  When we
    #: encounter one of these attributes, we will parse its value into
    #: a list of values if possible. Upon output, the list will be
    #: converted back into a string.
    DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = {
        "*": {"class", "accesskey", "dropzone"},
        "a": {"rel", "rev"},
        "link": {"rel", "rev"},
        "td": {"headers"},
        "th": {"headers"},
        "form": {"accept-charset"},
        "object": {"archive"},
        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": {"rel"},
        "icon": {"sizes"},
        "iframe": {"sandbox"},
        "output": {"for"},
    }

    #: By default, whitespace inside these HTML tags will be
    #: preserved rather than being collapsed.
    DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = {"pre", "textarea"}

    def set_up_substitutions(self, tag: Tag) -> bool:
        """Swap the encoding declared in a <meta> tag for a placeholder
        object, to be resolved when the tag is rendered as a string.

        A document may be read in one encoding and written out in
        another, so the <meta> declaration has to be able to change
        with it.

        :return: Whether or not a substitution was performed.

        :meta private:
        """
        # Only <meta> tags can declare an encoding.
        if tag.name != "meta":
            return False

        # TODO: This cast will fail in the (very unlikely) scenario
        # that the programmer who instantiates the TreeBuilder
        # specifies meta['content'] or meta['charset'] as
        # cdata_list_attributes.
        content_attr: Optional[str] = cast(Optional[str], tag.get("content"))
        charset_attr: Optional[str] = cast(Optional[str], tag.get("charset"))

        # meta['http-equiv'] being configured as a cdata_list_attribute
        # (again, very unlikely) is handled by always reading it as a list.
        http_equiv_values: List[str] = tag.get_attribute_list("http-equiv")

        # HTML 5 style takes precedence:
        #  <meta charset="utf8">
        if charset_attr is not None:
            tag["charset"] = CharsetMetaAttributeValue(charset_attr)
            return True

        # HTML 4 style:
        #  <meta http-equiv="content-type" content="text/html; charset=utf8">
        if content_attr is None:
            return False
        declares_content_type = any(
            value.lower() == "content-type" for value in http_equiv_values
        )
        if not declares_content_type:
            return False

        tag["content"] = ContentMetaAttributeValue(content_attr)
        return True

689 

690 

class DetectsXMLParsedAsHTML(object):
    """A mixin class for any class (a TreeBuilder, or some class used by a
    TreeBuilder) that's in a position to detect whether an XML
    document is being incorrectly parsed as HTML, and issue an
    appropriate warning.

    This requires being able to observe an incoming processing
    instruction that might be an XML declaration, and also able to
    observe tags as they're opened. If you can't do that for a given
    `TreeBuilder`, there's a less reliable implementation based on
    examining the raw markup.
    """

    #: Regular expression for seeing if string markup has an <html> tag.
    #: The character after "<" is optional, so a plain "<html" matches
    #: as well as "<" + one character (other than space or "+") + "html".
    LOOKS_LIKE_HTML: Pattern[str] = re.compile(r"<[^ +]?html", re.I)

    #: Regular expression for seeing if byte markup has an <html> tag.
    #: Same pattern as `LOOKS_LIKE_HTML`, for bytes.
    LOOKS_LIKE_HTML_B: Pattern[bytes] = re.compile(rb"<[^ +]?html", re.I)

    #: The start of an XML document string.
    XML_PREFIX: str = "<?xml"

    #: The start of an XML document bytestring.
    XML_PREFIX_B: bytes = b"<?xml"

    # This is typed as str, not `ProcessingInstruction`, because this
    # check may be run before any Beautiful Soup objects are created.
    _first_processing_instruction: Optional[str]  #: :meta private:
    _root_tag_name: Optional[str]  #: :meta private:

    @classmethod
    def warn_if_markup_looks_like_xml(
        cls, markup: Optional[_RawMarkup], stacklevel: int = 3
    ) -> bool:
        """Perform a check on some markup to see if it looks like XML
        that's not XHTML. If so, issue a warning.

        This is much less reliable than doing the check while parsing,
        but some of the tree builders can't do that.

        Only the first 500 characters (or bytes) of the markup are
        examined.

        :param markup: The raw markup, as a string or bytestring, or None.
        :param stacklevel: The stacklevel of the code calling this
         function.

        :return: True if the markup looks like non-XHTML XML, False
         otherwise.
        """
        if markup is None:
            return False
        markup = markup[:500]
        if isinstance(markup, bytes):
            markup_b: bytes = markup
            looks_like_xml = markup_b.startswith(
                cls.XML_PREFIX_B
            ) and not cls.LOOKS_LIKE_HTML_B.search(markup_b)
        else:
            markup_s: str = markup
            looks_like_xml = markup_s.startswith(
                cls.XML_PREFIX
            ) and not cls.LOOKS_LIKE_HTML.search(markup_s)

        if looks_like_xml:
            # Add two frames to the caller-supplied stacklevel before
            # handing it to warnings.warn() via _warn().
            cls._warn(stacklevel=stacklevel + 2)
            return True
        return False

    @classmethod
    def _warn(cls, stacklevel: int = 5) -> None:
        """Issue a warning about XML being parsed as HTML.

        :param stacklevel: Passed through to `warnings.warn`.
        """
        warnings.warn(
            XMLParsedAsHTMLWarning.MESSAGE,
            XMLParsedAsHTMLWarning,
            stacklevel=stacklevel,
        )

    def _initialize_xml_detector(self) -> None:
        """Call this method before parsing a document."""
        self._first_processing_instruction = None
        self._root_tag_name = None

    def _document_might_be_xml(self, processing_instruction: str) -> None:
        """Call this method when encountering an XML declaration, or a
        "processing instruction" that might be an XML declaration.

        This helps Beautiful Soup detect potential issues later, if
        the XML document turns out to be a non-XHTML document that's
        being parsed as HTML.

        :param processing_instruction: The text of the processing
         instruction. Only the first one seen before the root tag is
         remembered.
        """
        if (
            self._first_processing_instruction is not None
            or self._root_tag_name is not None
        ):
            # The document has already started. Don't bother checking
            # anymore.
            return

        self._first_processing_instruction = processing_instruction

        # We won't know until we encounter the first tag whether or
        # not this is actually a problem.

    def _root_tag_encountered(self, name: str) -> None:
        """Call this when you encounter the document's root tag.

        This is where we actually check whether an XML document is
        being incorrectly parsed as HTML, and issue the warning.

        :param name: The name of the root tag.
        """
        if self._root_tag_name is not None:
            # This method was incorrectly called multiple times. Do
            # nothing.
            return

        self._root_tag_name = name

        if (
            name != "html"
            and self._first_processing_instruction is not None
            and self._first_processing_instruction.lower().startswith("xml ")
        ):
            # We encountered an XML declaration and then a tag other
            # than 'html'. This is a reliable indicator that a
            # non-XHTML document is being parsed as HTML.
            # NOTE(review): the fixed stacklevel is presumably tuned to
            # the depth of the parser's call stack -- confirm.
            self._warn(stacklevel=10)

813 

814 

def register_treebuilders_from(module: ModuleType) -> None:
    """Copy TreeBuilders from the given module into this module.

    Every name in ``module.__all__`` that refers to a `TreeBuilder`
    subclass is set as an attribute of this module, appended to this
    module's ``__all__``, and registered with ``builder_registry``.
    Names that refer to anything else are ignored.

    :param module: A module whose ``__all__`` lists its public names.
    """
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when its first argument is not a
        # class, so exported functions or constants are filtered out first.
        if not (isinstance(obj, type) and issubclass(obj, TreeBuilder)):
            continue

        setattr(this_module, name, obj)
        this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)

826 

827 

# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser  # noqa: E402

register_treebuilders_from(_htmlparser)

# For the optional parsers, only the import itself is guarded. Registration
# runs in the `else` branch so that an ImportError raised while registering
# is not mistaken for "library not installed" and silently discarded.
try:
    from . import _html5lib
except ImportError:
    # They don't have html5lib installed.
    pass
else:
    register_treebuilders_from(_html5lib)

try:
    from . import _lxml
except ImportError:
    # They don't have lxml installed.
    pass
else:
    register_treebuilders_from(_lxml)
848 pass