from __future__ import annotations

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

from collections import defaultdict
import re
from types import ModuleType
from typing import (
    Any,
    cast,
    Dict,
    Iterable,
    List,
    Optional,
    Pattern,
    Set,
    Tuple,
    Type,
    TYPE_CHECKING,
)
import warnings
import sys
from bs4.element import (
    AttributeDict,
    AttributeValueList,
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    RubyParenthesisString,
    RubyTextString,
    Stylesheet,
    Script,
    TemplateString,
    nonwhitespace_re,
)

# Exceptions were moved to their own module in 4.13. Import here for
# backwards compatibility.
from bs4.exceptions import ParserRejectedMarkup

from bs4._typing import (
    _AttributeValues,
    _RawAttributeValue,
)

from bs4._warnings import XMLParsedAsHTMLWarning

if TYPE_CHECKING:
    from bs4 import BeautifulSoup
    from bs4.element import (
        NavigableString,
        Tag,
    )
    from bs4._typing import (
        _AttributeValue,
        _Encoding,
        _Encodings,
        _RawOrProcessedAttributeValues,
        _RawMarkup,
    )

__all__ = [
    "DetectsXMLParsedAsHTML",
    "HTMLTreeBuilder",
    "SAXTreeBuilder",
    "TreeBuilder",
    "TreeBuilderRegistry",
    "ParserRejectedMarkup",  # backwards compatibility only as of 4.13.0
]

# Some useful features for a TreeBuilder to have.
FAST = "fast"
PERMISSIVE = "permissive"
STRICT = "strict"
XML = "xml"
HTML = "html"
HTML_5 = "html5"


class TreeBuilderRegistry(object):
    """A way of looking up TreeBuilder subclasses by their name or by desired
    features.
    """

    builders_for_feature: Dict[str, List[Type[TreeBuilder]]]
    builders: List[Type[TreeBuilder]]

    def __init__(self) -> None:
        self.builders_for_feature = defaultdict(list)
        self.builders = []

    def register(self, treebuilder_class: type[TreeBuilder]) -> None:
        """Register a treebuilder based on its advertised features.

        :param treebuilder_class: A subclass of `TreeBuilder`. Its
            `TreeBuilder.features` attribute should list its features.
        """
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features: str) -> Optional[Type[TreeBuilder]]:
        """Look up a TreeBuilder subclass with the desired features.

        :param features: A list of features to look for. If none are
            provided, the most recently registered TreeBuilder subclass
            will be used.
        :return: A TreeBuilder subclass, or None if there's no
            registered subclass with all the requested features.
        """
        if len(self.builders) == 0:
            # There are no builders at all.
            return None

        if len(features) == 0:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        feature_list = list(features)
        feature_list.reverse()
        candidates = None
        candidate_set = None
        while len(feature_list) > 0:
            feature = feature_list.pop()
            we_have_the_feature = self.builders_for_feature.get(feature, [])
            if len(we_have_the_feature) > 0:
                if candidates is None:
                    candidates = we_have_the_feature
                    candidate_set = set(candidates)
                else:
                    # Eliminate any candidates that don't have this feature.
                    candidate_set = candidate_set.intersection(set(we_have_the_feature))

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None or candidates is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None


#: The `BeautifulSoup` constructor will take a list of features
#: and use it to look up `TreeBuilder` classes in this registry.
builder_registry: TreeBuilderRegistry = TreeBuilderRegistry()
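
# Illustrative sketch: the BeautifulSoup constructor uses this registry to
# turn a feature list like ["html", "fast"] into a concrete builder class.
# Which class comes back depends on which parsers are installed, so the
# results suggested below are assumptions rather than guarantees.
#
#   >>> from bs4.builder import builder_registry
#   >>> builder_registry.lookup("html")           # most recently registered HTML builder
#   >>> builder_registry.lookup("html", "fast")   # e.g. an lxml-based builder, if available
#   >>> builder_registry.lookup("no-such-feature") is None
#   True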


class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree.

    This is an abstract superclass which smooths out the behavior of
    different parser libraries into a single, unified interface.

    :param multi_valued_attributes: If this is set to None, the
        TreeBuilder will not turn any values for attributes like
        'class' into lists. Setting this to a dictionary will
        customize this behavior; look at :py:attr:`bs4.builder.HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`
        for an example.

        Internally, these are called "CDATA list attributes", but that
        probably doesn't make sense to an end-user, so the argument name
        is ``multi_valued_attributes``.

    :param preserve_whitespace_tags: A set of tags to treat
        the way <pre> tags are treated in HTML. Tags in this set
        are immune from pretty-printing; their contents will always be
        output as-is.

    :param empty_element_tags: A set of tags to treat as empty-element
        tags: tags with no contents that can be rendered as
        self-closing, e.g. "<br/>". If this is None, a tag is treated
        as an empty-element tag whenever it has no contents.

    :param string_containers: A dictionary mapping tag names to
        the classes that should be instantiated to contain the textual
        contents of those tags. The default is to use NavigableString
        for every tag, no matter what the name. You can override the
        default by changing :py:attr:`DEFAULT_STRING_CONTAINERS`.

    :param store_line_numbers: If the parser keeps track of the line
        numbers and positions of the original markup, that information
        will, by default, be stored in each corresponding
        :py:class:`bs4.element.Tag` object. You can turn this off by
        passing store_line_numbers=False; then Tag.sourcepos and
        Tag.sourceline will always be None. If the parser you're using
        doesn't keep track of this information, then store_line_numbers
        is irrelevant.

    :param attribute_dict_class: A tag's attributes will be stored in
        an instance of this class. The default is Beautiful Soup's
        built-in `AttributeDict`, a dict subclass, and you will
        probably never need to change it.

    :param attribute_value_list_class: The value of a multi-valued
        attribute (such as HTML's 'class') will be stored in an
        instance of this class. The default is Beautiful Soup's
        built-in `AttributeValueList`, which is a normal Python list,
        and you will probably never need to change it.
    """
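
    # Illustrative sketch: a TreeBuilder is rarely constructed by hand.
    # The BeautifulSoup constructor forwards extra keyword arguments to
    # the builder it instantiates, so the options above are usually
    # supplied like this (the parser name "html.parser" is just an
    # example):
    #
    #   >>> from bs4 import BeautifulSoup
    #   >>> soup = BeautifulSoup('<a class="foo bar">x</a>', "html.parser",
    #   ...                      multi_valued_attributes=None)
    #   >>> soup.a["class"]     # left as a single string, not split into a list
    #   'foo bar'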

    USE_DEFAULT: Any = object()  #: :meta private:

    def __init__(
        self,
        multi_valued_attributes: Dict[str, Set[str]] = USE_DEFAULT,
        preserve_whitespace_tags: Set[str] = USE_DEFAULT,
        store_line_numbers: bool = USE_DEFAULT,
        string_containers: Dict[str, Type[NavigableString]] = USE_DEFAULT,
        empty_element_tags: Set[str] = USE_DEFAULT,
        attribute_dict_class: Type[AttributeDict] = AttributeDict,
        attribute_value_list_class: Type[AttributeValueList] = AttributeValueList,
    ):
        self.soup = None
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes
        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags
        if empty_element_tags is self.USE_DEFAULT:
            self.empty_element_tags = self.DEFAULT_EMPTY_ELEMENT_TAGS
        else:
            self.empty_element_tags = empty_element_tags
        # TODO: store_line_numbers is probably irrelevant now that
        # the behavior of sourceline and sourcepos has been made consistent
        # everywhere.
        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers
        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers
        self.attribute_dict_class = attribute_dict_class
        self.attribute_value_list_class = attribute_value_list_class

    NAME: str = "[Unknown tree builder]"
    ALTERNATE_NAMES: Iterable[str] = []
    features: Iterable[str] = []

    is_xml: bool = False
    picklable: bool = False

    soup: Optional[BeautifulSoup]  #: :meta private:

    #: A tag will be considered an empty-element
    #: tag when and only when it has no contents.
    empty_element_tags: Optional[Set[str]] = None  #: :meta private:
    cdata_list_attributes: Dict[str, Set[str]]  #: :meta private:
    preserve_whitespace_tags: Set[str]  #: :meta private:
    string_containers: Dict[str, Type[NavigableString]]  #: :meta private:
    store_line_numbers: bool  #: :meta private:

    #: A value for these tag/attribute combinations is a space- or
    #: comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = defaultdict(set)

    #: Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set()

    #: The textual contents of tags with these names should be
    #: instantiated with some class other than `bs4.element.NavigableString`.
    DEFAULT_STRING_CONTAINERS: Dict[str, Type[NavigableString]] = {}

    #: By default, tags are treated as empty-element tags if they have
    #: no contents--that is, using XML rules. HTMLTreeBuilder
    #: defines a different set of DEFAULT_EMPTY_ELEMENT_TAGS based on the
    #: HTML 4 and HTML5 standards.
    DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = None

    #: Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS: bool = False

    def initialize_soup(self, soup: BeautifulSoup) -> None:
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self) -> None:
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name: str) -> bool:
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLTreeBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLTreeBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags
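
    # Illustrative sketch: how can_be_empty_element() shows up in output
    # (the parser name "html.parser" is just an example):
    #
    #   >>> from bs4 import BeautifulSoup
    #   >>> str(BeautifulSoup("<p></p><br>", "html.parser"))
    #   '<p></p><br/>'
    #
    # <p> is not an empty-element tag for HTML builders, so the empty tag
    # keeps its closing tag; <br> is, so it is rendered as self-closing.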

    def feed(self, markup: _RawMarkup) -> None:
        """Run incoming markup through some parsing process."""
        raise NotImplementedError()

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: The markup that's about to be parsed.
        :param user_specified_encoding: The user asked to try this encoding
            to convert the markup into a Unicode string.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding. NOTE: This argument is not used by the
            calling code and can probably be removed.
        :param exclude_encodings: The user asked *not* to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
            has undergone character replacement)

            Each 4-tuple represents a strategy that the parser can try
            to convert the document to Unicode and parse it. Each
            strategy will be tried in turn.

            By default, the only strategy is to parse the markup
            as-is. See `LXMLTreeBuilderForXML` and
            `HTMLParserTreeBuilder` for implementations that take into
            account the quirks of particular parsers.

        :meta private:
        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment: str) -> str:
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of unit tests.

        :param fragment: A fragment of HTML.
        :return: A full HTML document.
        :meta private:
        """
        return fragment

    def set_up_substitutions(self, tag: Tag) -> bool:
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :return: Whether or not a substitution was performed.
        :meta private:
        """
        return False

    def _replace_cdata_list_attribute_values(
        self, tag_name: str, attrs: _RawOrProcessedAttributeValues
    ) -> _AttributeValues:
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
            Any appropriate attribute values will be modified in place.
        :return: The modified dictionary that was originally passed in.
        """

        # First, cast the attrs dict to _AttributeValues. This might
        # not be accurate yet, but it will be by the time this method
        # returns.
        modified_attrs = cast(_AttributeValues, attrs)
        if not modified_attrs or not self.cdata_list_attributes:
            # Nothing to do.
            return modified_attrs

        # There is at least a possibility that we need to modify one of
        # the attribute values.
        universal: Set[str] = self.cdata_list_attributes.get("*", set())
        tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None)
        for attr in list(modified_attrs.keys()):
            modified_value: _AttributeValue
            if attr in universal or (tag_specific and attr in tag_specific):
                # We have a "class"-type attribute whose string
                # value is a whitespace-separated list of
                # values. Split it into a list.
                original_value: _AttributeValue = modified_attrs[attr]
                if isinstance(original_value, _RawAttributeValue):
                    # This is a _RawAttributeValue (a string) that
                    # needs to be split and converted to an
                    # AttributeValueList so it can be an
                    # _AttributeValue.
                    modified_value = self.attribute_value_list_class(
                        nonwhitespace_re.findall(original_value)
                    )
                else:
                    # html5lib calls setAttributes twice for the
                    # same tag when rearranging the parse tree. On
                    # the second call the attribute value here is
                    # already a list. This can also happen when a
                    # Tag object is cloned. If this happens, leave
                    # the value alone rather than trying to split
                    # it again.
                    modified_value = original_value
                modified_attrs[attr] = modified_value
        return modified_attrs
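
    # Illustrative sketch of the effect of the method above, as seen
    # through the public API (the parser name "html.parser" is just an
    # example):
    #
    #   >>> from bs4 import BeautifulSoup
    #   >>> soup = BeautifulSoup('<a class="foo bar" href="/x">link</a>',
    #   ...                      "html.parser")
    #   >>> soup.a["class"]     # 'class' is a multi-valued attribute
    #   ['foo', 'bar']
    #   >>> soup.a["href"]      # 'href' is not, so it stays a string
    #   '/x'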


class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    This is not currently used for anything, and it will be removed
    soon. It was a good idea, but it wasn't properly integrated into the
    rest of Beautiful Soup, so there have been long stretches where it
    hasn't worked properly.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        warnings.warn(
            "The SAXTreeBuilder class was deprecated in 4.13.0 and will be removed soon thereafter. It is completely untested and probably doesn't work; do not use it.",
            DeprecationWarning,
            stacklevel=2,
        )
        super(SAXTreeBuilder, self).__init__(*args, **kwargs)

    def feed(self, markup: _RawMarkup) -> None:
        raise NotImplementedError()

    def close(self) -> None:
        pass

    def startElement(self, name: str, attrs: Dict[str, str]) -> None:
        attrs = AttributeDict((key[1], value) for key, value in list(attrs.items()))
        # print("Start %s, %r" % (name, attrs))
        assert self.soup is not None
        self.soup.handle_starttag(name, None, None, attrs)

    def endElement(self, name: str) -> None:
        # print("End %s" % name)
        assert self.soup is not None
        self.soup.handle_endtag(name)

    def startElementNS(
        self, nsTuple: Tuple[str, str], nodeName: str, attrs: Dict[str, str]
    ) -> None:
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple: Tuple[str, str], nodeName: str) -> None:
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)
        # handler.endElementNS((ns, node.nodeName), node.nodeName)

    def startPrefixMapping(self, prefix: str, nodeValue: str) -> None:
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix: str) -> None:
        # Ignore the prefix for now.
        # handler.endPrefixMapping(prefix)
        pass

    def characters(self, content: str) -> None:
        assert self.soup is not None
        self.soup.handle_data(content)

    def startDocument(self) -> None:
        pass

    def endDocument(self) -> None:
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML, such as which tags are treated
    specially by the HTML standard.
    """

    #: Some HTML tags are defined as having no contents. Beautiful Soup
    #: treats these specially.
    DEFAULT_EMPTY_ELEMENT_TAGS: Set[str] = set(
        [
            # These are from HTML5.
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "keygen",
            "link",
            "menuitem",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
            # These are from earlier versions of HTML and are removed in HTML5.
            "basefont",
            "bgsound",
            "command",
            "frame",
            "image",
            "isindex",
            "nextid",
            "spacer",
        ]
    )

    #: The HTML standard defines these tags as block-level elements. Beautiful
    #: Soup does not treat these elements differently from other elements,
    #: but it may do so eventually, and this information is available if
    #: you need to use it.
    DEFAULT_BLOCK_ELEMENTS: Set[str] = set(
        [
            "address",
            "article",
            "aside",
            "blockquote",
            "canvas",
            "dd",
            "div",
            "dl",
            "dt",
            "fieldset",
            "figcaption",
            "figure",
            "footer",
            "form",
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "header",
            "hr",
            "li",
            "main",
            "nav",
            "noscript",
            "ol",
            "output",
            "p",
            "pre",
            "section",
            "table",
            "tfoot",
            "ul",
            "video",
        ]
    )

    #: These HTML tags need special treatment so they can be
    #: represented by a string class other than `bs4.element.NavigableString`.
    #:
    #: For some of these tags, it's because the HTML standard defines
    #: an unusual content model for them. I made this list by going
    #: through the HTML spec
    #: (https://html.spec.whatwg.org/#metadata-content) and looking for
    #: "metadata content" elements that can contain strings.
    #:
    #: The Ruby tags (<rt> and <rp>) are here despite being normal
    #: "phrasing content" tags, because the content they contain is
    #: qualitatively different from other text in the document, and it
    #: can be useful to be able to distinguish it.
    #:
    #: TODO: Arguably <noscript> could go here but it seems
    #: qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS: Dict[str, Type[NavigableString]] = {
        "rt": RubyTextString,
        "rp": RubyParenthesisString,
        "style": Stylesheet,
        "script": Script,
        "template": TemplateString,
    }
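
    # Illustrative sketch: with these defaults, the text inside a <style>
    # tag is a Stylesheet object rather than a plain NavigableString
    # (the parser name "html.parser" is just an example):
    #
    #   >>> from bs4 import BeautifulSoup
    #   >>> soup = BeautifulSoup("<style>p {color: red}</style>", "html.parser")
    #   >>> type(soup.style.string).__name__
    #   'Stylesheet'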

    #: The HTML standard defines these attributes as containing a
    #: space-separated list of values, not a single value. That is,
    #: class="foo bar" means that the 'class' attribute has two values,
    #: 'foo' and 'bar', not the single value 'foo bar'. When we
    #: encounter one of these attributes, we will parse its value into
    #: a list of values if possible. Upon output, the list will be
    #: converted back into a string.
    DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = {
        "*": {"class", "accesskey", "dropzone"},
        "a": {"rel", "rev"},
        "link": {"rel", "rev"},
        "td": {"headers"},
        "th": {"headers"},
        "form": {"accept-charset"},
        "object": {"archive"},
        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": {"rel"},
        "icon": {"sizes"},
        "iframe": {"sandbox"},
        "output": {"for"},
    }

    #: By default, whitespace inside these HTML tags will be
    #: preserved rather than being collapsed.
    DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set(["pre", "textarea"])

    def set_up_substitutions(self, tag: Tag) -> bool:
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup in one encoding
        but be output in a different one, and the <meta> tag needs to
        be changed to reflect this.

        :return: Whether or not a substitution was performed.

        :meta private:
        """
        # We are only interested in <meta> tags.
        if tag.name != "meta":
            return False

        # TODO: This cast will fail in the (very unlikely) scenario
        # that the programmer who instantiates the TreeBuilder
        # specifies meta['content'] or meta['charset'] as
        # cdata_list_attributes.
        content: Optional[str] = cast(Optional[str], tag.get("content"))
        charset: Optional[str] = cast(Optional[str], tag.get("charset"))

        # But we can accommodate meta['http-equiv'] being made a
        # cdata_list_attribute (again, very unlikely) without much
        # trouble.
        http_equiv: List[str] = tag.get_attribute_list("http-equiv")

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        substituted = False
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag["charset"] = CharsetMetaAttributeValue(charset)
            substituted = True

        elif content is not None and any(
            x.lower() == "content-type" for x in http_equiv
        ):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag["content"] = ContentMetaAttributeValue(content)
            substituted = True

        return substituted
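
    # Illustrative sketch: because of the substitution above, re-encoding
    # a document rewrites the declared charset to match the new output
    # encoding (the parser name "html.parser" is just an example):
    #
    #   >>> from bs4 import BeautifulSoup
    #   >>> soup = BeautifulSoup('<meta charset="ISO-8859-1">', "html.parser")
    #   >>> soup.meta.encode("utf-8")
    #   b'<meta charset="utf-8"/>'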


class DetectsXMLParsedAsHTML(object):
    """A mixin class for any class (a TreeBuilder, or some class used by a
    TreeBuilder) that's in a position to detect whether an XML
    document is being incorrectly parsed as HTML, and issue an
    appropriate warning.

    This requires being able to observe an incoming processing
    instruction that might be an XML declaration, and also able to
    observe tags as they're opened. If you can't do that for a given
    `TreeBuilder`, there's a less reliable implementation based on
    examining the raw markup.
    """

    #: Regular expression for seeing if string markup has an <html> tag.
    LOOKS_LIKE_HTML: Pattern[str] = re.compile("<[^ +]html", re.I)

    #: Regular expression for seeing if byte markup has an <html> tag.
    LOOKS_LIKE_HTML_B: Pattern[bytes] = re.compile(b"<[^ +]html", re.I)

    #: The start of an XML document string.
    XML_PREFIX: str = "<?xml"

    #: The start of an XML document bytestring.
    XML_PREFIX_B: bytes = b"<?xml"

    # This is typed as str, not `ProcessingInstruction`, because this
    # check may be run before any Beautiful Soup objects are created.
    _first_processing_instruction: Optional[str]  #: :meta private:
    _root_tag_name: Optional[str]  #: :meta private:

    @classmethod
    def warn_if_markup_looks_like_xml(
        cls, markup: Optional[_RawMarkup], stacklevel: int = 3
    ) -> bool:
        """Perform a check on some markup to see if it looks like XML
        that's not XHTML. If so, issue a warning.

        This is much less reliable than doing the check while parsing,
        but some of the tree builders can't do that.

        :param stacklevel: The stacklevel of the code calling this
            function.

        :return: True if the markup looks like non-XHTML XML, False
            otherwise.
        """
        if markup is None:
            return False
        markup = markup[:500]
        if isinstance(markup, bytes):
            markup_b: bytes = markup
            looks_like_xml = markup_b.startswith(
                cls.XML_PREFIX_B
            ) and not cls.LOOKS_LIKE_HTML_B.search(markup)
        else:
            markup_s: str = markup
            looks_like_xml = markup_s.startswith(
                cls.XML_PREFIX
            ) and not cls.LOOKS_LIKE_HTML.search(markup)

        if looks_like_xml:
            cls._warn(stacklevel=stacklevel + 2)
            return True
        return False
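
    # Illustrative sketch of the markup-based check (issuing an
    # XMLParsedAsHTMLWarning is a side effect of the True case):
    #
    #   >>> DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
    #   ...     '<?xml version="1.0"?><catalog/>')
    #   True
    #   >>> DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
    #   ...     '<!DOCTYPE html><html></html>')
    #   False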

    @classmethod
    def _warn(cls, stacklevel: int = 5) -> None:
        """Issue a warning about XML being parsed as HTML."""
        warnings.warn(
            XMLParsedAsHTMLWarning.MESSAGE,
            XMLParsedAsHTMLWarning,
            stacklevel=stacklevel,
        )

    def _initialize_xml_detector(self) -> None:
        """Call this method before parsing a document."""
        self._first_processing_instruction = None
        self._root_tag_name = None

    def _document_might_be_xml(self, processing_instruction: str) -> None:
        """Call this method when encountering an XML declaration, or a
        "processing instruction" that might be an XML declaration.

        This helps Beautiful Soup detect potential issues later, if
        the document turns out to be a non-XHTML XML document that's
        being parsed as HTML.
        """
        if (
            self._first_processing_instruction is not None
            or self._root_tag_name is not None
        ):
            # The document has already started. Don't bother checking
            # anymore.
            return

        self._first_processing_instruction = processing_instruction

        # We won't know until we encounter the first tag whether or
        # not this is actually a problem.

    def _root_tag_encountered(self, name: str) -> None:
        """Call this when you encounter the document's root tag.

        This is where we actually check whether an XML document is
        being incorrectly parsed as HTML, and issue the warning.
        """
        if self._root_tag_name is not None:
            # This method was incorrectly called multiple times. Do
            # nothing.
            return

        self._root_tag_name = name

        if (
            name != "html"
            and self._first_processing_instruction is not None
            and self._first_processing_instruction.lower().startswith("xml ")
        ):
            # We encountered an XML declaration and then a tag other
            # than 'html'. This is a reliable indicator that a
            # non-XHTML document is being parsed as HTML.
            self._warn(stacklevel=10)


def register_treebuilders_from(module: ModuleType) -> None:
    """Copy TreeBuilders from the given module into this module."""
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        if issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)


# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser  # noqa: E402

register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib

    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml

    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
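

if __name__ == "__main__":
    # Illustrative sketch (not part of the library): list which builder the
    # registry would pick for each feature constant defined above. Exactly
    # which classes appear depends on whether lxml and html5lib are
    # installed, so no particular output is guaranteed.
    for feature in (HTML, HTML_5, XML, FAST, STRICT, PERMISSIVE):
        builder_class = builder_registry.lookup(feature)
        name = builder_class.NAME if builder_class is not None else None
        print(f"{feature!r}: {name!r}")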