Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/__init_

1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".

3http://www.crummy.com/software/BeautifulSoup/

5Beautiful Soup uses a pluggable XML or HTML parser to parse a

6(possibly invalid) document into a tree representation. Beautiful Soup

7provides methods and Pythonic idioms that make it easy to navigate,

8search, and modify the parse tree.

10Beautiful Soup works with Python 3.7 and up. It works better if lxml

11and/or html5lib is installed, but they are not required.

13For more than you ever wanted to know about Beautiful Soup, see the

14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/

15"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.13.5"

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

# The names exported by ``from bs4 import *``. Each name is listed exactly
# once; warning classes are grouped under "Warnings" below.
__all__ = [
    "BeautifulSoup",
    "Comment",
    "Declaration",
    "ProcessingInstruction",
    "ResultSet",
    "CSS",
    "Script",
    "Stylesheet",
    "Tag",
    "TemplateString",
    "ElementFilter",
    "UnicodeDammit",
    "CData",
    "Doctype",
    # Exceptions
    "FeatureNotFound",
    "ParserRejectedMarkup",
    "StopParsing",
    # Warnings
    "AttributeResemblesVariableWarning",
    "GuessedAtParserWarning",
    "MarkupResemblesLocatorWarning",
    "UnusualUsageWarning",
    "XMLParsedAsHTMLWarning",
]

53from collections import Counter

54import io

55import sys

56import warnings

# Before anything else, fail with an actionable message if this
# Python 3-only package is being imported by a Python 2 interpreter.
if sys.version_info[0] < 3:
    raise ImportError(
        "You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3."
    )

65from .builder import (

66 builder_registry,

67 TreeBuilder,

68)

69from .builder._htmlparser import HTMLParserTreeBuilder

70from .dammit import UnicodeDammit

71from .css import CSS

72from ._deprecation import (

73 _deprecated,

74)

75from .element import (

76 CData,

77 Comment,

78 DEFAULT_OUTPUT_ENCODING,

79 Declaration,

80 Doctype,

81 NavigableString,

82 PageElement,

83 ProcessingInstruction,

84 PYTHON_SPECIFIC_ENCODINGS,

85 ResultSet,

86 Script,

87 Stylesheet,

88 Tag,

89 TemplateString,

90)

91from .formatter import Formatter

92from .filter import (

93 ElementFilter,

94 SoupStrainer,

95)

96from typing import (

97 Any,

98 cast,

99 Counter as CounterType,

100 Dict,

101 Iterator,

102 List,

103 Sequence,

104 Sized,

105 Optional,

106 Type,

107 Union,

108)

109

110from bs4._typing import (

111 _Encoding,

112 _Encodings,

113 _IncomingMarkup,

114 _InsertableElement,

115 _RawAttributeValue,

116 _RawAttributeValues,

117 _RawMarkup,

118)

119

120# Import all warnings and exceptions into the main package.

121from bs4.exceptions import (

122 FeatureNotFound,

123 ParserRejectedMarkup,

124 StopParsing,

125)

126from bs4._warnings import (

127 AttributeResemblesVariableWarning,

128 GuessedAtParserWarning,

129 MarkupResemblesLocatorWarning,

130 UnusualUsageWarning,

131 XMLParsedAsHTMLWarning,

132)

133

134

135class BeautifulSoup(Tag):

136 """A data structure representing a parsed HTML or XML document.

137

138 Most of the methods you'll call on a BeautifulSoup object are inherited from

139 PageElement or Tag.

140

141 Internally, this class defines the basic interface called by the

142 tree builders when converting an HTML/XML document into a data

143 structure. The interface abstracts away the differences between

144 parsers. To write a new tree builder, you'll need to understand

145 these methods as a whole.

146

147 These methods will be called by the BeautifulSoup constructor:

148 * reset()

149 * feed(markup)

150

151 The tree builder may call these methods from its feed() implementation:

152 * handle_starttag(name, attrs) # See note about return value

153 * handle_endtag(name)

154 * handle_data(data) # Appends to the current data node

155 * endData(containerClass) # Ends the current data node

156

157 No matter how complicated the underlying parser is, you should be

158 able to build a tree using 'start tag' events, 'end tag' events,

159 'data' events, and "done with data" events.

160

161 If you encounter an empty-element tag (aka a self-closing tag,

162 like HTML's <br> tag), call handle_starttag and then

163 handle_endtag.

164 """

165

166 #: Since `BeautifulSoup` subclasses `Tag`, it's possible to treat it as

167 #: a `Tag` with a `Tag.name`. However, this name makes it clear the

168 #: `BeautifulSoup` object isn't a real markup tag.

169 ROOT_TAG_NAME: str = "[document]"

170

171 #: If the end-user gives no indication which tree builder they

172 #: want, look for one with these features.

173 DEFAULT_BUILDER_FEATURES: Sequence[str] = ["html", "fast"]

174

175 #: A string containing all ASCII whitespace characters, used

176 #: during parsing to detect data chunks that seem 'empty'.

177 ASCII_SPACES: str = "\x20\x0a\x09\x0c\x0d"

178

179 # FUTURE PYTHON:

180 element_classes: Dict[Type[PageElement], Type[PageElement]] #: :meta private:

181 builder: TreeBuilder #: :meta private:

182 is_xml: bool

183 known_xml: Optional[bool]

184 parse_only: Optional[SoupStrainer] #: :meta private:

185

186 # These members are only used while parsing markup.

187 markup: Optional[_RawMarkup] #: :meta private:

188 current_data: List[str] #: :meta private:

189 currentTag: Optional[Tag] #: :meta private:

190 tagStack: List[Tag] #: :meta private:

191 open_tag_counter: CounterType[str] #: :meta private:

192 preserve_whitespace_tag_stack: List[Tag] #: :meta private:

193 string_container_stack: List[Tag] #: :meta private:

194 _most_recent_element: Optional[PageElement] #: :meta private:

195

196 #: Beautiful Soup's best guess as to the character encoding of the

197 #: original document.

198 original_encoding: Optional[_Encoding]

199

200 #: The character encoding, if any, that was explicitly defined

201 #: in the original document. This may or may not match

202 #: `BeautifulSoup.original_encoding`.

203 declared_html_encoding: Optional[_Encoding]

204

205 #: This is True if the markup that was parsed contains

206 #: U+FFFD REPLACEMENT_CHARACTER characters which were not present

207 #: in the original markup. These mark character sequences that

208 #: could not be represented in Unicode.

209 contains_replacement_characters: bool

210

211 def __init__(

212 self,

213 markup: _IncomingMarkup = "",

214 features: Optional[Union[str, Sequence[str]]] = None,

215 builder: Optional[Union[TreeBuilder, Type[TreeBuilder]]] = None,

216 parse_only: Optional[SoupStrainer] = None,

217 from_encoding: Optional[_Encoding] = None,

218 exclude_encodings: Optional[_Encodings] = None,

219 element_classes: Optional[Dict[Type[PageElement], Type[PageElement]]] = None,

220 **kwargs: Any,

221 ):

222 """Constructor.

223

224 :param markup: A string or a file-like object representing

225 markup to be parsed.

226

227 :param features: Desirable features of the parser to be

228 used. This may be the name of a specific parser ("lxml",

229 "lxml-xml", "html.parser", or "html5lib") or it may be the

230 type of markup to be used ("html", "html5", "xml"). It's

231 recommended that you name a specific parser, so that

232 Beautiful Soup gives you the same results across platforms

233 and virtual environments.

234

235 :param builder: A TreeBuilder subclass to instantiate (or

236 instance to use) instead of looking one up based on

237 `features`. You only need to use this if you've implemented a

238 custom TreeBuilder.

239

240 :param parse_only: A SoupStrainer. Only parts of the document

241 matching the SoupStrainer will be considered. This is useful

242 when parsing part of a document that would otherwise be too

243 large to fit into memory.

244

245 :param from_encoding: A string indicating the encoding of the

246 document to be parsed. Pass this in if Beautiful Soup is

247 guessing wrongly about the document's encoding.

248

249 :param exclude_encodings: A list of strings indicating

250 encodings known to be wrong. Pass this in if you don't know

251 the document's encoding but you know Beautiful Soup's guess is

252 wrong.

253

254 :param element_classes: A dictionary mapping BeautifulSoup

255 classes like Tag and NavigableString, to other classes you'd

256 like to be instantiated instead as the parse tree is

257 built. This is useful for subclassing Tag or NavigableString

258 to modify default behavior.

259

260 :param kwargs: For backwards compatibility purposes, the

261 constructor accepts certain keyword arguments used in

262 Beautiful Soup 3. None of these arguments do anything in

263 Beautiful Soup 4; they will result in a warning and then be

264 ignored.

265

266 Apart from this, any keyword arguments passed into the

267 BeautifulSoup constructor are propagated to the TreeBuilder

268 constructor. This makes it possible to configure a

269 TreeBuilder by passing in arguments, not just by saying which

270 one to use.

271 """

272 if "convertEntities" in kwargs:

273 del kwargs["convertEntities"]

274 warnings.warn(

275 "BS4 does not respect the convertEntities argument to the "

276 "BeautifulSoup constructor. Entities are always converted "

277 "to Unicode characters."

278 )

279

280 if "markupMassage" in kwargs:

281 del kwargs["markupMassage"]

282 warnings.warn(

283 "BS4 does not respect the markupMassage argument to the "

284 "BeautifulSoup constructor. The tree builder is responsible "

285 "for any necessary markup massage."

286 )

287

288 if "smartQuotesTo" in kwargs:

289 del kwargs["smartQuotesTo"]

290 warnings.warn(

291 "BS4 does not respect the smartQuotesTo argument to the "

292 "BeautifulSoup constructor. Smart quotes are always converted "

293 "to Unicode characters."

294 )

295

296 if "selfClosingTags" in kwargs:

297 del kwargs["selfClosingTags"]

298 warnings.warn(

299 "Beautiful Soup 4 does not respect the selfClosingTags argument to the "

300 "BeautifulSoup constructor. The tree builder is responsible "

301 "for understanding self-closing tags."

302 )

303

304 if "isHTML" in kwargs:

305 del kwargs["isHTML"]

306 warnings.warn(

307 "Beautiful Soup 4 does not respect the isHTML argument to the "

308 "BeautifulSoup constructor. Suggest you use "

309 "features='lxml' for HTML and features='lxml-xml' for "

310 "XML."

311 )

312

313 def deprecated_argument(old_name: str, new_name: str) -> Optional[Any]:

314 if old_name in kwargs:

315 warnings.warn(

316 'The "%s" argument to the BeautifulSoup constructor '

317 'was renamed to "%s" in Beautiful Soup 4.0.0'

318 % (old_name, new_name),

319 DeprecationWarning,

320 stacklevel=3,

321 )

322 return kwargs.pop(old_name)

323 return None

324

325 parse_only = parse_only or deprecated_argument("parseOnlyThese", "parse_only")

326 if parse_only is not None:

327 # Issue a warning if we can tell in advance that

328 # parse_only will exclude the entire tree.

329 if parse_only.excludes_everything:

330 warnings.warn(

331 f"The given value for parse_only will exclude everything: {parse_only}",

332 UserWarning,

333 stacklevel=3,

334 )

335

336 from_encoding = from_encoding or deprecated_argument(

337 "fromEncoding", "from_encoding"

338 )

339

340 if from_encoding and isinstance(markup, str):

341 warnings.warn(

342 "You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored."

343 )

344 from_encoding = None

345

346 self.element_classes = element_classes or dict()

347

348 # We need this information to track whether or not the builder

349 # was specified well enough that we can omit the 'you need to

350 # specify a parser' warning.

351 original_builder = builder

352 original_features = features

353

354 builder_class: Optional[Type[TreeBuilder]] = None

355 if isinstance(builder, type):

356 # A builder class was passed in; it needs to be instantiated.

357 builder_class = builder

358 builder = None

359 elif builder is None:

360 if isinstance(features, str):

361 features = [features]

362 if features is None or len(features) == 0:

363 features = self.DEFAULT_BUILDER_FEATURES

364 possible_builder_class = builder_registry.lookup(*features)

365 if possible_builder_class is None:

366 raise FeatureNotFound(

367 "Couldn't find a tree builder with the features you "

368 "requested: %s. Do you need to install a parser library?"

369 % ",".join(features)

370 )

371 builder_class = possible_builder_class

372

373 # At this point either we have a TreeBuilder instance in

374 # builder, or we have a builder_class that we can instantiate

375 # with the remaining **kwargs.

376 if builder is None:

377 assert builder_class is not None

378 builder = builder_class(**kwargs)

379 if (

380 not original_builder

381 and not (

382 original_features == builder.NAME

383 or (

384 isinstance(original_features, str)

385 and original_features in builder.ALTERNATE_NAMES

386 )

387 )

388 and markup

389 ):

390 # The user did not tell us which TreeBuilder to use,

391 # and we had to guess. Issue a warning.

392 if builder.is_xml:

393 markup_type = "XML"

394 else:

395 markup_type = "HTML"

396

397 # This code adapted from warnings.py so that we get the same line

398 # of code as our warnings.warn() call gets, even if the answer is wrong

399 # (as it may be in a multithreading situation).

400 caller = None

401 try:

402 caller = sys._getframe(1)

403 except ValueError:

404 pass

405 if caller:

406 globals = caller.f_globals

407 line_number = caller.f_lineno

408 else:

409 globals = sys.__dict__

410 line_number = 1

411 filename = globals.get("__file__")

412 if filename:

413 fnl = filename.lower()

414 if fnl.endswith((".pyc", ".pyo")):

415 filename = filename[:-1]

416 if filename:

417 # If there is no filename at all, the user is most likely in a REPL,

418 # and the warning is not necessary.

419 values = dict(

420 filename=filename,

421 line_number=line_number,

422 parser=builder.NAME,

423 markup_type=markup_type,

424 )

425 warnings.warn(

426 GuessedAtParserWarning.MESSAGE % values,

427 GuessedAtParserWarning,

428 stacklevel=2,

429 )

430 else:

431 if kwargs:

432 warnings.warn(

433 "Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`."

434 )

435

436 self.builder = builder

437 self.is_xml = builder.is_xml

438 self.known_xml = self.is_xml

439 self._namespaces = dict()

440 self.parse_only = parse_only

441

442 if hasattr(markup, "read"): # It's a file-type object.

443 markup = cast(io.IOBase, markup).read()

444 elif not isinstance(markup, (bytes, str)) and not hasattr(markup, "__len__"):

445 raise TypeError(

446 f"Incoming markup is of an invalid type: {markup!r}. Markup must be a string, a bytestring, or an open filehandle."

447 )

448 elif isinstance(markup, Sized) and len(markup) <= 256 and (

449 (isinstance(markup, bytes) and b"<" not in markup and b"\n" not in markup)

450 or (isinstance(markup, str) and "<" not in markup and "\n" not in markup)

451 ):

452 # Issue warnings for a couple beginner problems

453 # involving passing non-markup to Beautiful Soup.

454 # Beautiful Soup will still parse the input as markup,

455 # since that is sometimes the intended behavior.

456 if not self._markup_is_url(markup):

457 self._markup_resembles_filename(markup)

458

459 # At this point we know markup is a string or bytestring. If

460 # it was a file-type object, we've read from it.

461 markup = cast(_RawMarkup, markup)

462

463 rejections = []

464 success = False

465 for (

466 self.markup,

467 self.original_encoding,

468 self.declared_html_encoding,

469 self.contains_replacement_characters,

470 ) in self.builder.prepare_markup(

471 markup, from_encoding, exclude_encodings=exclude_encodings

472 ):

473 self.reset()

474 self.builder.initialize_soup(self)

475 try:

476 self._feed()

477 success = True

478 break

479 except ParserRejectedMarkup as e:

480 rejections.append(e)

481 pass

482

483 if not success:

484 other_exceptions = [str(e) for e in rejections]

485 raise ParserRejectedMarkup(

486 "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n "

487 + "\n ".join(other_exceptions)

488 )

489

490 # Clear out the markup and remove the builder's circular

491 # reference to this object.

492 self.markup = None

493 self.builder.soup = None

494

495 def copy_self(self) -> "BeautifulSoup":

496 """Create a new BeautifulSoup object with the same TreeBuilder,

497 but not associated with any markup.

498

499 This is the first step of the deepcopy process.

500 """

501 clone = type(self)("", None, self.builder)

502

503 # Keep track of the encoding of the original document,

504 # since we won't be parsing it again.

505 clone.original_encoding = self.original_encoding

506 return clone

507

508 def __getstate__(self) -> Dict[str, Any]:

509 # Frequently a tree builder can't be pickled.

510 d = dict(self.__dict__)

511 if "builder" in d and d["builder"] is not None and not self.builder.picklable:

512 d["builder"] = type(self.builder)

513 # Store the contents as a Unicode string.

514 d["contents"] = []

515 d["markup"] = self.decode()

516

517 # If _most_recent_element is present, it's a Tag object left

518 # over from initial parse. It might not be picklable and we

519 # don't need it.

520 if "_most_recent_element" in d:

521 del d["_most_recent_element"]

522 return d

523

524 def __setstate__(self, state: Dict[str, Any]) -> None:

525 # If necessary, restore the TreeBuilder by looking it up.

526 self.__dict__ = state

527 if isinstance(self.builder, type):

528 self.builder = self.builder()

529 elif not self.builder:

530 # We don't know which builder was used to build this

531 # parse tree, so use a default we know is always available.

532 self.builder = HTMLParserTreeBuilder()

533 self.builder.soup = self

534 self.reset()

535 self._feed()

536

537 @classmethod

538 @_deprecated(

539 replaced_by="nothing (private method, will be removed)", version="4.13.0"

540 )

541 def _decode_markup(cls, markup: _RawMarkup) -> str:

542 """Ensure `markup` is Unicode so it's safe to send into warnings.warn.

543

544 warnings.warn had this problem back in 2010 but fortunately

545 not anymore. This has not been used for a long time; I just

546 noticed that fact while working on 4.13.0.

547 """

548 if isinstance(markup, bytes):

549 decoded = markup.decode("utf-8", "replace")

550 else:

551 decoded = markup

552 return decoded

553

554 @classmethod

555 def _markup_is_url(cls, markup: _RawMarkup) -> bool:

556 """Error-handling method to raise a warning if incoming markup looks

557 like a URL.

558

559 :param markup: A string of markup.

560 :return: Whether or not the markup resembled a URL

561 closely enough to justify issuing a warning.

562 """

563 problem: bool = False

564 if isinstance(markup, bytes):

565 problem = (

566 any(markup.startswith(prefix) for prefix in (b"http:", b"https:"))

567 and b" " not in markup

568 )

569 elif isinstance(markup, str):

570 problem = (

571 any(markup.startswith(prefix) for prefix in ("http:", "https:"))

572 and " " not in markup

573 )

574 else:

575 return False

576

577 if not problem:

578 return False

579 warnings.warn(

580 MarkupResemblesLocatorWarning.URL_MESSAGE % dict(what="URL"),

581 MarkupResemblesLocatorWarning,

582 stacklevel=3,

583 )

584 return True

585

586 @classmethod

587 def _markup_resembles_filename(cls, markup: _RawMarkup) -> bool:

588 """Error-handling method to issue a warning if incoming markup

589 resembles a filename.

590

591 :param markup: A string of markup.

592 :return: Whether or not the markup resembled a filename

593 closely enough to justify issuing a warning.

594 """

595 markup_b: bytes

596

597 # We're only checking ASCII characters, so rather than write

598 # the same tests twice, convert Unicode to a bytestring and

599 # operate on the bytestring.

600 if isinstance(markup, str):

601 markup_b = markup.encode("utf8")

602 else:

603 markup_b = markup

604

605 # Step 1: does it end with a common textual file extension?

606 filelike = False

607 lower = markup_b.lower()

608 extensions = [b".html", b".htm", b".xml", b".xhtml", b".txt"]

609 if any(lower.endswith(ext) for ext in extensions):

610 filelike = True

611 if not filelike:

612 return False

613

614 # Step 2: it _might_ be a file, but there are a few things

615 # we can look for that aren't very common in filenames.

616

617 # Characters that have special meaning to Unix shells. (< was

618 # excluded before this method was called.)

619 #

620 # Many of these are also reserved characters that cannot

621 # appear in Windows filenames.

622 for byte in markup_b:

623 if byte in b"?*#&;>$|":

624 return False

625

626 # Two consecutive forward slashes (as seen in a URL) or two

627 # consecutive spaces (as seen in fixed-width data).

628 #

629 # (Paths to Windows network shares contain consecutive

630 # backslashes, so checking that doesn't seem as helpful.)

631 if b"//" in markup_b:

632 return False

633 if b" " in markup_b:

634 return False

635

636 # A colon in any position other than position 1 (e.g. after a

637 # Windows drive letter).

638 if markup_b.startswith(b":"):

639 return False

640 colon_i = markup_b.rfind(b":")

641 if colon_i not in (-1, 1):

642 return False

643

644 # Step 3: If it survived all of those checks, it's similar

645 # enough to a file to justify issuing a warning.

646 warnings.warn(

647 MarkupResemblesLocatorWarning.FILENAME_MESSAGE % dict(what="filename"),

648 MarkupResemblesLocatorWarning,

649 stacklevel=3,

650 )

651 return True

652

653 def _feed(self) -> None:

654 """Internal method that parses previously set markup, creating a large

655 number of Tag and NavigableString objects.

656 """

657 # Convert the document to Unicode.

658 self.builder.reset()

659

660 if self.markup is not None:

661 self.builder.feed(self.markup)

662 # Close out any unfinished strings and close all the open tags.

663 self.endData()

664 while (

665 self.currentTag is not None and self.currentTag.name != self.ROOT_TAG_NAME

666 ):

667 self.popTag()

668

669 def reset(self) -> None:

670 """Reset this object to a state as though it had never parsed any

671 markup.

672 """

673 Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)

674 self.hidden = True

675 self.builder.reset()

676 self.current_data = []

677 self.currentTag = None

678 self.tagStack = []

679 self.open_tag_counter = Counter()

680 self.preserve_whitespace_tag_stack = []

681 self.string_container_stack = []

682 self._most_recent_element = None

683 self.pushTag(self)

684

685 def new_tag(

686 self,

687 name: str,

688 namespace: Optional[str] = None,

689 nsprefix: Optional[str] = None,

690 attrs: Optional[_RawAttributeValues] = None,

691 sourceline: Optional[int] = None,

692 sourcepos: Optional[int] = None,

693 string: Optional[str] = None,

694 **kwattrs: _RawAttributeValue,

695 ) -> Tag:

696 """Create a new Tag associated with this BeautifulSoup object.

697

698 :param name: The name of the new Tag.

699 :param namespace: The URI of the new Tag's XML namespace, if any.

700 :param prefix: The prefix for the new Tag's XML namespace, if any.

701 :param attrs: A dictionary of this Tag's attribute values; can

702 be used instead of ``kwattrs`` for attributes like 'class'

703 that are reserved words in Python.

704 :param sourceline: The line number where this tag was

705 (purportedly) found in its source document.

706 :param sourcepos: The character position within ``sourceline`` where this

707 tag was (purportedly) found.

708 :param string: String content for the new Tag, if any.

709 :param kwattrs: Keyword arguments for the new Tag's attribute values.

710

711 """

712 attr_container = self.builder.attribute_dict_class(**kwattrs)

713 if attrs is not None:

714 attr_container.update(attrs)

715 tag_class = self.element_classes.get(Tag, Tag)

716

717 # Assume that this is either Tag or a subclass of Tag. If not,

718 # the user brought type-unsafety upon themselves.

719 tag_class = cast(Type[Tag], tag_class)

720 tag = tag_class(

721 None,

722 self.builder,

723 name,

724 namespace,

725 nsprefix,

726 attr_container,

727 sourceline=sourceline,

728 sourcepos=sourcepos,

729 )

730

731 if string is not None:

732 tag.string = string

733 return tag

734

735 def string_container(

736 self, base_class: Optional[Type[NavigableString]] = None

737 ) -> Type[NavigableString]:

738 """Find the class that should be instantiated to hold a given kind of

739 string.

740

741 This may be a built-in Beautiful Soup class or a custom class passed

742 in to the BeautifulSoup constructor.

743 """

744 container = base_class or NavigableString

745

746 # The user may want us to use some other class (hopefully a

747 # custom subclass) instead of the one we'd use normally.

748 container = cast(

749 Type[NavigableString], self.element_classes.get(container, container)

750 )

751

752 # On top of that, we may be inside a tag that needs a special

753 # container class.

754 if self.string_container_stack and container is NavigableString:

755 container = self.builder.string_containers.get(

756 self.string_container_stack[-1].name, container

757 )

758 return container

759

760 def new_string(

761 self, s: str, subclass: Optional[Type[NavigableString]] = None

762 ) -> NavigableString:

763 """Create a new `NavigableString` associated with this `BeautifulSoup`

764 object.

765

766 :param s: The string content of the `NavigableString`

767 :param subclass: The subclass of `NavigableString`, if any, to

768 use. If a document is being processed, an appropriate

769 subclass for the current location in the document will

770 be determined automatically.

771 """

772 container = self.string_container(subclass)

773 return container(s)

774

775 def insert_before(self, *args: _InsertableElement) -> List[PageElement]:

776 """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement

777 it because there is nothing before or after it in the parse tree.

778 """

779 raise NotImplementedError(

780 "BeautifulSoup objects don't support insert_before()."

781 )

782

783 def insert_after(self, *args: _InsertableElement) -> List[PageElement]:

784 """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement

785 it because there is nothing before or after it in the parse tree.

786 """

787 raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

788

789 def popTag(self) -> Optional[Tag]:

790 """Internal method called by _popToTag when a tag is closed.

791

792 :meta private:

793 """

794 if not self.tagStack:

795 # Nothing to pop. This shouldn't happen.

796 return None

797 tag = self.tagStack.pop()

798 if tag.name in self.open_tag_counter:

799 self.open_tag_counter[tag.name] -= 1

800 if (

801 self.preserve_whitespace_tag_stack

802 and tag == self.preserve_whitespace_tag_stack[-1]

803 ):

804 self.preserve_whitespace_tag_stack.pop()

805 if self.string_container_stack and tag == self.string_container_stack[-1]:

806 self.string_container_stack.pop()

807 # print("Pop", tag.name)

808 if self.tagStack:

809 self.currentTag = self.tagStack[-1]

810 return self.currentTag

811

812 def pushTag(self, tag: Tag) -> None:

813 """Internal method called by handle_starttag when a tag is opened.

814

815 :meta private:

816 """

817 # print("Push", tag.name)

818 if self.currentTag is not None:

819 self.currentTag.contents.append(tag)

820 self.tagStack.append(tag)

821 self.currentTag = self.tagStack[-1]

822 if tag.name != self.ROOT_TAG_NAME:

823 self.open_tag_counter[tag.name] += 1

824 if tag.name in self.builder.preserve_whitespace_tags:

825 self.preserve_whitespace_tag_stack.append(tag)

826 if tag.name in self.builder.string_containers:

827 self.string_container_stack.append(tag)

828

829 def endData(self, containerClass: Optional[Type[NavigableString]] = None) -> None:

830 """Method called by the TreeBuilder when the end of a data segment

831 occurs.

832

833 :param containerClass: The class to use when incorporating the

834 data segment into the parse tree.

835

836 :meta private:

837 """

838 if self.current_data:

839 current_data = "".join(self.current_data)

840 # If whitespace is not preserved, and this string contains

841 # nothing but ASCII spaces, replace it with a single space

842 # or newline.

843 if not self.preserve_whitespace_tag_stack:

844 strippable = True

845 for i in current_data:

846 if i not in self.ASCII_SPACES:

847 strippable = False

848 break

849 if strippable:

850 if "\n" in current_data:

851 current_data = "\n"

852 else:

853 current_data = " "

854

855 # Reset the data collector.

856 self.current_data = []

857

858 # Should we add this string to the tree at all?

859 if (

860 self.parse_only

861 and len(self.tagStack) <= 1

862 and (not self.parse_only.allow_string_creation(current_data))

863 ):

864 return

865

866 containerClass = self.string_container(containerClass)

867 o = containerClass(current_data)

868 self.object_was_parsed(o)

869

870 def object_was_parsed(

871 self,

872 o: PageElement,

873 parent: Optional[Tag] = None,

874 most_recent_element: Optional[PageElement] = None,

875 ) -> None:

876 """Method called by the TreeBuilder to integrate an object into the

877 parse tree.

878

879 :meta private:

880 """

881 if parent is None:

882 parent = self.currentTag

883 assert parent is not None

884 previous_element: Optional[PageElement]

885 if most_recent_element is not None:

886 previous_element = most_recent_element

887 else:

888 previous_element = self._most_recent_element

889

890 next_element = previous_sibling = next_sibling = None

891 if isinstance(o, Tag):

892 next_element = o.next_element

893 next_sibling = o.next_sibling

894 previous_sibling = o.previous_sibling

895 if previous_element is None:

896 previous_element = o.previous_element

897

898 fix = parent.next_element is not None

899

900 o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

901

902 self._most_recent_element = o

903 parent.contents.append(o)

904

905 # Check if we are inserting into an already parsed node.

906 if fix:

907 self._linkage_fixer(parent)

908

def _linkage_fixer(self, el: Tag) -> None:
    """Repair the element links around the last child of ``el``.

    Operates on ``el.contents[-1]`` and rewrites the ``next_element``,
    ``previous_element`` and sibling pointers of that child, of its
    last descendant, and of the next sibling found at or above ``el``.

    :param el: The tag whose last child needs its links repaired. It
        must have at least one entry in ``contents``.
    """
    last_child = el.contents[-1]

    if last_child is el.contents[0] and el.parent is not None:
        # An only child directly follows its parent in document order.
        el.next_element = last_child
        # Whatever used to lead into the child no longer does so.
        stale_predecessor = last_child.previous_element
        if not (stale_predecessor is None or stale_predecessor is el):
            stale_predecessor.next_element = None
        # The first child points back at the parent and has no
        # previous sibling.
        last_child.previous_element = el
        last_child.previous_sibling = None

    # The child is the last entry in contents, so nothing follows it
    # among its siblings.
    last_child.next_sibling = None

    # When the child is a tag with contents of its own, the end of the
    # fragment is that tag's last descendant rather than the tag itself.
    tail: PageElement = last_child
    if isinstance(last_child, Tag) and last_child.contents:
        # _last_descendant is typed as returning Optional[PageElement],
        # but the value can't be None here, because the child is a Tag
        # which we know has contents.
        tail = cast(PageElement, last_child._last_descendant(False))

    # Start from a clean slate; the loop below fills in next_element
    # only if some element actually follows the fragment.
    tail.next_element = None
    tail.next_sibling = None

    # Walk up from el until a tag with a next sibling is found; that
    # sibling is what follows the fragment in document order.
    ancestor: Optional[Tag] = el
    while ancestor is not None:
        following = ancestor.next_sibling
        if following is not None:
            tail.next_element = following
            # NOTE(review): this records the last child, not the last
            # descendant, as the sibling's previous element -- confirm
            # that this is intended.
            following.previous_element = last_child
            break
        ancestor = ancestor.parent

952

def _popToTag(
    self, name: str, nsprefix: Optional[str] = None, inclusivePop: bool = True
) -> Optional[Tag]:
    """Unwind the tag stack back to the most recent tag named ``name``.

    By default the matching tag itself is popped as well. If there
    are no open tags with the given name, nothing will be popped.

    :param name: Pop up to the most recent tag with this name.
    :param nsprefix: The namespace prefix that goes with `name`.
    :param inclusivePop: If this is false, pops the tag stack up
        to but *not* including the most recent instance of the
        given tag.
    :return: The result of the last ``popTag`` call made here, or
        None if nothing was popped.

    :meta private:
    """
    # print("Popping to %s" % name)
    if name == self.ROOT_TAG_NAME:
        # The BeautifulSoup object itself can never be popped.
        return None

    last_popped = None

    # Walk from the top of the stack downwards. Index 0 is never
    # visited, so the bottom entry of the stack stays in place.
    depth = len(self.tagStack) - 1
    while depth > 0:
        # Give up as soon as the counter reports no open tag with
        # this name.
        if not self.open_tag_counter.get(name):
            break
        candidate = self.tagStack[depth]
        is_target = name == candidate.name and nsprefix == candidate.prefix
        if is_target and not inclusivePop:
            # Found it, but the caller wants it left on the stack.
            break
        last_popped = self.popTag()
        if is_target:
            break
        depth -= 1

    return last_popped

989

def handle_starttag(
    self,
    name: str,
    namespace: Optional[str],
    nsprefix: Optional[str],
    attrs: _RawAttributeValues,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    namespaces: Optional[Dict[str, str]] = None,
) -> Optional[Tag]:
    """Create a tag for a start tag reported by the tree builder.

    The new tag is linked after the most recently parsed element,
    becomes the most recent element itself, and is pushed onto the
    tag stack.

    :param name: Name of the tag.
    :param namespace: Namespace of the tag; handed to the tag
        constructor unchanged.
    :param nsprefix: Namespace prefix for the tag.
    :param attrs: A dictionary of attribute values. Note that
        attribute values are expected to be simple strings; processing
        of multi-valued attributes such as "class" comes later.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param namespaces: A dictionary of all namespace prefix mappings
        currently in scope in the document.

    If this method returns None, the tag was rejected by an active
    `ElementFilter`. You should proceed as if the tag had not occurred
    in the document. For instance, if this was a self-closing tag,
    don't call handle_endtag.

    :meta private:
    """
    # print("Start tag %s: %s" % (name, attrs))
    self.endData()

    # The filter is only consulted while the tag stack holds at most
    # one entry.
    element_filter = self.parse_only
    if element_filter and len(self.tagStack) <= 1:
        if not element_filter.allow_tag_creation(nsprefix, name, attrs):
            return None

    # Assume that the registered class is either Tag or a subclass of
    # Tag. If not, the user brought type-unsafety upon themselves.
    tag_class = cast(Type[Tag], self.element_classes.get(Tag, Tag))
    tag = tag_class(
        self,
        self.builder,
        name,
        namespace,
        nsprefix,
        attrs,
        self.currentTag,
        self._most_recent_element,
        sourceline=sourceline,
        sourcepos=sourcepos,
        namespaces=namespaces,
    )
    if tag is None:
        return None

    # Chain the new tag after whatever was parsed immediately before it.
    predecessor = self._most_recent_element
    if predecessor is not None:
        predecessor.next_element = tag
    self._most_recent_element = tag
    self.pushTag(tag)
    return tag

1055

def handle_endtag(self, name: str, nsprefix: Optional[str] = None) -> None:
    """Process an end tag reported by the tree builder.

    Calls ``endData`` first, then ``_popToTag`` with the given name
    and prefix. The value returned by ``_popToTag`` is discarded.

    :param name: Name of the tag.
    :param nsprefix: Namespace prefix for the tag.

    :meta private:
    """
    # print("End tag: " + name)
    self.endData()
    self._popToTag(name, nsprefix)

1067

def handle_data(self, data: str) -> None:
    """Record a chunk of textual data reported by the tree builder.

    The chunk is appended to ``self.current_data``; nothing else is
    done with it here.

    :param data: The chunk of text to record.

    :meta private:
    """
    pending_chunks = self.current_data
    pending_chunks.append(data)

1075

1076 def decode(

1077 self,

1078 indent_level: Optional[int] = None,

1079 eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,

1080 formatter: Union[Formatter, str] = "minimal",

1081 iterator: Optional[Iterator[PageElement]] = None,

1082 **kwargs: Any,

1083 ) -> str:

1084 """Returns a string representation of the parse tree

1085 as a full HTML or XML document.

1086

1087 :param indent_level: Each line of the rendering will be

1088 indented this many levels. (The ``formatter`` decides what a

1089 'level' means, in terms of spaces or other characters

1090 output.) This is used internally in recursive calls while

1091 pretty-printing.

1092 :param eventual_encoding: The encoding of the final document.

1093 If this is None, the document will be a Unicode string.

1094 :param formatter: Either a `Formatter` object, or a string naming one of

1095 the standard formatters.

1096 :param iterator: The iterator to use when navigating over the

1097 parse tree. This is only used by `Tag.decode_contents` and

1098 you probably won't need to use it.

1099 """

1100 if self.is_xml:

1101 # Print the XML declaration

1102 encoding_part = ""

1103 declared_encoding: Optional[str] = eventual_encoding

1104 if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:

1105 # This is a special Python encoding; it can't actually

1106 # go into an XML document because it means nothing

1107 # outside of Python.

1108 declared_encoding = None

1109 if declared_encoding is not None:

1110 encoding_part = ' encoding="%s"' % declared_encoding

1111 prefix = '<?xml version="1.0"%s?>\n' % encoding_part

1112 else:

1113 prefix = ""

1114

1115 # Prior to 4.13.0, the first argument to this method was a

1116 # bool called pretty_print, which gave the method a different

1117 # signature from its superclass implementation, Tag.decode.

1118 #

1119 # The signatures of the two methods now match, but just in

1120 # case someone is still passing a boolean in as the first

1121 # argument to this method (or a keyword argument with the old

1122 # name), we can handle it and put out a DeprecationWarning.

1123 warning: Optional[str] = None

1124 pretty_print: Optional[bool] = None

1125 if isinstance(indent_level, bool):

1126 if indent_level is True:

1127 indent_level = 0

1128 elif indent_level is False:

1129 indent_level = None

1130 warning = f"As of 4.13.0, the first argument to BeautifulSoup.decode has been changed from bool to int, to match Tag.decode. Pass in a value of {indent_level} instead."

1131 else:

1132 pretty_print = kwargs.pop("pretty_print", None)

1133 assert not kwargs

1134 if pretty_print is not None:

1135 if pretty_print is True:

1136 indent_level = 0

1137 elif pretty_print is False:

1138 indent_level = None

1139 warning = f"As of 4.13.0, the pretty_print argument to BeautifulSoup.decode has been removed, to match Tag.decode. Pass in a value of indent_level={indent_level} instead."

1140

1141 if warning:

1142 warnings.warn(warning, DeprecationWarning, stacklevel=2)

1143 elif indent_level is False or pretty_print is False:

1144 indent_level = None

1145 return prefix + super(BeautifulSoup, self).decode(