Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/__init_

1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".

3http://www.crummy.com/software/BeautifulSoup/

5Beautiful Soup uses a pluggable XML or HTML parser to parse a

6(possibly invalid) document into a tree representation. Beautiful Soup

7provides methods and Pythonic idioms that make it easy to navigate,

8search, and modify the parse tree.

10Beautiful Soup works with Python 3.7 and up. It works better if lxml

11and/or html5lib is installed, but they are not required.

13For more than you ever wanted to know about Beautiful Soup, see the

14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/

15"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.15.0"

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

# Public names exported by ``from bs4 import *``. Every name appears
# exactly once; warning classes are grouped under "Warnings" below.
__all__ = [
    "BeautifulSoup",
    "Comment",
    "Declaration",
    "ProcessingInstruction",
    "ResultSet",
    "CSS",
    "Script",
    "Stylesheet",
    "Tag",
    "TemplateString",
    "ElementFilter",
    "UnicodeDammit",
    "CData",
    "Doctype",
    # Exceptions
    "FeatureNotFound",
    "ParserRejectedMarkup",
    "StopParsing",
    # Warnings
    "AttributeResemblesVariableWarning",
    "GuessedAtParserWarning",
    "MarkupResemblesLocatorWarning",
    "UnusualUsageWarning",
    "XMLParsedAsHTMLWarning",
]

53from collections import Counter

54import io

55import sys

56import warnings

58# The very first thing we do is give a useful error if someone is

59# running this code under Python 2.

# Refuse to import under a Python 2 interpreter, and tell the user
# which release they need instead.
if sys.version_info[0] < 3:
    raise ImportError(
        "You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3."
    )

65from .builder import (

66 builder_registry,

67 TreeBuilder,

68)

69from .builder._htmlparser import HTMLParserTreeBuilder

70from .dammit import UnicodeDammit

71from .css import CSS

72from ._deprecation import (

73 _deprecated,

74)

75from .element import (

76 CData,

77 Comment,

78 DEFAULT_OUTPUT_ENCODING,

79 Declaration,

80 Doctype,

81 NavigableString,

82 PageElement,

83 ProcessingInstruction,

84 PYTHON_SPECIFIC_ENCODINGS,

85 ResultSet,

86 Script,

87 Stylesheet,

88 Tag,

89 TemplateString,

90)

91from .formatter import Formatter

92from .filter import (

93 ElementFilter,

94 SoupStrainer,

95)

96from typing import (

97 Any,

98 cast,

99 Counter as CounterType,

100 Dict,

101 Iterator,

102 List,

103 Sequence,

104 Sized,

105 Optional,

106 Type,

107 Union,

108)

109

110from bs4._typing import (

111 _Encoding,

112 _Encodings,

113 _IncomingMarkup,

114 _InsertableElement,

115 _RawAttributeValue,

116 _RawAttributeValues,

117 _RawMarkup,

118)

119

120# Import all warnings and exceptions into the main package.

121from bs4.exceptions import (

122 FeatureNotFound,

123 ParserRejectedMarkup,

124 StopParsing,

125)

126from bs4._warnings import (

127 AttributeResemblesVariableWarning,

128 GuessedAtParserWarning,

129 MarkupResemblesLocatorWarning,

130 UnusualUsageWarning,

131 XMLParsedAsHTMLWarning,

132)

133

134

135class BeautifulSoup(Tag):

136 """A data structure representing a parsed HTML or XML document.

137

138 Most of the methods you'll call on a BeautifulSoup object are inherited from

139 PageElement or Tag.

140

141 Internally, this class defines the basic interface called by the

142 tree builders when converting an HTML/XML document into a data

143 structure. The interface abstracts away the differences between

144 parsers. To write a new tree builder, you'll need to understand

145 these methods as a whole.

146

147 These methods will be called by the BeautifulSoup constructor:

148 * reset()

149 * feed(markup)

150

151 The tree builder may call these methods from its feed() implementation:

152 * handle_starttag(name, attrs) # See note about return value

153 * handle_endtag(name)

154 * handle_data(data) # Appends to the current data node

155 * endData(containerClass) # Ends the current data node

156

157 No matter how complicated the underlying parser is, you should be

158 able to build a tree using 'start tag' events, 'end tag' events,

159 'data' events, and "done with data" events.

160

161 If you encounter an empty-element tag (aka a self-closing tag,

162 like HTML's <br> tag), call handle_starttag and then

163 handle_endtag.

164 """

165

166 #: Since `BeautifulSoup` subclasses `Tag`, it's possible to treat it as

167 #: a `Tag` with a `Tag.name`. However, this name makes it clear the

168 #: `BeautifulSoup` object isn't a real markup tag.

169 ROOT_TAG_NAME: str = "[document]"

170

171 #: If the end-user gives no indication which tree builder they

172 #: want, look for one with these features.

173 DEFAULT_BUILDER_FEATURES: Sequence[str] = ["html", "fast"]

174

175 #: A string containing all ASCII whitespace characters, used

176 #: during parsing to detect data chunks that seem 'empty'.

177 ASCII_SPACES: str = "\x20\x0a\x09\x0c\x0d"

178

179 # FUTURE PYTHON:

180 element_classes: Dict[Type[PageElement], Type[PageElement]] #: :meta private:

181 builder: TreeBuilder #: :meta private:

182 is_xml: bool

183 known_xml: Optional[bool]

184 parse_only: Optional[SoupStrainer] #: :meta private:

185

186 # These members are only used while parsing markup.

187 markup: Optional[_RawMarkup] #: :meta private:

188 current_data: List[str] #: :meta private:

189 currentTag: Optional[Tag] #: :meta private:

190 tagStack: List[Tag] #: :meta private:

191 open_tag_counter: CounterType[str] #: :meta private:

192 preserve_whitespace_tag_stack: List[Tag] #: :meta private:

193 string_container_stack: List[Tag] #: :meta private:

194 _most_recent_element: Optional[PageElement] #: :meta private:

195

196 #: Beautiful Soup's best guess as to the character encoding of the

197 #: original document.

198 original_encoding: Optional[_Encoding]

199

200 #: The character encoding, if any, that was explicitly defined

201 #: in the original document. This may or may not match

202 #: `BeautifulSoup.original_encoding`.

203 declared_html_encoding: Optional[_Encoding]

204

205 #: This is True if the markup that was parsed contains

206 #: U+FFFD REPLACEMENT_CHARACTER characters which were not present

207 #: in the original markup. These mark character sequences that

208 #: could not be represented in Unicode.

209 contains_replacement_characters: bool

210

def __init__(
    self,
    markup: _IncomingMarkup = "",
    features: Optional[Union[str, Sequence[str]]] = None,
    builder: Optional[Union[TreeBuilder, Type[TreeBuilder]]] = None,
    parse_only: Optional[SoupStrainer] = None,
    from_encoding: Optional[_Encoding] = None,
    exclude_encodings: Optional[_Encodings] = None,
    element_classes: Optional[Dict[Type[PageElement], Type[PageElement]]] = None,
    **kwargs: Any,
):
    """Constructor.

    Picks (or accepts) a TreeBuilder, normalizes the incoming markup
    to a string or bytestring, and then parses it, trying each
    candidate produced by the builder's ``prepare_markup()`` until one
    is accepted.

    :param markup: A string or a file-like object representing
     markup to be parsed.

    :param features: Desirable features of the parser to be
     used. This may be the name of a specific parser ("lxml",
     "lxml-xml", "html.parser", or "html5lib") or it may be the
     type of markup to be used ("html", "html5", "xml"). It's
     recommended that you name a specific parser, so that
     Beautiful Soup gives you the same results across platforms
     and virtual environments.

    :param builder: A TreeBuilder subclass to instantiate (or
     instance to use) instead of looking one up based on
     `features`. You only need to use this if you've implemented a
     custom TreeBuilder.

    :param parse_only: A SoupStrainer. Only parts of the document
     matching the SoupStrainer will be considered. This is useful
     when parsing part of a document that would otherwise be too
     large to fit into memory.

    :param from_encoding: A string indicating the encoding of the
     document to be parsed. Pass this in if Beautiful Soup is
     guessing wrongly about the document's encoding.

    :param exclude_encodings: A list of strings indicating
     encodings known to be wrong. Pass this in if you don't know
     the document's encoding but you know Beautiful Soup's guess is
     wrong.

    :param element_classes: A dictionary mapping BeautifulSoup
     classes like Tag and NavigableString, to other classes you'd
     like to be instantiated instead as the parse tree is
     built. This is useful for subclassing Tag or NavigableString
     to modify default behavior.

    :param kwargs: For backwards compatibility purposes, the
     constructor accepts certain keyword arguments used in
     Beautiful Soup 3. None of these arguments do anything in
     Beautiful Soup 4; they will result in a warning and then be
     ignored.

     Apart from this, any keyword arguments passed into the
     BeautifulSoup constructor are propagated to the TreeBuilder
     constructor. This makes it possible to configure a
     TreeBuilder by passing in arguments, not just by saying which
     one to use.

    :raises FeatureNotFound: If no registered tree builder offers
     the requested features.
    :raises TypeError: If `markup` is not a string, a bytestring, a
     file-like object, or something with a length.
    :raises ParserRejectedMarkup: If the builder rejected every
     candidate version of the markup.
    """
    # Beautiful Soup 3 arguments with no BS4 equivalent: drop each one
    # from kwargs (so it never reaches the TreeBuilder) and warn.
    if "convertEntities" in kwargs:
        del kwargs["convertEntities"]
        warnings.warn(
            "BS4 does not respect the convertEntities argument to the "
            "BeautifulSoup constructor. Entities are always converted "
            "to Unicode characters."
        )

    if "markupMassage" in kwargs:
        del kwargs["markupMassage"]
        warnings.warn(
            "BS4 does not respect the markupMassage argument to the "
            "BeautifulSoup constructor. The tree builder is responsible "
            "for any necessary markup massage."
        )

    if "smartQuotesTo" in kwargs:
        del kwargs["smartQuotesTo"]
        warnings.warn(
            "BS4 does not respect the smartQuotesTo argument to the "
            "BeautifulSoup constructor. Smart quotes are always converted "
            "to Unicode characters."
        )

    if "selfClosingTags" in kwargs:
        del kwargs["selfClosingTags"]
        warnings.warn(
            "Beautiful Soup 4 does not respect the selfClosingTags argument to the "
            "BeautifulSoup constructor. The tree builder is responsible "
            "for understanding self-closing tags."
        )

    if "isHTML" in kwargs:
        del kwargs["isHTML"]
        warnings.warn(
            "Beautiful Soup 4 does not respect the isHTML argument to the "
            "BeautifulSoup constructor. Suggest you use "
            "features='lxml' for HTML and features='lxml-xml' for "
            "XML."
        )

    def deprecated_argument(old_name: str, new_name: str) -> Optional[Any]:
        # Pop a renamed Beautiful Soup 3 keyword argument out of kwargs,
        # issuing a DeprecationWarning. Returns None when the old name
        # was not supplied.
        if old_name in kwargs:
            warnings.warn(
                'The "%s" argument to the BeautifulSoup constructor '
                'was renamed to "%s" in Beautiful Soup 4.0.0'
                % (old_name, new_name),
                DeprecationWarning,
                stacklevel=3,
            )
            return kwargs.pop(old_name)
        return None

    # NOTE(review): `or` short-circuits, so when the new-style argument
    # is truthy the old-style name is never popped from kwargs and
    # would be forwarded to the TreeBuilder constructor below --
    # confirm whether that is intended.
    parse_only = parse_only or deprecated_argument("parseOnlyThese", "parse_only")
    if parse_only is not None:
        # Issue a warning if we can tell in advance that
        # parse_only will exclude the entire tree.
        if parse_only.excludes_everything:
            warnings.warn(
                f"The given value for parse_only will exclude everything: {parse_only}",
                UserWarning,
                stacklevel=3,
            )

    from_encoding = from_encoding or deprecated_argument(
        "fromEncoding", "from_encoding"
    )

    # An encoding hint is meaningless for markup that is already a str.
    if from_encoding and isinstance(markup, str):
        warnings.warn(
            "You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored."
        )
        from_encoding = None

    self.element_classes = element_classes or dict()

    # We need this information to track whether or not the builder
    # was specified well enough that we can omit the 'you need to
    # specify a parser' warning.
    original_builder = builder
    original_features = features

    builder_class: Optional[Type[TreeBuilder]] = None
    if isinstance(builder, type):
        # A builder class was passed in; it needs to be instantiated.
        builder_class = builder
        builder = None
    elif builder is None:
        # No builder at all: look one up in the registry by feature,
        # falling back to DEFAULT_BUILDER_FEATURES.
        if isinstance(features, str):
            features = [features]
        if features is None or len(features) == 0:
            features = self.DEFAULT_BUILDER_FEATURES
        possible_builder_class = builder_registry.lookup(*features)
        if possible_builder_class is None:
            raise FeatureNotFound(
                "Couldn't find a tree builder with the features you "
                "requested: %s. Do you need to install a parser library?"
                % ",".join(features)
            )
        builder_class = possible_builder_class

    # At this point either we have a TreeBuilder instance in
    # builder, or we have a builder_class that we can instantiate
    # with the remaining **kwargs.
    if builder is None:
        assert builder_class is not None
        builder = builder_class(**kwargs)
        if (
            not original_builder
            and not (
                original_features == builder.NAME
                or (
                    isinstance(original_features, str)
                    and original_features in builder.ALTERNATE_NAMES
                )
            )
            and markup
        ):
            # The user did not tell us which TreeBuilder to use,
            # and we had to guess. Issue a warning.
            if builder.is_xml:
                markup_type = "XML"
            else:
                markup_type = "HTML"

            # This code adapted from warnings.py so that we get the same line
            # of code as our warnings.warn() call gets, even if the answer is wrong
            # (as it may be in a multithreading situation).
            caller = None
            try:
                caller = sys._getframe(1)
            except ValueError:
                pass
            if caller:
                globals = caller.f_globals
                line_number = caller.f_lineno
            else:
                globals = sys.__dict__
                line_number = 1
            filename = globals.get("__file__")
            if filename:
                # Report the source file rather than compiled bytecode:
                # strip the trailing "c"/"o" from ".pyc"/".pyo".
                fnl = filename.lower()
                if fnl.endswith((".pyc", ".pyo")):
                    filename = filename[:-1]
            if filename:
                # If there is no filename at all, the user is most likely in a REPL,
                # and the warning is not necessary.
                values = dict(
                    filename=filename,
                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type,
                )
                warnings.warn(
                    GuessedAtParserWarning.MESSAGE % values,
                    GuessedAtParserWarning,
                    stacklevel=2,
                )
    else:
        # A ready-made TreeBuilder instance was supplied, so there is
        # no constructor left to receive the extra keyword arguments.
        if kwargs:
            warnings.warn(
                "Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`."
            )

    self.builder = builder
    self.is_xml = builder.is_xml
    self.known_xml = self.is_xml
    self._namespaces = dict()
    self.parse_only = parse_only

    if hasattr(markup, "read"):  # It's a file-type object.
        markup = cast(io.IOBase, markup).read()
    elif not isinstance(markup, (bytes, str)) and not hasattr(markup, "__len__"):
        raise TypeError(
            f"Incoming markup is of an invalid type: {markup!r}. Markup must be a string, a bytestring, or an open filehandle."
        )
    elif isinstance(markup, Sized) and len(markup) <= 256 and (
        (isinstance(markup, bytes) and b"<" not in markup and b"\n" not in markup)
        or (isinstance(markup, str) and "<" not in markup and "\n" not in markup)
    ):
        # Issue warnings for a couple beginner problems
        # involving passing non-markup to Beautiful Soup.
        # Beautiful Soup will still parse the input as markup,
        # since that is sometimes the intended behavior.
        if not self._markup_is_url(markup):
            self._markup_resembles_filename(markup)

    # At this point we know markup is a string or bytestring. If
    # it was a file-type object, we've read from it.
    markup = cast(_RawMarkup, markup)

    # Try each candidate yielded by prepare_markup() in turn. The loop
    # targets are instance attributes, so each candidate's details are
    # stored on self before the parse attempt.
    rejections = []
    success = False
    for (
        self.markup,
        self.original_encoding,
        self.declared_html_encoding,
        self.contains_replacement_characters,
    ) in self.builder.prepare_markup(
        markup, from_encoding, exclude_encodings=exclude_encodings
    ):
        self.reset()
        self.builder.initialize_soup(self)
        try:
            self._feed()
            success = True
            break
        except ParserRejectedMarkup as e:
            # Remember why this candidate failed and move on to the next.
            rejections.append(e)
            pass

    if not success:
        other_exceptions = [str(e) for e in rejections]
        raise ParserRejectedMarkup(
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n "
            + "\n ".join(other_exceptions)
        )

    # Clear out the markup and remove the builder's circular
    # reference to this object.
    self.markup = None
    self.builder.soup = None

494

def copy_self(self) -> "BeautifulSoup":
    """Build an empty object of this class that shares this object's
    TreeBuilder.

    The new object is constructed from empty markup; only the builder
    and the recorded original encoding are carried over. This is the
    first step of the deepcopy process.

    :return: A new object of the same class as this one.
    """
    soup_class = type(self)
    duplicate = soup_class("", None, self.builder)

    # The original document is not parsed again, so the encoding that
    # was detected for it has to be copied across by hand.
    duplicate.original_encoding = self.original_encoding
    return duplicate

507

def __getstate__(self) -> Dict[str, Any]:
    """Return a picklable snapshot of this object's state.

    The parse tree is not stored directly: ``contents`` is emptied and
    the document is stored as a Unicode string under ``markup``.

    :return: A copy of the instance dictionary, adjusted for pickling.
    """
    state = dict(self.__dict__)

    # Frequently a tree builder can't be pickled; in that case store
    # its class instead of the instance.
    if state.get("builder") is not None and not self.builder.picklable:
        state["builder"] = type(self.builder)

    # Store the contents as a Unicode string.
    state["contents"] = []
    state["markup"] = self.decode()

    # If _most_recent_element is present, it's a Tag object left over
    # from the initial parse. It might not be picklable and it isn't
    # needed.
    state.pop("_most_recent_element", None)
    return state

523

def __setstate__(self, state: Dict[str, Any]) -> None:
    """Restore this object from pickled state and re-parse the stored
    markup.

    :param state: The dictionary that becomes this object's
     ``__dict__``.
    """
    self.__dict__ = state

    # If necessary, restore the TreeBuilder.
    stored_builder = self.builder
    if isinstance(stored_builder, type):
        # Only the builder's class was stored; instantiate it.
        self.builder = stored_builder()
    elif not stored_builder:
        # We don't know which builder was used to build this
        # parse tree, so use a default we know is always available.
        self.builder = HTMLParserTreeBuilder()

    self.builder.soup = self
    self.reset()
    self._feed()

536

@property
def _is_root(self):
    """Always True: a BeautifulSoup object is the root of its parse
    tree. Used by the _root_object internal property.
    """
    return True

541

@classmethod
@_deprecated(
    replaced_by="nothing (private method, will be removed)", version="4.13.0"
)
def _decode_markup(cls, markup: _RawMarkup) -> str:
    """Return `markup` as Unicode so it's safe to send into warnings.warn.

    warnings.warn had this problem back in 2010 but fortunately not
    anymore. This method has not been used for a long time; that was
    only noticed while working on 4.13.0.

    :param markup: A string or bytestring.
    :return: `markup` unchanged if it is not a bytestring; otherwise
     the bytes decoded as UTF-8, with undecodable sequences replaced.
    """
    if not isinstance(markup, bytes):
        return markup
    return markup.decode("utf-8", "replace")

558

@classmethod
def _markup_is_url(cls, markup: _RawMarkup) -> bool:
    """Issue a warning if the incoming markup looks like a URL.

    Markup counts as URL-like when it starts with ``http:`` or
    ``https:`` and contains no space character.

    :param markup: A string of markup.
    :return: Whether or not the markup resembled a URL
     closely enough to justify issuing a warning.
    """
    # Choose literals of the same type as the markup so the same test
    # serves both bytestrings and strings.
    if isinstance(markup, bytes):
        schemes, space = (b"http:", b"https:"), b" "
    elif isinstance(markup, str):
        schemes, space = ("http:", "https:"), " "
    else:
        return False

    if not markup.startswith(schemes) or space in markup:
        return False

    warnings.warn(
        MarkupResemblesLocatorWarning.URL_MESSAGE % dict(what="URL"),
        MarkupResemblesLocatorWarning,
        stacklevel=3,
    )
    return True

590

@classmethod
def _markup_resembles_filename(cls, markup: _RawMarkup) -> bool:
    """Issue a warning if the incoming markup resembles a filename.

    :param markup: A string of markup.
    :return: Whether or not the markup resembled a filename
     closely enough to justify issuing a warning.
    """
    # We're only checking ASCII characters, so rather than write
    # the same tests twice, convert Unicode to a bytestring and
    # operate on the bytestring.
    markup_b: bytes
    if isinstance(markup, str):
        markup_b = markup.encode("utf8")
    else:
        markup_b = markup

    # Step 1: does it end with a common textual file extension?
    extensions = (b".html", b".htm", b".xml", b".xhtml", b".txt")
    if not markup_b.lower().endswith(extensions):
        return False

    # Step 2: it _might_ be a file, but there are a few things
    # we can look for that aren't very common in filenames.

    # Characters that have special meaning to Unix shells. (< was
    # excluded before this method was called.)
    #
    # Many of these are also reserved characters that cannot
    # appear in Windows filenames.
    if any(byte in b"?*#&;>$|" for byte in markup_b):
        return False

    # Two consecutive forward slashes (as seen in a URL) or two
    # consecutive spaces (as seen in fixed-width data). A single
    # space is not a disqualifier, since ordinary filenames often
    # contain one.
    #
    # (Paths to Windows network shares contain consecutive
    # backslashes, so checking that doesn't seem as helpful.)
    if b"//" in markup_b:
        return False
    if b"  " in markup_b:
        return False

    # A colon in any position other than position 1 (e.g. after a
    # Windows drive letter).
    if markup_b.startswith(b":"):
        return False
    colon_i = markup_b.rfind(b":")
    if colon_i not in (-1, 1):
        return False

    # Step 3: If it survived all of those checks, it's similar
    # enough to a file to justify issuing a warning.
    warnings.warn(
        MarkupResemblesLocatorWarning.FILENAME_MESSAGE % dict(what="filename"),
        MarkupResemblesLocatorWarning,
        stacklevel=3,
    )
    return True

657

def _feed(self) -> None:
    """Internal method that parses previously set markup, creating a large
    number of Tag and NavigableString objects.
    """
    # Start the builder from a clean state, then hand it the markup
    # (if there is any).
    self.builder.reset()
    if self.markup is not None:
        self.builder.feed(self.markup)

    # Close out any unfinished strings and close all the open tags,
    # stopping once the root object is the current tag again.
    self.endData()
    root_name = self.ROOT_TAG_NAME
    while self.currentTag is not None and self.currentTag.name != root_name:
        self.popTag()

673

def reset(self) -> None:
    """Reset this object to a state as though it had never parsed any
    markup.
    """
    # Re-initialize the Tag part of this object as the hidden root tag.
    Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
    self.hidden = True
    self.builder.reset()

    # Parsing bookkeeping all starts out empty.
    self._most_recent_element = None
    self.currentTag = None
    self.current_data = []
    self.tagStack = []
    self.preserve_whitespace_tag_stack = []
    self.string_container_stack = []
    self.open_tag_counter = Counter()

    # This object itself is the first (root) entry on the tag stack.
    self.pushTag(self)

689

def new_tag(
    self,
    name: str,
    namespace: Optional[str] = None,
    nsprefix: Optional[str] = None,
    attrs: Optional[_RawAttributeValues] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    string: Optional[str] = None,
    **kwattrs: _RawAttributeValue,
) -> Tag:
    """Create a new Tag associated with this BeautifulSoup object.

    :param name: The name of the new Tag.
    :param namespace: The URI of the new Tag's XML namespace, if any.
    :param nsprefix: The prefix for the new Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values; can
        be used instead of ``kwattrs`` for attributes like 'class'
        that are reserved words in Python. Values given here are
        applied after (and so override) those in ``kwattrs``.
    :param sourceline: The line number where this tag was
        (purportedly) found in its source document.
    :param sourcepos: The character position within ``sourceline`` where this
        tag was (purportedly) found.
    :param string: String content for the new Tag, if any.
    :param kwattrs: Keyword arguments for the new Tag's attribute values.
    :return: The newly created Tag.
    """
    # Start from the keyword attributes, then layer `attrs` on top.
    tag_attributes = self.builder.attribute_dict_class(**kwattrs)
    if attrs is not None:
        tag_attributes.update(attrs)

    # Assume that this is either Tag or a subclass of Tag. If not,
    # the user brought type-unsafety upon themselves.
    tag_class = cast(Type[Tag], self.element_classes.get(Tag, Tag))

    created = tag_class(
        None,
        self.builder,
        name,
        namespace,
        nsprefix,
        tag_attributes,
        sourceline=sourceline,
        sourcepos=sourcepos,
    )
    if string is not None:
        created.string = string
    return created

739

def string_container(
    self, base_class: Optional[Type[NavigableString]] = None
) -> Type[NavigableString]:
    """Find the class that should be instantiated to hold a given kind of
    string.

    This may be a built-in Beautiful Soup class or a custom class passed
    in to the BeautifulSoup constructor.

    :param base_class: The class that would normally be used;
     NavigableString when omitted.
    :return: The class to instantiate.
    """
    requested = base_class if base_class else NavigableString

    # The user may want us to use some other class (hopefully a
    # custom subclass) instead of the one we'd use normally.
    chosen = cast(
        Type[NavigableString], self.element_classes.get(requested, requested)
    )

    # On top of that, we may be inside a tag that needs a special
    # container class. This only applies when nothing more specific
    # than NavigableString has been chosen so far.
    if chosen is NavigableString and self.string_container_stack:
        innermost = self.string_container_stack[-1]
        chosen = self.builder.string_containers.get(innermost.name, chosen)
    return chosen

764

def new_string(
    self, s: str, subclass: Optional[Type[NavigableString]] = None
) -> NavigableString:
    """Create a new `NavigableString` associated with this `BeautifulSoup`
    object.

    :param s: The string content of the `NavigableString`
    :param subclass: The subclass of `NavigableString`, if any, to
     use. If a document is being processed, an appropriate
     subclass for the current location in the document will
     be determined automatically.
    :return: An instance of whatever class `string_container` selects.
    """
    string_class = self.string_container(subclass)
    return string_class(s)

779

def insert_before(self, *args: _InsertableElement) -> List[PageElement]:
    """Always raise NotImplementedError.

    This method is part of the PageElement API, but `BeautifulSoup`
    doesn't implement it because there is nothing before or after it
    in the parse tree.

    :raises NotImplementedError: Unconditionally.
    """
    message = "BeautifulSoup objects don't support insert_before()."
    raise NotImplementedError(message)

787

def insert_after(self, *args: _InsertableElement) -> List[PageElement]:
    """Always raise NotImplementedError.

    This method is part of the PageElement API, but `BeautifulSoup`
    doesn't implement it because there is nothing before or after it
    in the parse tree.

    :raises NotImplementedError: Unconditionally.
    """
    message = "BeautifulSoup objects don't support insert_after()."
    raise NotImplementedError(message)

793

def popTag(self) -> Optional[Tag]:
    """Internal method called by _popToTag when a tag is closed.

    Removes the top of the tag stack and updates the bookkeeping that
    tracks open tags.

    :return: The tag that is current after the pop, or None if the
     stack was already empty.

    :meta private:
    """
    stack = self.tagStack
    if not stack:
        # Nothing to pop. This shouldn't happen.
        return None

    closed = stack.pop()

    open_counts = self.open_tag_counter
    if closed.name in open_counts:
        open_counts[closed.name] -= 1

    # If the closed tag is on top of either special-purpose stack,
    # it leaves that stack as well.
    whitespace_stack = self.preserve_whitespace_tag_stack
    if whitespace_stack and closed == whitespace_stack[-1]:
        whitespace_stack.pop()

    container_stack = self.string_container_stack
    if container_stack and closed == container_stack[-1]:
        container_stack.pop()

    # The new top of the stack (if any) becomes the current tag.
    if stack:
        self.currentTag = stack[-1]
    return self.currentTag

816

def pushTag(self, tag: Tag) -> None:
    """Internal method called by handle_starttag when a tag is opened.

    Makes `tag` a child of the current tag (if there is one), puts it
    on top of the tag stack, and updates the open-tag bookkeeping.

    :param tag: The tag being opened.

    :meta private:
    """
    parent = self.currentTag
    if parent is not None:
        parent.contents.append(tag)

    # The pushed tag is now the top of the stack and the current tag.
    self.tagStack.append(tag)
    self.currentTag = tag

    tag_name = tag.name
    if tag_name != self.ROOT_TAG_NAME:
        self.open_tag_counter[tag_name] += 1

    builder = self.builder
    if tag_name in builder.preserve_whitespace_tags:
        self.preserve_whitespace_tag_stack.append(tag)
    if tag_name in builder.string_containers:
        self.string_container_stack.append(tag)

833

def endData(self, containerClass: Optional[Type[NavigableString]] = None) -> None:
    """Method called by the TreeBuilder when the end of a data segment
    occurs.

    Joins the collected data chunks into one string, wraps it in a
    string container and hands it to `object_was_parsed`. Does nothing
    if no data has been collected.

    :param containerClass: The class to use when incorporating the
     data segment into the parse tree.

    :meta private:
    """
    if not self.current_data:
        return

    text = "".join(self.current_data)

    # If whitespace is not preserved, and this string contains
    # nothing but ASCII spaces, replace it with a single space
    # or newline.
    if not self.preserve_whitespace_tag_stack:
        if all(char in self.ASCII_SPACES for char in text):
            text = "\n" if "\n" in text else " "

    # Reset the data collector.
    self.current_data = []

    # Should we add this string to the tree at all? A parse_only
    # filter may veto strings while at most one tag is on the stack.
    strainer = self.parse_only
    if (
        strainer
        and len(self.tagStack) <= 1
        and not strainer.allow_string_creation(text)
    ):
        return

    containerClass = self.string_container(containerClass)
    self.object_was_parsed(containerClass(text))

874

def object_was_parsed(
    self,
    o: PageElement,
    parent: Optional[Tag] = None,
    most_recent_element: Optional[PageElement] = None,
) -> None:
    """Method called by the TreeBuilder to integrate an object into the
    parse tree.

    :param o: The element to add.
    :param parent: The tag that receives `o` as its last child;
     defaults to the current tag.
    :param most_recent_element: The element to treat as the one that
     comes just before `o`; defaults to the most recently added
     element.

    :meta private:
    """
    if parent is None:
        parent = self.currentTag
    assert parent is not None

    previous_element: Optional[PageElement] = (
        most_recent_element
        if most_recent_element is not None
        else self._most_recent_element
    )

    if isinstance(o, Tag):
        # A Tag may already carry linkage of its own; keep it.
        next_element = o.next_element
        next_sibling = o.next_sibling
        previous_sibling = o.previous_sibling
        if previous_element is None:
            previous_element = o.previous_element
    else:
        next_element = previous_sibling = next_sibling = None

    # This has to be checked before `o` is attached to the parent.
    needs_relinking = parent.next_element is not None

    o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

    self._most_recent_element = o
    parent.contents.append(o)

    # Check if we are inserting into an already parsed node.
    if needs_relinking:
        self._linkage_fixer(parent)

913

def _linkage_fixer(self, el: Tag) -> None:
    """Make sure linkage of this fragment is sound.

    Repairs the next/previous element and sibling pointers around the
    last child of `el`.

    :param el: A tag whose ``contents`` list is not empty (its first
     and last entries are read unconditionally).
    """

    first = el.contents[0]
    child = el.contents[-1]
    descendant: PageElement = child

    # Only when the last child is also the first child (i.e. it is the
    # only child) and `el` itself has a parent.
    if child is first and el.parent is not None:
        # Parent should be linked to first child
        el.next_element = child
        # We are no longer linked to whatever this element is
        prev_el = child.previous_element
        if prev_el is not None and prev_el is not el:
            prev_el.next_element = None
        # First child should be linked to the parent, and no previous siblings.
        child.previous_element = el
        child.previous_sibling = None

    # We have no sibling as we've been appended as the last.
    child.next_sibling = None

    # This index is a tag, dig deeper for a "last descendant"
    if isinstance(child, Tag) and child.contents:
        # _last_descendant is typed as returning Optional[PageElement],
        # but the value can't be None here, because el is a Tag
        # which we know has contents.
        descendant = cast(PageElement, child._last_descendant(False))

    # As the final step, link last descendant. It should be linked
    # to the parent's next sibling (if found), else walk up the chain
    # and find a parent with a sibling. It should have no next sibling.
    descendant.next_element = None
    descendant.next_sibling = None

    # Walk from `el` up through its ancestors until one with a next
    # sibling is found, or the chain of parents runs out.
    target: Optional[Tag] = el
    while True:
        if target is None:
            break
        elif target.next_sibling is not None:
            descendant.next_element = target.next_sibling
            # NOTE(review): the back-pointer is set to `child`, not to
            # `descendant`, even though `descendant` is what gets
            # linked forward on the line above -- confirm intended.
            target.next_sibling.previous_element = child
            break
        target = target.parent

957

def _popToTag(
    self, name: str, nsprefix: Optional[str] = None, inclusivePop: bool = True
) -> Optional[Tag]:
    """Pop the tag stack back to the most recent open tag matching
    ``name`` and ``nsprefix``.

    Nothing is popped when ``name`` is the root tag name, or when
    `open_tag_counter` holds no (non-zero) entry for ``name``.

    :param name: Pop up to the most recent tag with this name.
    :param nsprefix: The namespace prefix that goes with ``name``.
    :param inclusivePop: If this is false, pop the tag stack up to
        but *not* including the most recent instance of the given
        tag.
    :return: The last tag that was popped, or None if nothing was
        popped.

    :meta private:
    """
    # The BeautifulSoup object itself can never be popped.
    if name == self.ROOT_TAG_NAME:
        return None

    last_popped = None

    # Walk from the top of the stack downwards, never reaching index 0.
    # Each iteration either stops or pops exactly one tag, so `depth`
    # always refers to the current top of the stack.
    for depth in reversed(range(1, len(self.tagStack))):
        if not self.open_tag_counter.get(name):
            # No tag with this name is open any more; stop looking.
            break
        candidate = self.tagStack[depth]
        is_match = name == candidate.name and nsprefix == candidate.prefix
        if is_match and not inclusivePop:
            # Leave the matching tag itself on the stack.
            break
        last_popped = self.popTag()
        if is_match:
            break

    return last_popped

994

def handle_starttag(
    self,
    name: str,
    namespace: Optional[str],
    nsprefix: Optional[str],
    attrs: _RawAttributeValues,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    namespaces: Optional[Dict[str, str]] = None,
) -> Optional[Tag]:
    """Called by the tree builder each time it encounters an opening tag.

    :param name: Name of the tag.
    :param namespace: Namespace of the tag; handed to the tag class
        constructor unchanged.
    :param nsprefix: Namespace prefix for the tag.
    :param attrs: A dictionary of attribute values. Note that
        attribute values are expected to be simple strings; processing
        of multi-valued attributes such as "class" comes later.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param namespaces: A dictionary of all namespace prefix mappings
        currently in scope in the document.

    :return: The newly created tag, which has also been pushed onto
        the tag stack. A return value of None means the tag was
        rejected by an active `ElementFilter`; the caller should then
        behave as though the tag never appeared in the document (for
        a self-closing tag, that means not calling handle_endtag).

    :meta private:
    """
    # endData() always runs first, before the new tag is considered.
    self.endData()

    # The parse_only filter is consulted only while the tag stack holds
    # at most one entry.
    element_filter = self.parse_only
    if element_filter and len(self.tagStack) <= 1:
        if not element_filter.allow_tag_creation(nsprefix, name, attrs):
            return None

    # Assume that the configured class is either Tag or a subclass of
    # Tag. If not, the user brought type-unsafety upon themselves.
    tag_class = cast(Type[Tag], self.element_classes.get(Tag, Tag))
    tag = tag_class(
        self,
        self.builder,
        name,
        namespace,
        nsprefix,
        attrs,
        self.currentTag,
        self._most_recent_element,
        sourceline=sourceline,
        sourcepos=sourcepos,
        namespaces=namespaces,
    )
    if tag is None:
        return tag

    # Chain the new tag after the most recent element, then make it
    # the most recent element itself.
    latest = self._most_recent_element
    if latest is not None:
        latest.next_element = tag
    self._most_recent_element = tag

    self.pushTag(tag)
    return tag

1060

def handle_endtag(self, name: str, nsprefix: Optional[str] = None) -> None:
    """Called by the tree builder each time it encounters a closing tag.

    Calls `endData` and then pops the tag stack back to the most
    recent matching tag via `_popToTag`.

    :param name: Name of the tag.
    :param nsprefix: Namespace prefix for the tag.

    :meta private:
    """
    # endData() must run before the stack is unwound.
    self.endData()
    # The popped tag (if any) is deliberately discarded.
    self._popToTag(name, nsprefix=nsprefix)

1072

def handle_data(self, data: str) -> None:
    """Called by the tree builder each time it encounters a chunk of
    textual data.

    The chunk is only appended to ``self.current_data``; nothing else
    is done with it in this method.

    :param data: The text chunk to append.

    :meta private:
    """
    pending_chunks = self.current_data
    pending_chunks.append(data)

1080

1081 def decode(

1082 self,

1083 indent_level: Optional[int] = None,

1084 eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,

1085 formatter: Union[Formatter, str] = "minimal",

1086 iterator: Optional[Iterator[PageElement]] = None,

1087 **kwargs: Any,

1088 ) -> str:

1089 """Returns a string representation of the parse tree

1090 as a full HTML or XML document.

1091

1092 :param indent_level: Each line of the rendering will be

1093 indented this many levels. (The ``formatter`` decides what a

1094 'level' means, in terms of spaces or other characters

1095 output.) This is used internally in recursive calls while

1096 pretty-printing.

1097 :param eventual_encoding: The encoding of the final document.

1098 If this is None, the document will be a Unicode string.

1099 :param formatter: Either a `Formatter` object, or a string naming one of

1100 the standard formatters.

1101 :param iterator: The iterator to use when navigating over the

1102 parse tree. This is only used by `Tag.decode_contents` and

1103 you probably won't need to use it.

1104 """

1105 if self.is_xml:

1106 # Print the XML declaration

1107 encoding_part = ""

1108 declared_encoding: Optional[str] = eventual_encoding

1109 if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:

1110 # This is a special Python encoding; it can't actually

1111 # go into an XML document because it means nothing

1112 # outside of Python.

1113 declared_encoding = None

1114 if declared_encoding is not None:

1115 encoding_part = ' encoding="%s"' % declared_encoding

1116 prefix = '<?xml version="1.0"%s?>\n' % encoding_part

1117 else:

1118 prefix = ""

1119

1120 # Prior to 4.13.0, the first argument to this method was a

1121 # bool called pretty_print, which gave the method a different

1122 # signature from its superclass implementation, Tag.decode.

1123 #

1124 # The signatures of the two methods now match, but just in

1125 # case someone is still passing a boolean in as the first

1126 # argument to this method (or a keyword argument with the old

1127 # name), we can handle it and put out a DeprecationWarning.

1128 warning: Optional[str] = None

1129 pretty_print: Optional[bool] = None

1130 if isinstance(indent_level, bool):

1131 if indent_level is True:

1132 indent_level = 0

1133 elif indent_level is False:

1134 indent_level = None

1135 warning = f"As of 4.13.0, the first argument to BeautifulSoup.decode has been changed from bool to int, to match Tag.decode. Pass in a value of {indent_level} instead."

1136 else:

1137 pretty_print = kwargs.pop("pretty_print", None)

1138 assert not kwargs

1139 if pretty_print is not None:

1140 if pretty_print is True:

1141 indent_level = 0

1142 elif pretty_print is False:

1143 indent_level = None

1144 warning = f"As of 4.13.0, the pretty_print argument to BeautifulSoup.decode has been removed, to match Tag.decode. Pass in a value of indent_level={indent_level} instead."

1145

1146 if warning:

1147 warnings.warn(warning, DeprecationWarning, stacklevel=2)

1148 elif indent_level is False or pretty_print is False:

1149 indent_level = None

1150 return prefix + super(BeautifulSoup, self).decode(