Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/__init__.py: 16%
354 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".

http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.

Beautiful Soup works with Python 3.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
# Package metadata.
__author__ = 'Leonard Richardson (leonardr@segfault.org)'
__version__ = '4.12.2'
__copyright__ = 'Copyright (c) 2004-2023 Leonard Richardson'
# Use of this source code is governed by the MIT license.
__license__ = 'MIT'

# Only the main class is exported by `from bs4 import *`.
__all__ = ["BeautifulSoup"]
25from collections import Counter
26import os
27import re
28import sys
29import traceback
30import warnings
# Before anything else, fail with an actionable message if this
# Python 3-only code is being imported under Python 2.
if sys.version_info[0] < 3:
    raise ImportError(
        'You are trying to use a Python 3-specific version of Beautiful Soup '
        'under Python 2. This will not work. The final version of Beautiful '
        'Soup to support Python 2 was 4.9.3.'
    )
37from .builder import (
38 builder_registry,
39 ParserRejectedMarkup,
40 XMLParsedAsHTMLWarning,
41 HTMLParserTreeBuilder
42)
43from .dammit import UnicodeDammit
44from .element import (
45 CData,
46 Comment,
47 CSS,
48 DEFAULT_OUTPUT_ENCODING,
49 Declaration,
50 Doctype,
51 NavigableString,
52 PageElement,
53 ProcessingInstruction,
54 PYTHON_SPECIFIC_ENCODINGS,
55 ResultSet,
56 Script,
57 Stylesheet,
58 SoupStrainer,
59 Tag,
60 TemplateString,
61 )
63# Define some custom warnings.
class GuessedAtParserWarning(UserWarning):
    """Warning category used when BeautifulSoup had to guess which parser
    to use -- most likely because the constructor was not given one.
    """
    pass
class MarkupResemblesLocatorWarning(UserWarning):
    """Warning category used when the 'markup' handed to BeautifulSoup
    looks like a resource locator (a URL, or a path to a file on disk)
    rather than an actual document.
    """
    pass
class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

    Most of the methods you'll call on a BeautifulSoup object are inherited from
    PageElement or Tag.

    Internally, this class defines the basic interface called by the
    tree builders when converting an HTML/XML document into a data
    structure. The interface abstracts away the differences between
    parsers. To write a new tree builder, you'll need to understand
    these methods as a whole.

    These methods will be called by the BeautifulSoup constructor:
      * reset()
      * feed(markup)

    The tree builder may call these methods from its feed() implementation:
      * handle_starttag(name, attrs) # See note about return value
      * handle_endtag(name)
      * handle_data(data) # Appends to the current data node
      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
    # a Tag with a .name. This name makes it clear the BeautifulSoup
    # object isn't a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # A string containing all ASCII whitespace characters, used in
    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

    # Beautiful Soup 3 constructor arguments that are accepted but
    # ignored, each paired with the explanation appended to the
    # warning issued when the argument is seen.
    _IGNORED_BS3_ARGUMENTS = (
        ('convertEntities',
         "Entities are always converted to Unicode characters."),
        ('markupMassage',
         "The tree builder is responsible for any necessary markup massage."),
        ('smartQuotesTo',
         "Smart quotes are always converted to Unicode characters."),
        ('selfClosingTags',
         "The tree builder is responsible for understanding self-closing tags."),
        ('isHTML',
         "Suggest you use features='lxml' for HTML and "
         "features='lxml-xml' for XML."),
    )

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
         markup to be parsed.

        :param features: Desirable features of the parser to be
         used. This may be the name of a specific parser ("lxml",
         "lxml-xml", "html.parser", or "html5lib") or it may be the
         type of markup to be used ("html", "html5", "xml"). It's
         recommended that you name a specific parser, so that
         Beautiful Soup gives you the same results across platforms
         and virtual environments.

        :param builder: A TreeBuilder subclass to instantiate (or
         instance to use) instead of looking one up based on
         `features`. You only need to use this if you've implemented a
         custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
         matching the SoupStrainer will be considered. This is useful
         when parsing part of a document that would otherwise be too
         large to fit into memory.

        :param from_encoding: A string indicating the encoding of the
         document to be parsed. Pass this in if Beautiful Soup is
         guessing wrongly about the document's encoding.

        :param exclude_encodings: A list of strings indicating
         encodings known to be wrong. Pass this in if you don't know
         the document's encoding but you know Beautiful Soup's guess is
         wrong.

        :param element_classes: A dictionary mapping BeautifulSoup
         classes like Tag and NavigableString, to other classes you'd
         like to be instantiated instead as the parse tree is
         built. This is useful for subclassing Tag or NavigableString
         to modify default behavior.

        :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
         Beautiful Soup 4; they will result in a warning and then be
         ignored.

         Apart from this, any keyword arguments passed into the
         BeautifulSoup constructor are propagated to the TreeBuilder
         constructor. This makes it possible to configure a
         TreeBuilder by passing in arguments, not just by saying which
         one to use.

        :raise FeatureNotFound: If no registered tree builder offers
         the requested features.
        :raise ParserRejectedMarkup: If the parser rejected every
         version of the markup offered by the tree builder.
        """
        # Discard the Beautiful Soup 3 arguments that no longer do
        # anything, warning about each one that was actually passed.
        for old_name, advice in self._IGNORED_BS3_ARGUMENTS:
            if old_name in kwargs:
                del kwargs[old_name]
                warnings.warn(
                    "BS4 does not respect the %s argument to the "
                    "BeautifulSoup constructor. %s" % (old_name, advice))

        def deprecated_argument(old_name, new_name):
            # stacklevel=3 attributes the warning to the code that
            # called the constructor: warn() -> this helper -> __init__
            # -> caller.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name),
                    DeprecationWarning, stacklevel=3
                )
                return kwargs.pop(old_name)
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, str):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        self.element_classes = element_classes or dict()

        # We need this information to track whether or not the builder
        # was specified well enough that we can omit the 'you need to
        # specify a parser' warning.
        original_builder = builder
        original_features = features

        if isinstance(builder, type):
            # A builder class was passed in; it needs to be instantiated.
            builder_class = builder
            builder = None
        elif builder is None:
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))

        # At this point either we have a TreeBuilder instance in
        # builder, or we have a builder_class that we can instantiate
        # with the remaining **kwargs.
        if builder is None:
            builder = builder_class(**kwargs)
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
            ) and markup:
                # The user did not tell us which TreeBuilder to use,
                # and we had to guess. Issue a warning.
                markup_type = "XML" if builder.is_xml else "HTML"

                # This code adapted from warnings.py so that we get the same line
                # of code as our warnings.warn() call gets, even if the answer is wrong
                # (as it may be in a multithreading situation).
                caller = None
                try:
                    caller = sys._getframe(1)
                except ValueError:
                    pass
                if caller:
                    caller_globals = caller.f_globals
                    line_number = caller.f_lineno
                else:
                    caller_globals = sys.__dict__
                    line_number = 1
                filename = caller_globals.get('__file__')
                if filename:
                    # Report the source file rather than its compiled
                    # counterpart.
                    if filename.lower().endswith((".pyc", ".pyo")):
                        filename = filename[:-1]
                if filename:
                    # If there is no filename at all, the user is most likely in a REPL,
                    # and the warning is not necessary.
                    values = dict(
                        filename=filename,
                        line_number=line_number,
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
                    warnings.warn(
                        self.NO_PARSER_SPECIFIED_WARNING % values,
                        GuessedAtParserWarning, stacklevel=2
                    )
        elif kwargs:
            warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self._namespaces = dict()
        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and b'<' not in markup)
                or (isinstance(markup, str) and '<' not in markup)
        ):
            # Issue warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # since that is sometimes the intended behavior.
            if not self._markup_is_url(markup):
                self._markup_resembles_filename(markup)

        # Try each version of the markup the builder offers until the
        # parser accepts one; remember every rejection for the error
        # message in case none is accepted.
        rejections = []
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            self.builder.initialize_soup(self)
            try:
                self._feed()
                break
            except ParserRejectedMarkup as e:
                rejections.append(e)
        else:
            # The loop finished without a successful parse.
            other_exceptions = [str(e) for e in rejections]
            raise ParserRejectedMarkup(
                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
            )

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _clone(self):
        """Create a new BeautifulSoup object with the same TreeBuilder,
        but not associated with any markup.

        This is the first step of the deepcopy process.
        """
        clone = type(self)("", None, self.builder)

        # Keep track of the encoding of the original document,
        # since we won't be parsing it again.
        clone.original_encoding = self.original_encoding
        return clone

    def __getstate__(self):
        """Return the state to pickle: a copy of the instance dictionary
        in which the parse tree is replaced by its serialized markup.
        """
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if d.get('builder') is not None and not self.builder.picklable:
            d['builder'] = type(self.builder)
        # Store the contents as a Unicode string.
        d['contents'] = []
        d['markup'] = self.decode()

        # If _most_recent_element is present, it's a Tag object left
        # over from initial parse. It might not be picklable and we
        # don't need it.
        d.pop('_most_recent_element', None)
        return d

    def __setstate__(self, state):
        """Restore pickled state and rebuild the parse tree by feeding
        the stored markup through a tree builder again.
        """
        # If necessary, restore the TreeBuilder by looking it up.
        self.__dict__ = state
        if isinstance(self.builder, type):
            self.builder = self.builder()
        elif not self.builder:
            # We don't know which builder was used to build this
            # parse tree, so use a default we know is always available.
            self.builder = HTMLParserTreeBuilder()
        self.builder.soup = self
        self.reset()
        self._feed()
        return state

    @classmethod
    def _decode_markup(cls, markup):
        """Ensure `markup` is a Unicode string so it's safe to send into
        warnings.warn.

        Bytes are decoded as UTF-8, with undecodable sequences
        replaced; any other value is returned unchanged.

        TODO: warnings.warn had this problem back in 2010 but it might not
        anymore.
        """
        if isinstance(markup, bytes):
            return markup.decode('utf-8', 'replace')
        return markup

    @classmethod
    def _markup_is_url(cls, markup):
        """Error-handling method to raise a warning if incoming markup looks
        like a URL.

        :param markup: A bytestring or string. Any other type is
            never considered a URL.
        :return: Whether or not the markup resembles a URL
            closely enough to justify a warning.
        """
        if isinstance(markup, bytes):
            space = b' '
            cant_start_with = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            cant_start_with = ("http:", "https:")
        else:
            return False

        # A URL starts with a scheme and contains no spaces.
        if markup.startswith(cant_start_with) and space not in markup:
            warnings.warn(
                'The input looks more like a URL than markup. You may want to use'
                ' an HTTP client like requests to get the document behind'
                ' the URL, and feed that document to Beautiful Soup.',
                MarkupResemblesLocatorWarning,
                stacklevel=3
            )
            return True
        return False

    @classmethod
    def _markup_resembles_filename(cls, markup):
        """Error-handling method to raise a warning if incoming markup
        resembles a filename.

        :param markup: A bytestring or string.
        :return: Whether or not the markup resembles a filename
            closely enough to justify a warning.
        """
        path_characters = '/\\'
        extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
        if isinstance(markup, bytes):
            path_characters = path_characters.encode("utf8")
            extensions = [x.encode('utf8') for x in extensions]

        # Either a path separator anywhere, or a familiar document
        # extension at the end, makes the input look like a filename.
        filelike = (
            any(x in markup for x in path_characters)
            or markup.lower().endswith(tuple(extensions))
        )
        if filelike:
            warnings.warn(
                'The input looks more like a filename than markup. You may'
                ' want to open this file and pass the filehandle into'
                ' Beautiful Soup.',
                MarkupResemblesLocatorWarning, stacklevel=3
            )
            return True
        return False

    def _feed(self):
        """Internal method that parses previously set markup, creating a large
        number of Tag and NavigableString objects.
        """
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reset this object to a state as though it had never parsed any
        markup.
        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.open_tag_counter = Counter()
        self.preserve_whitespace_tag_stack = []
        self.string_container_stack = []
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, attrs=None,
                sourceline=None, sourcepos=None, **kwattrs):
        """Create a new Tag associated with this BeautifulSoup object.

        :param name: The name of the new Tag.
        :param namespace: The URI of the new Tag's XML namespace, if any.
        :param nsprefix: The prefix for the new Tag's XML namespace, if any.
        :param attrs: A dictionary of this Tag's attribute values; can
            be used instead of `kwattrs` for attributes like 'class'
            that are reserved words in Python. Values given here take
            precedence over values given in `kwattrs`.
        :param sourceline: The line number where this tag was
            (purportedly) found in its source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was (purportedly) found.
        :param kwattrs: Keyword arguments for the new Tag's attribute values.
        :return: A new instance of the class registered for Tag in
            `element_classes`, or of Tag itself if none is registered.
        """
        # `attrs` defaults to None rather than a shared mutable dict.
        if attrs:
            kwattrs.update(attrs)
        return self.element_classes.get(Tag, Tag)(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )

    def string_container(self, base_class=None):
        """Choose the class used to hold a string in the parse tree.

        :param base_class: The class the caller would like to use;
            NavigableString if not given.
        :return: The class to instantiate, after applying any override
            from `element_classes` and any builder-specific container
            for the tag currently on top of the string container stack.
        """
        container = base_class or NavigableString

        # There may be a general override of NavigableString.
        container = self.element_classes.get(
            container, container
        )

        # On top of that, we may be inside a tag that needs a special
        # container class.
        if self.string_container_stack and container is NavigableString:
            container = self.builder.string_containers.get(
                self.string_container_stack[-1].name, container
            )
        return container

    def new_string(self, s, subclass=None):
        """Create a new NavigableString associated with this BeautifulSoup
        object.

        :param s: The text of the new string.
        :param subclass: The preferred container class, resolved
            through string_container().
        """
        container = self.string_container(subclass)
        return container(s)

    def insert_before(self, *args):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.

        :raise NotImplementedError: Always.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, *args):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.

        :raise NotImplementedError: Always.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Internal method called by _popToTag when a tag is closed.

        :return: The tag that is current after the pop.
        """
        tag = self.tagStack.pop()
        if tag.name in self.open_tag_counter:
            self.open_tag_counter[tag.name] -= 1
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.string_container_stack and tag == self.string_container_stack[-1]:
            self.string_container_stack.pop()
        #print("Pop", tag.name)
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Internal method called by handle_starttag when a tag is opened."""
        #print("Push", tag.name)
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        # The root pseudo-tag is never counted as an open tag.
        if tag.name != self.ROOT_TAG_NAME:
            self.open_tag_counter[tag.name] += 1
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)
        if tag.name in self.builder.string_containers:
            self.string_container_stack.append(tag)

    def endData(self, containerClass=None):
        """Method called by the TreeBuilder when the end of a data segment
        occurs.

        :param containerClass: The preferred class for the resulting
            string object, resolved through string_container().
        """
        if not self.current_data:
            return

        current_data = ''.join(self.current_data)
        # If whitespace is not preserved, and this string contains
        # nothing but ASCII spaces, replace it with a single space
        # or newline.
        if not self.preserve_whitespace_tag_stack and all(
                char in self.ASCII_SPACES for char in current_data):
            current_data = '\n' if '\n' in current_data else ' '

        # Reset the data collector.
        self.current_data = []

        # Should we add this string to the tree at all?
        if self.parse_only and len(self.tagStack) <= 1 and (
                not self.parse_only.text
                or not self.parse_only.search(current_data)):
            return

        containerClass = self.string_container(containerClass)
        o = containerClass(current_data)
        self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Method called by the TreeBuilder to integrate an object into the parse tree.

        :param o: The object to add.
        :param parent: The element to append `o` to; the current tag
            if not given.
        :param most_recent_element: The element to treat as preceding
            `o`; the most recently parsed element if not given.
        """
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
            previous_element = most_recent_element
        else:
            previous_element = self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A Tag may already carry linkage; keep what it has.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if previous_element is None:
                previous_element = o.previous_element

        # Must be computed before `o` is attached to `parent`.
        fix = parent.next_element is not None

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        # Check if we are inserting into an already parsed node.
        if fix:
            self._linkage_fixer(parent)

    def _linkage_fixer(self, el):
        """Make sure linkage of this fragment is sound.

        :param el: The element whose last child was just appended.
        """
        first = el.contents[0]
        child = el.contents[-1]
        descendant = child

        if child is first and el.parent is not None:
            # Parent should be linked to first child
            el.next_element = child
            # We are no longer linked to whatever this element is
            prev_el = child.previous_element
            if prev_el is not None and prev_el is not el:
                prev_el.next_element = None
            # First child should be linked to the parent, and no previous siblings.
            child.previous_element = el
            child.previous_sibling = None

        # We have no sibling as we've been appended as the last.
        child.next_sibling = None

        # This index is a tag, dig deeper for a "last descendant"
        if isinstance(child, Tag) and child.contents:
            descendant = child._last_descendant(False)

        # As the final step, link last descendant. It should be linked
        # to the parent's next sibling (if found), else walk up the chain
        # and find a parent with a sibling. It should have no next sibling.
        descendant.next_element = None
        descendant.next_sibling = None
        target = el
        while target is not None:
            if target.next_sibling is not None:
                descendant.next_element = target.next_sibling
                # NOTE(review): this stores `child`, not `descendant`;
                # when `child` has descendants the two differ, so
                # previous_element would not mirror next_element --
                # confirm this is intended.
                target.next_sibling.previous_element = child
                break
            target = target.parent

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag.

        If there are no open tags with the given name, nothing will be
        popped.

        :param name: Pop up to the most recent tag with this name.
        :param nsprefix: The namespace prefix that goes with `name`.
        :param inclusivePop: If this is false, pops the tag stack up
          to but *not* including the most recent instance of the
          given tag.
        :return: The value returned by the last popTag() call made,
          or None if nothing was popped.
        """
        #print("Popping to %s" % name)
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        stack_size = len(self.tagStack)
        # Walk from the top of the stack down, never reaching index 0
        # (the BeautifulSoup object itself).
        for i in range(stack_size - 1, 0, -1):
            if not self.open_tag_counter.get(name):
                break
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                        sourcepos=None, namespaces=None):
        """Called by the tree builder when a new tag is encountered.

        :param name: Name of the tag.
        :param namespace: Namespace of the tag; passed through to the
            Tag constructor.
        :param nsprefix: Namespace prefix for the tag.
        :param attrs: A dictionary of attribute values.
        :param sourceline: The line number where this tag was found in its
            source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was found.
        :param namespaces: A dictionary of all namespace prefix mappings
            currently in scope in the document.

        If this method returns None, the tag was rejected by an active
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        # print("Start tag %s: %s" % (name, attrs))
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = self.element_classes.get(Tag, Tag)(
            self, self.builder, name, namespace, nsprefix, attrs,
            self.currentTag, self._most_recent_element,
            sourceline=sourceline, sourcepos=sourcepos,
            namespaces=namespaces
        )
        if tag is None:
            return tag
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Called by the tree builder when an ending tag is encountered.

        :param name: Name of the tag.
        :param nsprefix: Namespace prefix for the tag.
        """
        #print("End tag: " + name)
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Called by the tree builder when a chunk of textual data is encountered."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal", iterator=None):
        """Returns a string or Unicode representation of the parse tree
            as an HTML or XML document.

        :param pretty_print: If this is True, indentation will be used to
            make the document more readable.
        :param eventual_encoding: The encoding of the final document.
            If this is None, the document will be a Unicode string.
        :param formatter: Passed through unchanged to Tag.decode().
        :param iterator: Passed through unchanged to Tag.decode().
        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
                # This is a special Python encoding; it can't actually
                # go into an XML document because it means nothing
                # outside of Python.
                eventual_encoding = None
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        indent_level = 0 if pretty_print else None
        return prefix + super().decode(
            indent_level, eventual_encoding, formatter, iterator)
# Short aliases for quick interactive use, e.g. 'from bs4 import _soup'.
_s = _soup = BeautifulSoup
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Issue a DeprecationWarning, then build a BeautifulSoup object
        with `features` forced to 'xml'.
        """
        # stacklevel=2 points the warning at the code that instantiated
        # this class.
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.',
            DeprecationWarning, stacklevel=2
        )
        # Any caller-supplied `features` value is overridden.
        kwargs['features'] = 'xml'
        super().__init__(*args, **kwargs)
class StopParsing(Exception):
    """Raised by a TreeBuilder that is unable to continue parsing."""
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no parser offering the
    requested features can be found.
    """
# When executed as a script, read a document from standard input and
# pretty-print it to standard output.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())