Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/__init__.py: 73%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
3http://www.crummy.com/software/BeautifulSoup/
5Beautiful Soup uses a pluggable XML or HTML parser to parse a
6(possibly invalid) document into a tree representation. Beautiful Soup
7provides methods and Pythonic idioms that make it easy to navigate,
8search, and modify the parse tree.
10Beautiful Soup works with Python 3.5 and up. It works better if lxml
11and/or html5lib is installed.
13For more than you ever wanted to know about Beautiful Soup, see the
14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
15"""
17__author__ = "Leonard Richardson (leonardr@segfault.org)"
18__version__ = "4.11.1"
19__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
20# Use of this source code is governed by the MIT license.
21__license__ = "MIT"
23__all__ = ['BeautifulSoup']
25from collections import Counter
26import os
27import re
28import sys
29import traceback
30import warnings
# Fail fast with an actionable message when imported by a Python 2
# interpreter, before anything Python 3-specific is touched.
if sys.version_info[0] < 3:
    raise ImportError(
        'You are trying to use a Python 3-specific version of Beautiful Soup '
        'under Python 2. This will not work. The final version of Beautiful '
        'Soup to support Python 2 was 4.9.3.'
    )
37from .builder import (
38 builder_registry,
39 ParserRejectedMarkup,
40 XMLParsedAsHTMLWarning,
41)
42from .dammit import UnicodeDammit
43from .element import (
44 CData,
45 Comment,
46 DEFAULT_OUTPUT_ENCODING,
47 Declaration,
48 Doctype,
49 NavigableString,
50 PageElement,
51 ProcessingInstruction,
52 PYTHON_SPECIFIC_ENCODINGS,
53 ResultSet,
54 Script,
55 Stylesheet,
56 SoupStrainer,
57 Tag,
58 TemplateString,
59 )
# Custom warning categories, so callers can filter them individually.
class GuessedAtParserWarning(UserWarning):
    """Warning category used when the BeautifulSoup constructor had to
    pick a parser on its own, typically because the caller did not name
    one explicitly.
    """
class MarkupResemblesLocatorWarning(UserWarning):
    """Warning category used when the 'markup' handed to BeautifulSoup
    looks like a resource locator (a URL, or a path to a file on disk)
    rather than an actual document.
    """
class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

    Most of the methods you'll call on a BeautifulSoup object are inherited from
    PageElement or Tag.

    Internally, this class defines the basic interface called by the
    tree builders when converting an HTML/XML document into a data
    structure. The interface abstracts away the differences between
    parsers. To write a new tree builder, you'll need to understand
    these methods as a whole.

    These methods will be called by the BeautifulSoup constructor:
      * reset()
      * feed(markup)

    The tree builder may call these methods from its feed() implementation:
      * handle_starttag(name, attrs) # See note about return value
      * handle_endtag(name)
      * handle_data(data) # Appends to the current data node
      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
    # a Tag with a .name. This name makes it clear the BeautifulSoup
    # object isn't a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # A string containing all ASCII whitespace characters, used in
    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
         markup to be parsed.

        :param features: Desirable features of the parser to be
         used. This may be the name of a specific parser ("lxml",
         "lxml-xml", "html.parser", or "html5lib") or it may be the
         type of markup to be used ("html", "html5", "xml"). It's
         recommended that you name a specific parser, so that
         Beautiful Soup gives you the same results across platforms
         and virtual environments.

        :param builder: A TreeBuilder subclass to instantiate (or
         instance to use) instead of looking one up based on
         `features`. You only need to use this if you've implemented a
         custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
         matching the SoupStrainer will be considered. This is useful
         when parsing part of a document that would otherwise be too
         large to fit into memory.

        :param from_encoding: A string indicating the encoding of the
         document to be parsed. Pass this in if Beautiful Soup is
         guessing wrongly about the document's encoding.

        :param exclude_encodings: A list of strings indicating
         encodings known to be wrong. Pass this in if you don't know
         the document's encoding but you know Beautiful Soup's guess is
         wrong.

        :param element_classes: A dictionary mapping BeautifulSoup
         classes like Tag and NavigableString, to other classes you'd
         like to be instantiated instead as the parse tree is
         built. This is useful for subclassing Tag or NavigableString
         to modify default behavior.

        :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
         Beautiful Soup 4; they will result in a warning and then be
         ignored.

         Apart from this, any keyword arguments passed into the
         BeautifulSoup constructor are propagated to the TreeBuilder
         constructor. This makes it possible to configure a
         TreeBuilder by passing in arguments, not just by saying which
         one to use.

        :raises FeatureNotFound: If no registered tree builder offers
         the requested features.
        :raises ParserRejectedMarkup: If the builder rejected every
         candidate version of the markup it was offered.
        """
        # Beautiful Soup 3 arguments that are accepted but ignored. Each
        # one is removed from kwargs (so it never reaches the tree
        # builder) and reported with its own warning, in this order.
        ignored_bs3_arguments = (
            ('convertEntities',
             "BS4 does not respect the convertEntities argument to the "
             "BeautifulSoup constructor. Entities are always converted "
             "to Unicode characters."),
            ('markupMassage',
             "BS4 does not respect the markupMassage argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for any necessary markup massage."),
            ('smartQuotesTo',
             "BS4 does not respect the smartQuotesTo argument to the "
             "BeautifulSoup constructor. Smart quotes are always converted "
             "to Unicode characters."),
            ('selfClosingTags',
             "BS4 does not respect the selfClosingTags argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for understanding self-closing tags."),
            ('isHTML',
             "BS4 does not respect the isHTML argument to the "
             "BeautifulSoup constructor. Suggest you use "
             "features='lxml' for HTML and features='lxml-xml' for "
             "XML."),
        )
        for argument_name, message in ignored_bs3_arguments:
            if argument_name in kwargs:
                del kwargs[argument_name]
                warnings.warn(message)

        def deprecated_argument(old_name, new_name):
            """Pop a renamed keyword argument from kwargs, warning about
            the rename; return None when the old name was not used.
            """
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name),
                    DeprecationWarning
                )
                return kwargs.pop(old_name)
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, str):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        self.element_classes = element_classes or dict()

        # We need this information to track whether or not the builder
        # was specified well enough that we can omit the 'you need to
        # specify a parser' warning.
        original_builder = builder
        original_features = features

        if isinstance(builder, type):
            # A builder class was passed in; it needs to be instantiated.
            builder_class = builder
            builder = None
        elif builder is None:
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))

        # At this point either we have a TreeBuilder instance in
        # builder, or we have a builder_class that we can instantiate
        # with the remaining **kwargs.
        if builder is None:
            builder = builder_class(**kwargs)
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
            ) and markup:
                # The user did not tell us which TreeBuilder to use,
                # and we had to guess. Issue a warning.
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"

                # This code adapted from warnings.py so that we get the same line
                # of code as our warnings.warn() call gets, even if the answer is wrong
                # (as it may be in a multithreading situation).
                caller = None
                try:
                    caller = sys._getframe(1)
                except ValueError:
                    pass
                if caller:
                    caller_globals = caller.f_globals
                    line_number = caller.f_lineno
                else:
                    caller_globals = sys.__dict__
                    line_number = 1
                filename = caller_globals.get('__file__')
                if filename:
                    fnl = filename.lower()
                    if fnl.endswith((".pyc", ".pyo")):
                        # Report the .py source file, not the compiled file.
                        filename = filename[:-1]
                if filename:
                    # If there is no filename at all, the user is most likely in a REPL,
                    # and the warning is not necessary.
                    values = dict(
                        filename=filename,
                        line_number=line_number,
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
                    warnings.warn(
                        self.NO_PARSER_SPECIFIED_WARNING % values,
                        GuessedAtParserWarning, stacklevel=2
                    )
        else:
            if kwargs:
                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self._namespaces = dict()
        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and b'<' not in markup)
                or (isinstance(markup, str) and '<' not in markup)
        ):
            # Issue warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # since that is sometimes the intended behavior.
            if not self._markup_is_url(markup):
                self._markup_resembles_filename(markup)

        # Try each candidate the builder offers until one of them parses.
        rejections = []
        success = False
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            self.builder.initialize_soup(self)
            try:
                self._feed()
                success = True
                break
            except ParserRejectedMarkup as e:
                rejections.append(e)

        if not success:
            other_exceptions = [str(e) for e in rejections]
            raise ParserRejectedMarkup(
                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
            )

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def __copy__(self):
        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
        copy = type(self)(
            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
        )

        # Although we encoded the tree to UTF-8, that may not have
        # been the encoding of the original markup. Set the copy's
        # .original_encoding to reflect the original object's
        # .original_encoding.
        copy.original_encoding = self.original_encoding
        return copy

    def __getstate__(self):
        """Return a copy of the instance dictionary for pickling, with
        the builder replaced by None when it is not picklable.
        """
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
            d['builder'] = None
        return d

    @classmethod
    def _decode_markup(cls, markup):
        """Ensure `markup` is a Unicode string so it's safe to send into
        warnings.warn.

        Bytes are decoded as UTF-8, with undecodable sequences replaced;
        anything else is returned unchanged.

        TODO: warnings.warn had this problem back in 2010 but it might not
        anymore.
        """
        if isinstance(markup, bytes):
            decoded = markup.decode('utf-8', 'replace')
        else:
            decoded = markup
        return decoded

    @classmethod
    def _markup_is_url(cls, markup):
        """Error-handling method to raise a warning if incoming markup looks
        like a URL.

        :param markup: A string or bytestring.
        :return: Whether or not the markup resembles a URL
            closely enough to justify a warning.
        """
        if isinstance(markup, bytes):
            space = b' '
            cant_start_with = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            cant_start_with = ("http:", "https:")
        else:
            return False

        if any(markup.startswith(prefix) for prefix in cant_start_with):
            # Markup containing a space is not treated as a bare URL.
            if space not in markup:
                warnings.warn(
                    'The input looks more like a URL than markup. You may want to use'
                    ' an HTTP client like requests to get the document behind'
                    ' the URL, and feed that document to Beautiful Soup.',
                    MarkupResemblesLocatorWarning
                )
                return True
        return False

    @classmethod
    def _markup_resembles_filename(cls, markup):
        """Error-handling method to raise a warning if incoming markup
        resembles a filename.

        :param markup: A bytestring or string.
        :return: Whether or not the markup resembles a filename
            closely enough to justify a warning.
        """
        path_characters = '/\\'
        extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
        if isinstance(markup, bytes):
            path_characters = path_characters.encode("utf8")
            extensions = [x.encode('utf8') for x in extensions]
        filelike = False
        if any(x in markup for x in path_characters):
            filelike = True
        else:
            lower = markup.lower()
            if any(lower.endswith(ext) for ext in extensions):
                filelike = True
        if filelike:
            warnings.warn(
                'The input looks more like a filename than markup. You may'
                ' want to open this file and pass the filehandle into'
                ' Beautiful Soup.',
                MarkupResemblesLocatorWarning
            )
            return True
        return False

    def _feed(self):
        """Internal method that parses previously set markup, creating a large
        number of Tag and NavigableString objects.
        """
        # Start the builder from a clean state, then hand it the markup.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reset this object to a state as though it had never parsed any
        markup.
        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.open_tag_counter = Counter()
        self.preserve_whitespace_tag_stack = []
        self.string_container_stack = []
        # The BeautifulSoup object itself is the bottom of the tag stack.
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
                sourceline=None, sourcepos=None, **kwattrs):
        """Create a new Tag associated with this BeautifulSoup object.

        :param name: The name of the new Tag.
        :param namespace: The URI of the new Tag's XML namespace, if any.
        :param nsprefix: The prefix for the new Tag's XML namespace, if any.
        :param attrs: A dictionary of this Tag's attribute values; can
            be used instead of `kwattrs` for attributes like 'class'
            that are reserved words in Python.
        :param sourceline: The line number where this tag was
            (purportedly) found in its source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was (purportedly) found.
        :param kwattrs: Keyword arguments for the new Tag's attribute values.

        """
        # `attrs` is only read here, never mutated, so the shared default
        # dictionary is safe. Entries in `attrs` override `kwattrs`.
        kwattrs.update(attrs)
        return self.element_classes.get(Tag, Tag)(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )

    def string_container(self, base_class=None):
        """Choose the class used to hold a string at the current position.

        :param base_class: The class the caller would like to use;
            NavigableString when omitted.
        :return: The class to instantiate, after applying any
            `element_classes` override and any builder-defined container
            for the innermost open tag tracked in string_container_stack.
        """
        container = base_class or NavigableString

        # There may be a general override of NavigableString.
        container = self.element_classes.get(
            container, container
        )

        # On top of that, we may be inside a tag that needs a special
        # container class.
        if self.string_container_stack and container is NavigableString:
            container = self.builder.string_containers.get(
                self.string_container_stack[-1].name, container
            )
        return container

    def new_string(self, s, subclass=None):
        """Create a new NavigableString associated with this BeautifulSoup
        object.

        :param s: The content of the new string.
        :param subclass: Preferred string class; resolved through
            string_container().
        """
        container = self.string_container(subclass)
        return container(s)

    def insert_before(self, *args):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, *args):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Internal method called by _popToTag when a tag is closed.

        :return: The tag that is current after the pop.
        """
        tag = self.tagStack.pop()
        if tag.name in self.open_tag_counter:
            self.open_tag_counter[tag.name] -= 1
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.string_container_stack and tag == self.string_container_stack[-1]:
            self.string_container_stack.pop()
        #print("Pop", tag.name)
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Internal method called by handle_starttag when a tag is opened."""
        #print("Push", tag.name)
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name != self.ROOT_TAG_NAME:
            self.open_tag_counter[tag.name] += 1
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)
        if tag.name in self.builder.string_containers:
            self.string_container_stack.append(tag)

    def endData(self, containerClass=None):
        """Method called by the TreeBuilder when the end of a data segment
        occurs.

        :param containerClass: Preferred class for the resulting string
            object; resolved through string_container().
        """
        if not self.current_data:
            return

        current_data = ''.join(self.current_data)
        # If whitespace is not preserved, and this string contains
        # nothing but ASCII spaces, replace it with a single space
        # or newline.
        if not self.preserve_whitespace_tag_stack:
            if all(ch in self.ASCII_SPACES for ch in current_data):
                if '\n' in current_data:
                    current_data = '\n'
                else:
                    current_data = ' '

        # Reset the data collector.
        self.current_data = []

        # Should we add this string to the tree at all?
        if self.parse_only and len(self.tagStack) <= 1 and \
               (not self.parse_only.text or \
                not self.parse_only.search(current_data)):
            return

        containerClass = self.string_container(containerClass)
        o = containerClass(current_data)
        self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Method called by the TreeBuilder to integrate an object into the parse tree.

        :param o: The object to add.
        :param parent: The element to append `o` to; the current tag
            when omitted.
        :param most_recent_element: The element to treat as preceding
            `o`; the most recently parsed element when omitted.
        """
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
            previous_element = most_recent_element
        else:
            previous_element = self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A Tag may already carry linkage; keep it.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if previous_element is None:
                previous_element = o.previous_element

        # Must be computed before `o` is linked in.
        fix = parent.next_element is not None

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        # Check if we are inserting into an already parsed node.
        if fix:
            self._linkage_fixer(parent)

    def _linkage_fixer(self, el):
        """Make sure linkage of this fragment is sound."""

        first = el.contents[0]
        child = el.contents[-1]
        descendant = child

        if child is first and el.parent is not None:
            # Parent should be linked to first child
            el.next_element = child
            # We are no longer linked to whatever this element is
            prev_el = child.previous_element
            if prev_el is not None and prev_el is not el:
                prev_el.next_element = None
            # First child should be linked to the parent, and no previous siblings.
            child.previous_element = el
            child.previous_sibling = None

        # We have no sibling as we've been appended as the last.
        child.next_sibling = None

        # This index is a tag, dig deeper for a "last descendant"
        if isinstance(child, Tag) and child.contents:
            descendant = child._last_descendant(False)

        # As the final step, link last descendant. It should be linked
        # to the parent's next sibling (if found), else walk up the chain
        # and find a parent with a sibling. It should have no next sibling.
        descendant.next_element = None
        descendant.next_sibling = None
        target = el
        while True:
            if target is None:
                break
            elif target.next_sibling is not None:
                descendant.next_element = target.next_sibling
                # NOTE(review): the back-link points at `child`, not at
                # `descendant`; confirm that is intended when `child`
                # has descendants of its own.
                target.next_sibling.previous_element = child
                break
            target = target.parent

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag.

        If there are no open tags with the given name, nothing will be
        popped.

        :param name: Pop up to the most recent tag with this name.
        :param nsprefix: The namespace prefix that goes with `name`.
        :param inclusivePop: If this is false, pops the tag stack up
          to but *not* including the most recent instance of the
          given tag.
        :return: The value returned by the last popTag() call made
          here, or None when nothing was popped.
        """
        #print("Popping to %s" % name)
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        stack_size = len(self.tagStack)
        # Index 0 is the BeautifulSoup object itself; never pop it.
        for i in range(stack_size - 1, 0, -1):
            if not self.open_tag_counter.get(name):
                break
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                        sourcepos=None, namespaces=None):
        """Called by the tree builder when a new tag is encountered.

        :param name: Name of the tag.
        :param namespace: Namespace of the tag; passed through to the
            Tag constructor.
        :param nsprefix: Namespace prefix for the tag.
        :param attrs: A dictionary of attribute values.
        :param sourceline: The line number where this tag was found in its
            source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was found.
        :param namespaces: A dictionary of all namespace prefix mappings
            currently in scope in the document.

        If this method returns None, the tag was rejected by an active
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        # print("Start tag %s: %s" % (name, attrs))
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = self.element_classes.get(Tag, Tag)(
            self, self.builder, name, namespace, nsprefix, attrs,
            self.currentTag, self._most_recent_element,
            sourceline=sourceline, sourcepos=sourcepos,
            namespaces=namespaces
        )
        if tag is None:
            return tag
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Called by the tree builder when an ending tag is encountered.

        :param name: Name of the tag.
        :param nsprefix: Namespace prefix for the tag.
        """
        #print("End tag: " + name)
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Called by the tree builder when a chunk of textual data is encountered."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of the parse tree
            as an HTML or XML document.

        :param pretty_print: If this is True, indentation will be used to
            make the document more readable.
        :param eventual_encoding: The encoding of the final document.
            If this is None, the document will be a Unicode string.
        :param formatter: Passed through unchanged to Tag.decode().
        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
                # This is a special Python encoding; it can't actually
                # go into an XML document because it means nothing
                # outside of Python.
                eventual_encoding = None
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
# Short aliases for quick interactive use, e.g. 'from bs4 import _soup'.
_s = _soup = BeautifulSoup
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser.

    Behaves like BeautifulSoup with features forced to 'xml', and issues
    a DeprecationWarning every time it is instantiated.
    """

    def __init__(self, *args, **kwargs):
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.',
            DeprecationWarning
        )
        # Any caller-supplied 'features' value is overridden.
        kwargs['features'] = 'xml'
        super().__init__(*args, **kwargs)
class StopParsing(Exception):
    """Raised by a TreeBuilder when it cannot go on parsing."""
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no parser offering
    the requested features can be found.
    """
# When executed as a script, pretty-print the HTML read from standard input.
if __name__ == '__main__':
    import sys
    parsed = BeautifulSoup(sys.stdin)
    print(parsed.prettify())