Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/element.py: 56%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
4try:
5 from collections.abc import Callable # Python 3.6
6except ImportError as e:
7 from collections import Callable
8import re
9import sys
10import warnings
11try:
12 import soupsieve
13except ImportError as e:
14 soupsieve = None
15 warnings.warn(
16 'The soupsieve package is not installed. CSS selectors cannot be used.'
17 )
19from bs4.formatter import (
20 Formatter,
21 HTMLFormatter,
22 XMLFormatter,
23)
# Encoding used when a byte string has to be decoded or a tree has to be
# encoded and the caller did not ask for anything specific.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# Matches one maximal run of non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: This isn't used as of 4.7.0. It is kept for a little while on
# the off chance someone imported it for their own use.
whitespace_re = re.compile(r"\s+")
33def _alias(attr):
34 """Alias one attribute name to another for backward compatibility"""
35 @property
36 def alias(self):
37 return getattr(self, attr)
39 @alias.setter
40 def alias(self):
41 return setattr(self, attr)
42 return alias
# Python knows how to handle these encodings (so PageElement.encode
# could theoretically support them), but neither XML nor HTML
# recognizes them, so they should never be advertised inside a
# document as that document's encoding.
#
# When an XML document is encoded in one of these encodings, the XML
# declaration mentions no encoding at all. When an HTML document is
# encoded in one of them and has a <meta> tag that mentions an
# encoding, the encoding is given as the empty string.
#
# Source:
# https://docs.python.org/3/library/codecs.html#python-specific-encodings
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the pieces it was
    built from: the prefix ('xml'), the local name ('lang') and,
    optionally, the namespace.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the string value and attach the namespace details.

        :param prefix: The namespace prefix. Used on its own as the
            string value when no name is given.
        :param name: The local name. Any false value is stored as None.
        :param namespace: The namespace, stored as given.
        """
        # A missing name means this is the default namespace. Its name
        # "has no value" per https://www.w3.org/TR/xml-names/#defaulting
        local_name = name or None

        if local_name is None:
            text = prefix
        elif prefix:
            text = prefix + ":" + local_name
        else:
            # Not really namespaced.
            text = local_name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = local_name
        instance.namespace = namespace
        return instance
class AttributeValueWithCharsetSubstitution(str):
    """Placeholder string for a character encoding that is named inside
    an HTML document.
    """
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Placeholder for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' with Beautiful Soup
    produces one of these objects as the value of 'charset'.
    """

    def __new__(cls, original_value):
        """Create the string and remember the value it was built from."""
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the charset to advertise for a document that is being
        encoded as `encoding`.

        :param encoding: The name of the target encoding.
        :return: `encoding` itself, or the empty string when `encoding`
            is one of PYTHON_SPECIFIC_ENCODINGS.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Placeholder for the value of a meta tag's 'content' attribute.

    Parsing this markup with Beautiful Soup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    produces one of these objects as the value of 'content'.
    """

    # Group 1 is everything up to and including 'charset='; group 3 is
    # the charset name itself.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Create the value, falling back to a plain str when the text
        names no charset.
        """
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset replaced.

        :param encoding: The name of the target encoding.
        :return: The original value with `encoding` substituted for the
            charset name, or the empty string when `encoding` is one of
            PYTHON_SPECIFIC_ENCODINGS.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)
class PageElement(object):
    """Contains the navigational information for some part of the page:
    that is, its current location in the parse tree.

    NavigableString, Tag, etc. are all subclasses of PageElement.
    """

    def setup(self, parent=None, previous_element=None, next_element=None,
              previous_sibling=None, next_sibling=None):
        """Sets up the initial relations between this element and
        other elements.

        Every non-None neighbor that is passed in is also updated so
        that it points back at this element.

        :param parent: The parent of this element.

        :param previous_element: The element parsed immediately before
            this one.

        :param next_element: The element parsed immediately after
            this one.

        :param previous_sibling: The most recently encountered element
            on the same level of the parse tree as this one. If None
            and the parent already has contents, the parent's last
            child is used.

        :param next_sibling: The next element to be encountered
            on the same level of the parse tree as this one.
        """
        self.parent = parent

        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self

        self.next_element = next_element
        if self.next_element is not None:
            self.next_element.previous_element = self

        self.next_sibling = next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self

        if (previous_sibling is None
                and self.parent is not None and self.parent.contents):
            previous_sibling = self.parent.contents[-1]

        self.previous_sibling = previous_sibling
        if previous_sibling is not None:
            self.previous_sibling.next_sibling = self

    def format_string(self, s, formatter):
        """Format the given string using the given formatter.

        :param s: A string.
        :param formatter: A Formatter object, or a string naming one of
            the standard formatters. If None, `s` is returned unchanged.
        :return: The formatted string.
        """
        if formatter is None:
            return s
        if not isinstance(formatter, Formatter):
            formatter = self.formatter_for_name(formatter)
        return formatter.substitute(s)

    def formatter_for_name(self, formatter):
        """Look up or create a Formatter for the given identifier,
        if necessary.

        :param formatter: Can be a Formatter object (used as-is), a
            function (used as the entity substitution hook for an
            XMLFormatter or HTMLFormatter), or a string (used to look
            up an XMLFormatter or HTMLFormatter in the appropriate
            registry).
        :return: A Formatter object.
        """
        if isinstance(formatter, Formatter):
            return formatter
        if self._is_xml:
            c = XMLFormatter
        else:
            c = HTMLFormatter
        if isinstance(formatter, Callable):
            return c(entity_substitution=formatter)
        return c.REGISTRY[formatter]

    @property
    def _is_xml(self):
        """Is this element part of an XML tree or an HTML tree?

        This is used in formatter_for_name, when deciding whether an
        XMLFormatter or HTMLFormatter is more appropriate. It can be
        inefficient, but it should be called very rarely.
        """
        if self.known_xml is not None:
            # Most of the time we will have determined this when the
            # document is parsed.
            return self.known_xml

        # Otherwise, it's likely that this element was created by
        # direct invocation of the constructor from within the user's
        # Python code.
        if self.parent is None:
            # This is the top-level object. It should have .known_xml set
            # from tree creation. If not, take a guess--BS is usually
            # used on HTML markup.
            return getattr(self, 'is_xml', False)
        return self.parent._is_xml

    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    # Sentinel meaning "the caller did not pass a `types` argument".
    default = object()

    def _all_strings(self, strip=False, types=default):
        """Yield all strings of certain classes, possibly stripping them.

        This is implemented differently in Tag and NavigableString.
        """
        raise NotImplementedError()

    @property
    def stripped_strings(self):
        """Yield all strings in this PageElement, stripping them first.

        :yield: A sequence of stripped strings.
        """
        yield from self._all_strings(True)

    def get_text(self, separator="", strip=False,
                 types=default):
        """Get all child strings of this PageElement, concatenated using the
        given separator.

        :param separator: Strings will be concatenated using this separator.

        :param strip: If True, strings will be stripped before being
            concatenated.

        :param types: A tuple of NavigableString subclasses. Any
            strings of a subclass not found in this list will be
            ignored. Although there are exceptions, the default
            behavior in most cases is to consider only NavigableString
            and CData objects. That means no comments, processing
            instructions, etc.

        :return: A string.
        """
        return separator.join(self._all_strings(strip, types=types))
    getText = get_text
    text = property(get_text)

    def replace_with(self, *args):
        """Replace this PageElement with one or more PageElements, keeping the
        rest of the tree the same.

        :param args: One or more PageElements.
        :return: `self`, no longer part of the tree. When asked to
            replace an element with itself, nothing is changed and
            None is returned.
        :raise ValueError: If this element is not part of a tree, or
            if one of the replacements is this element's parent.
        """
        if self.parent is None:
            raise ValueError(
                "Cannot replace one element with another when the "
                "element to be replaced is not part of a tree.")
        if len(args) == 1 and args[0] is self:
            return
        if any(x is self.parent for x in args):
            raise ValueError("Cannot replace a Tag with its parent.")
        old_parent = self.parent
        my_index = self.parent.index(self)
        self.extract(_self_index=my_index)
        # Insert the replacements, in order, where this element used to be.
        for idx, replacement in enumerate(args, start=my_index):
            old_parent.insert(idx, replacement)
        return self
    replaceWith = replace_with  # BS3

    def unwrap(self):
        """Replace this PageElement with its contents.

        :return: `self`, no longer part of the tree.
        :raise ValueError: If this element is not part of a tree.
        """
        my_parent = self.parent
        if self.parent is None:
            raise ValueError(
                "Cannot replace an element with its contents when that "
                "element is not part of a tree.")
        my_index = self.parent.index(self)
        self.extract(_self_index=my_index)
        # Iterate over a reversed copy: each insert() moves the child
        # out of self.contents, and inserting in reverse at a fixed
        # index preserves the original order.
        for child in reversed(self.contents[:]):
            my_parent.insert(my_index, child)
        return self
    replace_with_children = unwrap
    replaceWithChildren = unwrap  # BS3

    def wrap(self, wrap_inside):
        """Wrap this PageElement inside another one.

        :param wrap_inside: A PageElement.
        :return: `wrap_inside`, occupying the position in the tree that used
            to be occupied by `self`, and with `self` inside it.
        """
        me = self.replace_with(wrap_inside)
        wrap_inside.append(me)
        return wrap_inside

    def extract(self, _self_index=None):
        """Destructively rips this element out of the tree.

        :param _self_index: The location of this element in its parent's
           .contents, if known. Passing this in allows for a performance
           optimization.

        :return: `self`, no longer part of the tree.
        """
        if self.parent is not None:
            if _self_index is None:
                _self_index = self.parent.index(self)
            del self.parent.contents[_self_index]

        # Find the two elements that would be next to each other if
        # this element (and any children) hadn't been parsed. Connect
        # the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if (self.previous_element is not None and
                self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if (next_element is not None
                and next_element is not self.previous_element):
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        if (self.previous_sibling is not None
                and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
                and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self

    def _last_descendant(self, is_initialized=True, accept_self=True):
        """Finds the last element beneath this object to be parsed.

        :param is_initialized: Has `setup` been called on this PageElement
            yet?
        :param accept_self: Is `self` an acceptable answer to the question?
        :return: The last descendant, or None when that would be `self`
            and `accept_self` is False.
        """
        if is_initialized and self.next_sibling is not None:
            last_child = self.next_sibling.previous_element
        else:
            last_child = self
            while isinstance(last_child, Tag) and last_child.contents:
                last_child = last_child.contents[-1]
        if not accept_self and last_child is self:
            last_child = None
        return last_child
    # BS3: Not part of the API!
    _lastRecursiveChild = _last_descendant

    def insert(self, position, new_child):
        """Insert a new PageElement in the list of this PageElement's children.

        This works the same way as `list.insert`.

        :param position: The numeric position that should be occupied
           in `self.children` by the new PageElement.
        :param new_child: A PageElement. A plain string is converted
           to a NavigableString first.
        :raise ValueError: If `new_child` is None or is this element.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
                and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        from bs4 import BeautifulSoup
        if isinstance(new_child, BeautifulSoup):
            # We don't want to end up with a situation where one BeautifulSoup
            # object contains another. Insert the children one at a time.
            for subchild in list(new_child.contents):
                self.insert(position, subchild)
                position += 1
            return
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            new_child.next_sibling = None

            # Walk up the tree until some ancestor (starting with this
            # element) has a next sibling: that sibling is the element
            # that comes next in the document. If no ancestor has one,
            # the new child's last element is the last element in the
            # document and its next_element is None.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
            new_childs_last_element.next_element = parents_next_sibling
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = (
                new_childs_last_element)
        self.contents.insert(position, new_child)

    def append(self, tag):
        """Appends the given PageElement to the contents of this one.

        :param tag: A PageElement.
        """
        self.insert(len(self.contents), tag)

    def extend(self, tags):
        """Appends the given PageElements to this one's contents.

        :param tags: A list of PageElements. If a Tag is passed, its
            contents are appended.
        """
        if isinstance(tags, Tag):
            # Calling self.append() on another tag's contents will change
            # the list we're iterating over. Make a list that won't
            # change.
            tags = list(tags.contents)
        for tag in tags:
            self.append(tag)

    def insert_before(self, *args):
        """Makes the given element(s) the immediate predecessor of this one.

        All the elements will have the same parent, and the given elements
        will be immediately before this one.

        :param args: One or more PageElements.
        :raise ValueError: If this element has no parent, or if it is
            itself one of the given elements.
        """
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'before' has no meaning.")
        if any(x is self for x in args):
            raise ValueError("Can't insert an element before itself.")
        for predecessor in args:
            # Extract first so that the index won't be screwed up if they
            # are siblings.
            if isinstance(predecessor, PageElement):
                predecessor.extract()
            index = parent.index(self)
            parent.insert(index, predecessor)

    def insert_after(self, *args):
        """Makes the given element(s) the immediate successor of this one.

        The elements will have the same parent, and the given elements
        will be immediately after this one.

        :param args: One or more PageElements.
        :raise ValueError: If this element has no parent, or if it is
            itself one of the given elements.
        """
        # Do all error checking before modifying the tree.
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'after' has no meaning.")
        if any(x is self for x in args):
            raise ValueError("Can't insert an element after itself.")

        # `offset` keeps the successors in the order they were given.
        for offset, successor in enumerate(args):
            # Extract first so that the index won't be screwed up if they
            # are siblings.
            if isinstance(successor, PageElement):
                successor.extract()
            index = parent.index(self)
            parent.insert(index + 1 + offset, successor)

    def find_next(self, name=None, attrs={}, string=None, **kwargs):
        """Find the first PageElement that matches the given criteria and
        appears later in the document than this PageElement.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None if nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
    findNext = find_next  # BS3

    def find_all_next(self, name=None, attrs={}, string=None, limit=None,
                      **kwargs):
        """Find all PageElements that match the given criteria and appear
        later in the document than this PageElement.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet containing PageElements.
        """
        return self._find_all(name, attrs, string, limit, self.next_elements,
                              **kwargs)
    findAllNext = find_all_next  # BS3

    def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
        """Find the closest sibling to this PageElement that matches the
        given criteria and appears later in the document.

        All find_* methods take a common set of arguments. See the
        online documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None if nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(self.find_next_siblings, name, attrs, string,
                              **kwargs)
    findNextSibling = find_next_sibling  # BS3

    def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
                           **kwargs):
        """Find all siblings of this PageElement that match the given criteria
        and appear later in the document.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, string, limit,
                              self.next_siblings, **kwargs)
    findNextSiblings = find_next_siblings  # BS3
    fetchNextSiblings = find_next_siblings  # BS2

    def find_previous(self, name=None, attrs={}, string=None, **kwargs):
        """Look backwards in the document from this PageElement and find the
        first PageElement that matches the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None if nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(
            self.find_all_previous, name, attrs, string, **kwargs)
    findPrevious = find_previous  # BS3

    def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
                          **kwargs):
        """Look backwards in the document from this PageElement and find all
        PageElements that match the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, string, limit,
                              self.previous_elements, **kwargs)
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous  # BS2

    def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
        """Returns the closest sibling to this PageElement that matches the
        given criteria and appears earlier in the document.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None if nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self._find_one(self.find_previous_siblings, name, attrs, string,
                              **kwargs)
    findPreviousSibling = find_previous_sibling  # BS3

    def find_previous_siblings(self, name=None, attrs={}, string=None,
                               limit=None, **kwargs):
        """Returns all siblings to this PageElement that match the
        given criteria and appear earlier in the document.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, string, limit,
                              self.previous_siblings, **kwargs)
    findPreviousSiblings = find_previous_siblings  # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2

    def find_parent(self, name=None, attrs={}, **kwargs):
        """Find the closest parent of this PageElement that matches the given
        criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :kwargs: A dictionary of filters on attribute values.

        :return: A PageElement, or None if nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        # NOTE: We can't use _find_one because findParents takes a different
        # set of arguments.
        r = None
        l = self.find_parents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findParent = find_parent  # BS3

    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Find all parents of this PageElement that match the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.

        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        return self._find_all(name, attrs, None, limit, self.parents,
                              **kwargs)
    findParents = find_parents  # BS3
    fetchParents = find_parents  # BS2

    @property
    def next(self):
        """The PageElement, if any, that was parsed just after this one.

        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self.next_element

    @property
    def previous(self):
        """The PageElement, if any, that was parsed just before this one.

        :return: A PageElement.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        return self.previous_element

    # These methods do the real heavy lifting.

    def _find_one(self, method, name, attrs, string, **kwargs):
        """Call a find-all style `method` with a limit of 1.

        :param method: A callable taking (name, attrs, string, limit,
            **kwargs) and returning a sequence of results.
        :return: The first result, or None if there are no results.
        """
        r = None
        l = method(name, attrs, string, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _find_all(self, name, attrs, string, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        :param name: A filter on tag name, or a SoupStrainer to use
            as-is.
        :param attrs: A dictionary of filters on attribute values.
        :param string: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :param generator: An iterable of the PageElements to consider.
        :kwargs: A dictionary of filters on attribute values. The
            deprecated 'text' key is treated as `string`.
        :return: A ResultSet of matches.
        """
        if string is None and 'text' in kwargs:
            string = kwargs.pop('text')
            warnings.warn(
                "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
                DeprecationWarning
            )

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            strainer = SoupStrainer(name, attrs, string, **kwargs)

        if string is None and not limit and not attrs and not kwargs:
            if name is True or name is None:
                # Optimization to find all tags.
                result = (element for element in generator
                          if isinstance(element, Tag))
                return ResultSet(strainer, result)
            elif isinstance(name, str):
                # Optimization to find all tags with a given name.
                if name.count(':') == 1:
                    # This is a name with a prefix. If this is a
                    # namespace-aware document, we need to match the
                    # local name against tag.name. If not, we need to
                    # match the fully-qualified name against tag.name.
                    prefix, local_name = name.split(':', 1)
                else:
                    prefix = None
                    local_name = name
                # The isinstance() check must guard both name tests, so
                # the alternatives are grouped explicitly: `and` binds
                # more tightly than `or`.
                result = (
                    element for element in generator
                    if isinstance(element, Tag)
                    and (
                        element.name == name
                        or (
                            element.name == local_name
                            and (prefix is None or element.prefix == prefix)
                        )
                    )
                )
                return ResultSet(strainer, result)
        results = ResultSet(strainer)
        for i in generator:
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    # These generators can be used to navigate starting from both
    # NavigableStrings and Tags.
    @property
    def next_elements(self):
        """All PageElements that were parsed after this one.

        :yield: A sequence of PageElements.
        """
        i = self.next_element
        while i is not None:
            yield i
            i = i.next_element

    @property
    def next_siblings(self):
        """All PageElements that are siblings of this one but were parsed
        later.

        :yield: A sequence of PageElements.
        """
        i = self.next_sibling
        while i is not None:
            yield i
            i = i.next_sibling

    @property
    def previous_elements(self):
        """All PageElements that were parsed before this one.

        :yield: A sequence of PageElements.
        """
        i = self.previous_element
        while i is not None:
            yield i
            i = i.previous_element

    @property
    def previous_siblings(self):
        """All PageElements that are siblings of this one but were parsed
        earlier.

        :yield: A sequence of PageElements.
        """
        i = self.previous_sibling
        while i is not None:
            yield i
            i = i.previous_sibling

    @property
    def parents(self):
        """All PageElements that are parents of this PageElement.

        :yield: A sequence of PageElements.
        """
        i = self.parent
        while i is not None:
            yield i
            i = i.parent

    @property
    def decomposed(self):
        """Check whether a PageElement has been decomposed.

        :rtype: bool
        """
        return getattr(self, '_decomposed', False) or False

    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
    def nextGenerator(self):
        return self.next_elements

    def nextSiblingGenerator(self):
        return self.next_siblings

    def previousGenerator(self):
        return self.previous_elements

    def previousSiblingGenerator(self):
        return self.previous_siblings

    def parentGenerator(self):
        return self.parents
class NavigableString(str, PageElement):
    """A Python Unicode string that is also a node of a parse tree.

    Parsing the markup <b>penguin</b> produces a NavigableString for
    the text "penguin".
    """

    # Wrapped around the string's text by output_ready(). Subclasses
    # override these to add markup such as comment delimiters.
    PREFIX = ''
    SUFFIX = ''

    # A bare string carries no hint as to whether it came from an XML
    # document or an HTML document, so this is left undetermined.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding has to be
        handed to the superclass's __new__, or the superclass won't
        know how to handle non-ASCII characters.

        :param value: A str, or an encoded value to be decoded using
            DEFAULT_OUTPUT_ENCODING.
        """
        # Only a non-str value needs the encoding argument.
        extra = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *extra)
        instance.setup()
        return instance

    def __copy__(self):
        """Build a copy with the same text and the same class.

        The copy is created through the class constructor, so it is not
        connected to the parse tree of the original.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self):
        """Supply the constructor argument used when unpickling."""
        plain = str(self)
        return (plain,)

    def __getattr__(self, attr):
        """Resolve `.string` to the string itself.

        text.string gives you text. This exists for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        :raise AttributeError: For any attribute other than 'string'.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Format the string and wrap it in PREFIX and SUFFIX.

        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        :return: The formatted text with this class's PREFIX and SUFFIX
            added on.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None: a NavigableString is not a Tag and has no name.

        Having the property means code like this doesn't crash when run
        on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Refuse every attempt to give a NavigableString a name."""
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(self, strip=False, types=PageElement.default):
        """Yield this string if it is of an acceptable type.

        This gives NavigableString the same text-extraction interface
        that Tag has, so methods like get_text() can treat every
        PageElement alike.

        :param strip: If True, the string is stripped before being
            yielded, and a string that strips down to nothing is not
            yielded at all.

        :param types: A NavigableString subclass or a tuple of them.
            If this string's exact class isn't among them, nothing is
            yielded. None accepts any class. The default is
            Tag.DEFAULT_INTERESTING_STRING_TYPES.

        :yield: Either this one string, or nothing.
        """
        # The default lives on Tag because it names subclasses of this
        # class, which aren't defined until later in the file.
        wanted = (
            Tag.DEFAULT_INTERESTING_STRING_TYPES
            if types is self.default else types
        )

        # Exact classes are compared instead of using isinstance(),
        # because every string class inherits from NavigableString and
        # a caller asking for plain NavigableStrings doesn't want, say,
        # comments.
        own_class = type(self)
        if wanted is not None:
            single = isinstance(wanted, type)
            if single and own_class is not wanted:
                return
            if not single and own_class not in wanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text
    strings = property(_all_strings)
class PreformattedString(NavigableString):
    """A NavigableString that bypasses the normal formatting rules.

    This is an abstract base for special kinds of strings, such as
    comments (the Comment class) and CDATA blocks (the CData class).
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Return the raw string wrapped in this class's PREFIX and SUFFIX.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. If given, the string is passed
            through it, but only to trigger any side effects: the
            formatted result is thrown away.

        :return: The unformatted string, with the subclass-specific
            prefix and suffix added on.
        """
        if formatter is not None:
            # Called for side effects only; the result is not used.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
    """A CDATA section, output between '<![CDATA[' and ']]>'."""
    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction, output between '<?' and '>'."""

    PREFIX = '<?'
    SUFFIX = '>'
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction, output between '<?' and '?>'."""
    PREFIX = '<?'
    SUFFIX = '?>'
class Comment(PreformattedString):
    """An HTML or XML comment, output between '<!--' and '-->'."""
    PREFIX = '<!--'
    SUFFIX = '-->'
class Declaration(PreformattedString):
    """An XML declaration, output between '<?' and '?>'."""
    PREFIX = '<?'
    SUFFIX = '?>'
class Doctype(PreformattedString):
    """A document type declaration."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
            A falsy name is treated as the empty string.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: An instance of the class this method was called on
            (Doctype, or a subclass of it).
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                # After a public ID, the system ID is a bare quoted
                # string with no SYSTEM keyword.
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        # Build through `cls` rather than naming Doctype, so that a
        # subclass calling this constructor gets an instance of itself.
        return cls(value)

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'
class Stylesheet(NavigableString):
    """A NavigableString holding a stylesheet (probably CSS).

    The separate class lets embedded stylesheets be told apart from
    textual content.
    """
    pass
class Script(NavigableString):
    """A NavigableString holding an executable script (probably
    Javascript).

    The separate class lets executable code be told apart from textual
    content.
    """
    pass
class TemplateString(NavigableString):
    """A NavigableString found inside an HTML template that is embedded
    in a larger document.

    The separate class lets such strings be told apart from the main
    body of the document.
    """
    pass
class RubyTextString(NavigableString):
    """A NavigableString holding the contents of an <rt> HTML element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element

    The separate class lets these strings be told apart from the
    strings they annotate.
    """
    pass
class RubyParenthesisString(NavigableString):
    """A NavigableString holding the contents of an <rp> HTML element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
    """
    pass
1172class Tag(PageElement):
1173 """Represents an HTML or XML tag that is part of a parse tree, along
1174 with its attributes and contents.
1176 When Beautiful Soup parses the markup <b>penguin</b>, it will
1177 create a Tag object representing the <b> tag.
1178 """
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None,
             is_xml=None, sourceline=None, sourcepos=None,
             can_be_empty_element=None, cdata_list_attributes=None,
             preserve_whitespace_tags=None,
             interesting_string_types=None,
             namespaces=None
             ):
    """Basic constructor.

    :param parser: A BeautifulSoup object. Only its class is kept.
    :param builder: A TreeBuilder. When one is given, it decides
        `can_be_empty_element`, `cdata_list_attributes`,
        `preserve_whitespace_tags` and `interesting_string_types`,
        and the corresponding arguments are not used.
    :param name: The name of the tag. Required.
    :param namespace: The URI of this Tag's XML namespace, if any.
    :param prefix: The prefix for this Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values.
    :param parent: The PageElement to use as this Tag's parent.
    :param previous: The PageElement that was parsed immediately before
        this tag.
    :param is_xml: If True, this is an XML tag. Otherwise, this is an
        HTML tag. Used only when no builder is given.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param can_be_empty_element: If True, this tag should be
        represented as <tag/>. If False, this tag should be represented
        as <tag></tag>.
    :param cdata_list_attributes: A list of attributes whose values should
        be treated as CDATA if they ever show up on this tag.
    :param preserve_whitespace_tags: A list of tag names whose contents
        should have their whitespace preserved.
    :param interesting_string_types: This is a NavigableString
        subclass or a tuple of them. When iterating over this
        Tag's strings in methods like Tag.strings or Tag.get_text,
        these are the types of strings that are interesting enough
        to be considered. The default is to consider
        NavigableString and CData the only interesting string
        subtypes.
    :param namespaces: A dictionary mapping currently active
        namespace prefixes to URIs. This can be used later to
        construct CSS selectors.
    :raise ValueError: If no name is provided.
    """
    # Only the parser's class is remembered, never the parser object
    # itself: that lets extracted chunks be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__

    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # Position attributes are created only when at least one value
    # was supplied and the builder (if any) keeps line numbers.
    if ((not builder or builder.store_line_numbers)
            and (sourceline is not None or sourcepos is not None)):
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    if attrs is None:
        attrs = {}
    elif (attrs and builder is not None
            and builder.cdata_list_attributes):
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        # Either way, the caller's own mapping is never stored.
        attrs = dict(attrs)

    # If possible, work out ahead of time whether this is an XML tag.
    self.known_xml = builder.is_xml if builder else is_xml
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is not None:
        # Set up any substitutions for this tag, such as the charset
        # in a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an
        # empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep the whole set of possibly list-valued attributes on
        # the tag. For performance reasons the data structure is
        # stored rather than asking the builder about every tag:
        # asking would mean building a new structure each time, and
        # (unlike can_be_empty_element) this is almost never checked.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # The names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # The builder registers a special string container for
            # tags with this name; that container becomes the
            # interesting string type for this tag.
            self.interesting_string_types = builder.string_containers[self.name]
        else:
            self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
    else:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in. They're probably None, unless this is a copy of
        # some other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
# BS3 spelling of `parser_class`.
parserClass = _alias("parser_class")
def __copy__(self):
    """A copy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.

    :return: A new object of the same class as this one, with copies
        of this Tag's children appended to it.
    """
    # A Tag does not keep a reference to its builder, so looking up
    # `self.builder` would fall through to __getattr__ and turn into
    # a search for a child tag named "builder". The clone is therefore
    # created with no builder, and every builder-derived setting is
    # passed along explicitly instead.
    #
    # sourceline and sourcepos are only set on some tags, so they are
    # read from the instance dictionary for the same reason.
    own = vars(self)
    clone = type(self)(
        None, None, self.name, self.namespace,
        self.prefix, self.attrs, is_xml=self._is_xml,
        sourceline=own.get('sourceline'), sourcepos=own.get('sourcepos'),
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        # Without this, a builder-less clone would end up with None
        # here and so treat every kind of string as interesting.
        interesting_string_types=self.interesting_string_types,
    )
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    for child in self.contents:
        clone.append(child.__copy__())
    return clone
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    For a tag with no contents, the answer is whatever was recorded
    in `can_be_empty_element` when the tag was created. It depends on
    the builder used to create the tag. If the builder has a
    designated list of empty-element tags, then only a tag whose name
    shows up in that list is considered an empty-element tag.

    If the builder has no designated list of empty-element tags,
    then any tag with no contents is an empty-element tag.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    TODO It might make sense to have NavigableString.string return
    itself.

    :return: If this element has exactly one child and that child is
        a string, the return value is that string. If the single
        child is a tag, the return value is the child's own 'string',
        recursively. If this element has no children, or has more
        than one child, the return value is None.
    """
    children = self.contents
    if len(children) == 1:
        only_child = children[0]
        if isinstance(only_child, NavigableString):
            return only_child
        return only_child.string
    return None

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    The existing children are cleared out, then a new object of the
    same class as `string`, built from `string`, is appended.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
# The string classes considered interesting when nothing else has
# been specified.
DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)

def _all_strings(self, strip=False, types=PageElement.default):
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, all strings will be stripped before being
        yielded, and strings that strip down to nothing are skipped.

    :param types: A NavigableString subclass or a tuple of them. A
        string whose exact class is not among them is ignored. None
        accepts every NavigableString. By default, the classes
        considered are the ones in self.interesting_string_types.

    :yield: A sequence of strings.
    """
    if types is self.default:
        types = self.interesting_string_types

    # Whether `types` is one class or a collection doesn't change
    # from one descendant to the next.
    single_type = isinstance(types, type)

    for node in self.descendants:
        if types is None:
            # No filter: anything that is a string qualifies.
            if not isinstance(node, NavigableString):
                continue
        elif single_type:
            if type(node) is not types:
                # We're not interested in strings of this type.
                continue
        elif type(node) not in types:
            # We're not interested in strings of this type.
            continue

        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node
strings = property(_all_strings)
def decompose(self):
    """Recursively destroys this PageElement and its children.

    This element is first extracted from the tree. Then it, and
    every element reachable from it by following `next_element`, has
    its instance dictionary wiped and replaced by an empty
    `contents` list and a `_decomposed` flag set to True.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    node = self
    while node is not None:
        # Grab the successor first: wiping the dictionary destroys
        # the link to it.
        following = node.next_element
        vars(node).clear()
        node.contents = []
        node._decomposed = True
        node = following
def clear(self, decompose=False):
    """Wipe out all children of this PageElement by calling extract()
    on them.

    :param decompose: If this is True, decompose() (a more
        destructive method) will be called instead of extract() on
        every child that is a Tag. Other children are still
        extracted.
    """
    # Work from a copy of the list, since removing a child changes
    # self.contents.
    for element in self.contents[:]:
        if decompose and isinstance(element, Tag):
            element.decompose()
        else:
            element.extract()
def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    Child tags are smoothed recursively. Strings that are
    PreformattedString objects are never merged.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree.
    """
    def can_merge(node):
        # Only ordinary strings take part in consolidation.
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Record the first position of every adjacent pair that needs
    # consolidating. This is done rather than copying self.contents,
    # since in most cases very few strings will be affected.
    pending = []
    for position, node in enumerate(self.contents):
        if isinstance(node, Tag):
            # Recursively smooth children.
            node.smooth()
        # The last item in .contents has nothing after it to merge
        # with.
        if (position < len(self.contents) - 1
                and can_merge(node)
                and can_merge(self.contents[position + 1])):
            pending.append(position)

    # Handle the recorded positions in reverse order, so that
    # removing items from .contents doesn't shift the positions
    # still to be processed.
    for position in reversed(pending):
        first = self.contents[position]
        second = self.contents[position + 1]
        second.extract()
        first.replace_with(NavigableString(first + second))
def index(self, element):
    """Find the index of a child by identity, not value.

    Avoids issues with tag.contents.index(element) getting the
    index of equal elements.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of `element` within `self.contents`.
    :raise ValueError: If `element` is not one of the children.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None,
    )
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position
def get(self, key, default=None):
    """Look up the value of the 'key' attribute on this tag.

    :param key: The attribute to look for.
    :param default: Returned when the tag has no such attribute.
    """
    attributes = self.attrs
    return attributes.get(key, default)
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: The value itself if it is already a list; otherwise a
        one-item list holding the value.
    """
    found = self.get(key, default)
    return found if isinstance(found, list) else [found]
def has_attr(self, key):
    """Report whether this PageElement has an attribute named `key`."""
    attributes = self.attrs
    return key in attributes
def __hash__(self):
    """Hash a Tag by the hash of its string rendering."""
    rendered = str(self)
    return hash(rendered)
def __getitem__(self, key):
    """tag[key] looks up the 'key' attribute of the Tag.

    :raise KeyError: If the Tag has no such attribute.
    """
    attributes = self.attrs
    return attributes[key]
def __iter__(self):
    """Iterating over a Tag walks through its list of contents."""
    children = self.contents
    return iter(children)
def __len__(self):
    """A Tag is as long as its list of contents."""
    children = self.contents
    return len(children)
def __contains__(self, x):
    """`x in tag` tests membership in the Tag's list of contents."""
    children = self.contents
    return x in children
def __bool__(self):
    """A Tag is always truthy, even when it has no contents."""
    return True
def __setitem__(self, key, value):
    """tag[key] = value stores `value` as the Tag's 'key' attribute."""
    attributes = self.attrs
    attributes[key] = value
def __delitem__(self, key):
    """del tag[key] removes the 'key' attribute from the Tag.

    Deleting an attribute the Tag doesn't have is not an error.
    """
    attributes = self.attrs
    attributes.pop(key, None)
def __call__(self, *args, **kwargs):
    """Calling a Tag like a function hands every argument to its
    find_all() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    results = self.find_all(*args, **kwargs)
    return results
def __getattr__(self, tag):
    """Calling tag.subtag is the same as calling tag.find(name="subtag")

    :param tag: The name of the missing attribute.
    :return: The result of find() for that name. For a BS3-style
        name such as "aTag", the "Tag" suffix is dropped first and a
        DeprecationWarning is issued.
    :raise AttributeError: If the name starts with a double
        underscore, or is "contents".
    """
    if len(tag) > 3 and tag.endswith('Tag'):
        # BS3: soup.aTag -> soup.find("a")
        tag_name = tag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
                name=tag_name
            ),
            DeprecationWarning,
            # Attribute the warning to the code that accessed the
            # deprecated attribute, not to this method.
            stacklevel=2,
        )
        return self.find(tag_name)
    # We special case contents to avoid recursion.
    if not tag.startswith("__") and tag != "contents":
        return self.find(tag)
    # Report the class by name, the way NavigableString.__getattr__
    # does.
    raise AttributeError(
        "'%s' object has no attribute '%s'" % (
            self.__class__.__name__, tag))
def __eq__(self, other):
    """Returns true iff this Tag has the same name, the same attributes,
    and the same contents (recursively) as `other`.

    An object lacking a `name`, `attrs` or `contents` attribute is
    never equal to a Tag.
    """
    if self is other:
        return True
    if not all(hasattr(other, attribute)
               for attribute in ('name', 'attrs', 'contents')):
        return False
    if (self.name != other.name
            or self.attrs != other.attrs
            or len(self) != len(other)):
        return False
    # Children are compared pairwise, stopping at the first mismatch.
    return not any(
        my_child != other.contents[position]
        for position, my_child in enumerate(self.contents)
    )
def __ne__(self, other):
    """Returns true iff this Tag is not identical to `other`,
    as defined in __eq__. This is the exact negation of `==`."""
    same = (self == other)
    return not same
def __repr__(self, encoding="unicode-escape"):
    """Renders this PageElement as a string.

    Note that this definition is replaced by `__unicode__` in the
    assignment at the end of this group of methods.

    :param encoding: The encoding to use (Python 2 only).
        TODO: This is now ignored and a warning should be issued
        if a value is provided.
    :return: A (Unicode) string.
    """
    # "The return value must be a string object", i.e. Unicode
    rendered = self.decode()
    return rendered

def __unicode__(self):
    """Renders this PageElement as a Unicode string, using decode()
    with its default arguments."""
    rendered = self.decode()
    return rendered

# str() and repr() both produce the same rendering.
__str__ = __repr__ = __unicode__
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.
    """
    # Render to Unicode first; decode() is told the destination
    # encoding. Then encode the result.
    rendered = self.decode(indent_level, encoding, formatter)
    return rendered.encode(encoding, errors)
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render a Unicode representation of this PageElement and its
    contents.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string. For a hidden element, this is the
        rendering of the contents alone.
    """
    # Resolve a formatter name into a Formatter object once, up
    # front, so the lookup isn't repeated over and over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # Render each attribute as either a bare name or name="value".
    rendered_attrs = []
    for key, val in formatter.attributes(self):
        if val is None:
            rendered_attrs.append(key)
            continue
        if isinstance(val, (list, tuple)):
            val = ' '.join(val)
        elif not isinstance(val, str):
            val = str(val)
        elif (
            isinstance(val, AttributeValueWithCharsetSubstitution)
            and eventual_encoding is not None
        ):
            val = val.encode(eventual_encoding)

        text = formatter.attribute_value(val)
        rendered_attrs.append(
            str(key) + '='
            + formatter.quoted_attribute_value(text))

    prefix = (self.prefix + ":") if self.prefix else ''

    # An empty-element tag gets a closing marker inside its only tag;
    # every other tag gets a separate closing tag.
    if self.is_empty_element:
        void_close = formatter.void_element_close_prefix or ''
        closing_tag = ''
    else:
        void_close = ''
        closing_tag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    if indent_level is None:
        indent_space = ''
    else:
        indent_space = formatter.indent * (indent_level - 1)
    if pretty_print:
        closing_indent = indent_space
        indent_contents = indent_level + 1
    else:
        closing_indent = ''
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object.
        return contents

    pieces = []
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        pieces.append(indent_space)
    attribute_string = (' ' + ' '.join(rendered_attrs)) if rendered_attrs else ''
    pieces.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, void_close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print:
        if contents and contents[-1] != "\n":
            pieces.append("\n")
        if closing_tag:
            pieces.append(closing_indent)
    pieces.append(closing_tag)
    if indent_level is not None and closing_tag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        pieces.append("\n")
    return ''.join(pieces)
def _should_pretty_print(self, indent_level):
    """Should this tag be pretty-printed?

    Most of them should, but some (such as <pre> in HTML
    documents) should not.

    :param indent_level: None means no pretty-printing is going on
        at all.
    :return: False if `indent_level` is None or this tag's name is
        listed in `preserve_whitespace_tags`; True otherwise.
    """
    if indent_level is None:
        return False
    preserved = self.preserve_whitespace_tags
    return not preserved or self.name not in preserved
def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is None,
        a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding==None) or a bytestring
        (otherwise).
    """
    # True is passed as the indent level in both cases.
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A Unicode string.
    """
    # Resolve a formatter name into a Formatter object once, so the
    # lookup isn't repeated for every child.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = (indent_level is not None)
    # Whether this tag preserves whitespace is the same for every
    # child.
    preserve_whitespace = (
        self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
    )
    reformat = pretty_print and not preserve_whitespace

    pieces = []
    for child in self:
        if not isinstance(child, NavigableString):
            # A child tag renders itself, including its own
            # indentation. Anything else contributes nothing.
            if isinstance(child, Tag):
                pieces.append(child.decode(indent_level, eventual_encoding,
                                           formatter))
            continue

        text = child.output_ready(formatter)
        if text and indent_level and not preserve_whitespace:
            text = text.strip()
        if not text:
            continue
        if reformat:
            pieces.append(formatter.indent * (indent_level - 1))
        pieces.append(text)
        if reformat:
            pieces.append("\n")
    return ''.join(pieces)
def encode_contents(
    self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
    formatter="minimal"):
    """Renders the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param encoding: The bytestring will be in this encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A bytestring.
    """
    rendered = self.decode_contents(indent_level, encoding, formatter)
    return rendered.encode(encoding)
# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Delegates to encode_contents(). `indentLevel` is only passed on
    when `prettyPrint` is true; otherwise None is passed instead.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(
        indent_level=level, encoding=encoding)
#Soup methods

def find(self, name=None, attrs={}, recursive=True, string=None,
         **kwargs):
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: Passed through to find_all() as its `string`
        argument.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matched.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # The search is limited to one result.
    matches = self.find_all(name, attrs, recursive, string, 1, **kwargs)
    return matches[0] if matches else None
findChild = find  # BS2
def find_all(self, name=None, attrs={}, recursive=True, string=None,
             limit=None, **kwargs):
    """Look in the children of this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: Passed through to _find_all() as its `string`
        argument.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # A recursive search covers every descendant; a non-recursive
    # one covers the direct children only.
    candidates = self.descendants if recursive else self.children
    return self._find_all(name, attrs, string, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
1896 #Generator methods
@property
def children(self):
    """Iterate over the direct children of this PageElement.

    :return: A fresh iterator over self.contents.
    """
    # Hand back an iterator rather than the list itself, so callers
    # get something to loop over instead of the underlying container.
    direct_children = self.contents
    return iter(direct_children)
@property
def descendants(self):
    """Iterate over every element nested inside this PageElement.

    Starting at the first child, the generator follows `next_element`
    links and stops once it reaches the element that comes right after
    this element's last descendant.

    NOTE(review): the previous docstring called this a breadth-first
    sequence; following `next_element` presumably yields elements in
    document order instead -- confirm.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # Whatever follows the last descendant lies outside this subtree,
    # so it serves as the stop marker.
    sentinel = self._last_descendant().next_element
    node = self.contents[0]
    while node is not sentinel:
        yield node
        node = node.next_element
1921 # CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
    """Run a CSS selector against this element and return only the
    first match.

    The work is delegated to select() with a limit of 1.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.

    :return: The first matching Tag, or None when select() returns
       nothing.
    :rtype: bs4.element.Tag
    """
    matches = self.select(selector, namespaces, 1, **kwargs)
    if not matches:
        return None
    return matches[0]
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Run a CSS selector against this element using the SoupSieve
    library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. When None,
       self._namespaces is used instead.

    :param limit: After finding this number of results, stop looking.
       None is translated to 0 before being handed to SoupSieve.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package could not be
       imported.
    """
    if namespaces is None:
        namespaces = self._namespaces

    limit = 0 if limit is None else limit

    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    matched = soupsieve.select(selector, self, namespaces, limit, **kwargs)

    # Wrap the matches in a ResultSet for consistency with the find_*
    # methods and for the helpful error message in
    # ResultSet.__getattr__.
    return ResultSet(None, matched)
1979 # Old names for backwards compatibility
def childGenerator(self):
    """Deprecated generator.

    Kept as an old name for backwards compatibility; simply returns
    self.children.
    """
    legacy_iterator = self.children
    return legacy_iterator
def recursiveChildGenerator(self):
    """Deprecated generator.

    Kept as an old name for backwards compatibility; simply returns
    self.descendants.
    """
    legacy_iterator = self.descendants
    return legacy_iterator
def has_key(self, key):
    """Deprecated method; use has_attr(key) instead.

    Emits a DeprecationWarning and then forwards to self.has_attr().
    has_key() is gone from Python 3 dictionaries anyway.

    :param key: Passed unchanged to has_attr().
    :return: Whatever self.has_attr(key) returns.
    """
    warnings.warn(
        'has_key is deprecated. Use has_attr(key) instead.',
        DeprecationWarning,
        # Attribute the warning to the code that called has_key(),
        # not to this line inside the library.
        stacklevel=2
    )
    return self.has_attr(key)
2000# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, string=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. A
            non-dict value is treated as a filter on the 'class'
            attribute.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The
            deprecated 'text' key is used as `string` when `string` is
            None, and 'class_' is renamed to 'class'.
        """
        if string is None and 'text' in kwargs:
            string = kwargs.pop('text')
            warnings.warn(
                "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
                DeprecationWarning,
                # Point the warning at the code constructing the strainer.
                stacklevel=2
            )

        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Copy before merging so the caller's dict (and the
                # shared default) is never modified.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.string = self._normalize_search_value(string)

        # DEPRECATED but just in case someone is checking this.
        self.text = self.string

    def _normalize_search_value(self, value):
        """Convert a user-supplied filter value into a canonical form.

        :param value: Any filter value.
        :return: `value` itself when it is a str, a callable, an object
            with a `match` attribute (e.g. a compiled regular
            expression), a bool or None; decoded text for bytes; a list
            for other iterables; otherwise str(value).
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, (str, Callable, bool))
                or hasattr(value, 'match')
                or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__')
                        and not isinstance(v, (bytes, str))):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The string filter when one is set, otherwise
            "<name>|<attrs>".
        """
        if self.string:
            # The string filter may be a compiled regular expression, a
            # list, a callable or True; __str__ is required to return a
            # real str, so convert explicitly.
            return str(self.string)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or an
            existing Tag (in which case the Tag also serves as the
            attribute source).
        :param markup_attrs: A dictionary of attributes as found in some
            markup, or an iterable of (key, value) pairs.

        :return: On a match, the Tag (or the `markup_name` that was
            passed in); None when nothing matched; False when the quick
            tag-name comparison already rules the Tag out.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Optimization for a very common case where the user is
            # searching for a tag with one specific name, and we're
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                return False

        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                # Built lazily: only needed when there are attribute
                # filters to check.
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if markup_attr_map is None:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        # NOTE(review): when `found` is a bare tag name rather than a Tag,
        # `found.string` looks like it would fail -- confirm whether a
        # string filter is ever combined with prospective matching.
        if found and self.string and not self._matches(found.string, self.string):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None when nothing matches.
        :raise Exception: If `markup` is of a kind this method cannot
            handle.
        """
        # print('looking for %s in %s' % (self, markup))
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                        and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.string or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, (NavigableString, str)):
            if not self.name and not self.attrs and self._matches(markup, self.string):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Decide whether one piece of markup satisfies one filter value.

        :param markup: A Tag, a string, None, or a list/tuple of values
            (for a multi-valued attribute).
        :param match_against: The filter: True, a callable, a string, an
            object with a `search` method, or an iterable of filters.
        :param already_tried: Keys of filter items already examined,
            used to stop recursion through nested iterables.
        :return: A truthy value on a match, a falsy one otherwise. Not
            always a bool: a regular-expression filter yields whatever
            its search() returns, and a callable filter yields the
            callable's own result.
        """
        # print(u"Matching %s against %s" % (markup, match_against))
        if isinstance(markup, (list, tuple)):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
                and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if not match and isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
                and isinstance(original_markup, Tag)
                and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match
class ResultSet(list):
    """A plain list of search results that also remembers the
    SoupStrainer that produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        Only reached for attributes that normal lookup cannot find.
        """
        message = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )
        raise AttributeError(message)