Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/element.py: 25%
954 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
4try:
5 from collections.abc import Callable # Python 3.6
6except ImportError as e:
7 from collections import Callable
8import re
9import sys
10import warnings
12from bs4.css import CSS
13from bs4.formatter import (
14 Formatter,
15 HTMLFormatter,
16 XMLFormatter,
17)
# Encoding name used as this module's default.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# Matches one maximal run of non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: Unused as of 4.7.0. Kept for a while in case someone imported
# it for their own use.
whitespace_re = re.compile(r"\s+")
def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    :param attr: The name of the real attribute the alias forwards to.
    :return: A property whose getter and setter read and write `attr`
        on the instance it is accessed through.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter is called with the assigned value; pass it
        # on to the real attribute.
        setattr(self, attr, value)
    return alias
39# These encodings are recognized by Python (so PageElement.encode
40# could theoretically support them) but XML and HTML don't recognize
41# them (so they should not show up in an XML or HTML document as that
42# document's encoding).
43#
44# If an XML document is encoded in one of these encodings, no encoding
45# will be mentioned in the XML declaration. If an HTML document is
46# encoded in one of these encodings, and the HTML document has a
47# <meta> tag that mentions an encoding, the encoding will be given as
48# the empty string.
49#
50# Source:
51# https://docs.python.org/3/library/codecs.html#python-specific-encodings
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also remembers the prefix
    ('xml'), the name ('lang') and the namespace it was built from.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        # An empty name denotes the default namespace, whose name
        # "has no value" per https://www.w3.org/TR/xml-names/#defaulting
        name = name or None

        if name is None:
            text = prefix
        elif prefix:
            text = prefix + ":" + name
        else:
            # No prefix, so this is not really namespaced.
            text = name

        obj = str.__new__(cls, text)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj
class AttributeValueWithCharsetSubstitution(str):
    """Stand-in for a character encoding named inside HTML markup."""


class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' yields one of these
    objects as the value of the 'charset' attribute.
    """

    def __new__(cls, original_value):
        instance = str.__new__(cls, original_value)
        # Remember what the markup originally said.
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the charset value to emit when the document is
        encoded as `encoding`: the encoding's own name, or the empty
        string for a Python-specific encoding.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    the value of the 'content' attribute is one of these objects.
    """

    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        if cls.CHARSET_RE.search(original_value) is None:
            # Nothing to substitute later: hand back a plain string.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset replaced by
        `encoding`, or the empty string for a Python-specific encoding.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        # Keep the 'charset=' lead-in (group 1) and swap what follows.
        return self.CHARSET_RE.sub(
            lambda match: match.group(1) + encoding, self.original_value)
class PageElement(object):
    """Holds the navigational information for one part of the page,
    i.e. its current location in the parse tree.

    NavigableString, Tag, etc. are all subclasses of PageElement.
    """

    # Looking at an element is generally not enough to tell whether it
    # lives in an XML or an HTML document; for Tags (q.v.) the answer
    # can be recorded at parse time. None means "not known".
    known_xml = None
156 def setup(self, parent=None, previous_element=None, next_element=None,
157 previous_sibling=None, next_sibling=None):
158 """Sets up the initial relations between this element and
159 other elements.
161 :param parent: The parent of this element.
163 :param previous_element: The element parsed immediately before
164 this one.
166 :param next_element: The element parsed immediately before
167 this one.
169 :param previous_sibling: The most recently encountered element
170 on the same level of the parse tree as this one.
172 :param previous_sibling: The next element to be encountered
173 on the same level of the parse tree as this one.
174 """
175 self.parent = parent
177 self.previous_element = previous_element
178 if previous_element is not None:
179 self.previous_element.next_element = self
181 self.next_element = next_element
182 if self.next_element is not None:
183 self.next_element.previous_element = self
185 self.next_sibling = next_sibling
186 if self.next_sibling is not None:
187 self.next_sibling.previous_sibling = self
189 if (previous_sibling is None
190 and self.parent is not None and self.parent.contents):
191 previous_sibling = self.parent.contents[-1]
193 self.previous_sibling = previous_sibling
194 if previous_sibling is not None:
195 self.previous_sibling.next_sibling = self
def format_string(self, s, formatter):
    """Run a string through a formatter.

    :param s: A string.
    :param formatter: A Formatter object, the name of one of the
        standard formatters, or None to return `s` untouched.
    :return: The result of the formatter's `substitute` call.
    """
    if formatter is None:
        return s
    if isinstance(formatter, Formatter):
        chosen = formatter
    else:
        chosen = self.formatter_for_name(formatter)
    return chosen.substitute(s)
def formatter_for_name(self, formatter):
    """Look up or create a Formatter for the given identifier,
    if necessary.

    :param formatter: Can be a Formatter object (used as-is), a
        function (used as the entity substitution hook for an
        XMLFormatter or HTMLFormatter), or a string (used to look
        up an XMLFormatter or HTMLFormatter in the appropriate
        registry).
    """
    if isinstance(formatter, Formatter):
        return formatter
    # XML trees get an XMLFormatter, everything else an HTMLFormatter.
    formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
    if isinstance(formatter, Callable):
        return formatter_class(entity_substitution=formatter)
    return formatter_class.REGISTRY[formatter]
@property
def _is_xml(self):
    """Is this element part of an XML tree or an HTML tree?

    Used by formatter_for_name to choose between an XMLFormatter and
    an HTMLFormatter. It can be inefficient, but it should be called
    very rarely.
    """
    known = self.known_xml
    if known is not None:
        # Usually settled when the document was parsed.
        return known

    # Otherwise the element was probably built by calling the
    # constructor directly; ask the parent, if there is one.
    parent = self.parent
    if parent is not None:
        return parent._is_xml

    # Top-level object without .known_xml: use .is_xml if present,
    # else guess HTML, which is what BS is usually used on.
    return getattr(self, 'is_xml', False)
253 nextSibling = _alias("next_sibling") # BS3
254 previousSibling = _alias("previous_sibling") # BS3
# Sentinel default for `types` arguments.
default = object()


def _all_strings(self, strip=False, types=default):
    """Yield all strings of certain classes, possibly stripping them.

    Tag and NavigableString each provide their own implementation;
    this base version always raises NotImplementedError.
    """
    raise NotImplementedError()


@property
def stripped_strings(self):
    """Yield every string in this PageElement, stripped first.

    :yield: A sequence of stripped strings.
    """
    yield from self._all_strings(True)


def get_text(self, separator="", strip=False,
             types=default):
    """Join all child strings of this PageElement with a separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)


getText = get_text
text = property(get_text)
def replace_with(self, *args):
    """Replace this PageElement with one or more PageElements, keeping the
    rest of the tree the same.

    :param args: One or more PageElements.
    :return: `self`, no longer part of the tree (or unchanged, when
        the only replacement is `self`).
    :raise ValueError: If this element has no parent, or one of the
        replacements is this element's parent.
    """
    if self.parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if len(args) == 1 and args[0] is self:
        # Replacing an element with itself is a no-op; still return
        # `self` as documented rather than None.
        return self
    if any(x is self.parent for x in args):
        raise ValueError("Cannot replace a Tag with its parent.")
    old_parent = self.parent
    my_index = old_parent.index(self)
    self.extract(_self_index=my_index)
    # Put the replacements, in order, where this element used to be.
    for idx, replacement in enumerate(args, start=my_index):
        old_parent.insert(idx, replacement)
    return self


replaceWith = replace_with  # BS3
def unwrap(self):
    """Replace this PageElement with its contents.

    :return: `self`, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    my_parent = self.parent
    if my_parent is None:
        # The two literals previously ran together as "thatelement".
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree.")
    my_index = my_parent.index(self)
    self.extract(_self_index=my_index)
    # Insert a snapshot of the children back to front at the same
    # index, so they end up in their original order.
    for child in reversed(self.contents[:]):
        my_parent.insert(my_index, child)
    return self


replace_with_children = unwrap
replaceWithChildren = unwrap  # BS3
def wrap(self, wrap_inside):
    """Put this PageElement inside another one.

    :param wrap_inside: A PageElement.
    :return: `wrap_inside`, occupying the position in the tree that used
        to be occupied by `self`, and with `self` inside it.
    """
    # replace_with hands back the detached element, which is then
    # appended to the wrapper.
    wrap_inside.append(self.replace_with(wrap_inside))
    return wrap_inside
def extract(self, _self_index=None):
    """Destructively rips this element out of the tree.

    :param _self_index: The location of this element in its parent's
       .contents, if known. Passing this in allows for a performance
       optimization.

    :return: `self`, no longer part of the tree.
    """
    parent = self.parent
    if parent is not None:
        if _self_index is None:
            _self_index = parent.index(self)
        del parent.contents[_self_index]

    # Find the two elements that would be adjacent had this element
    # (and any children) never been parsed, and connect them.
    tail = self._last_descendant()
    before = self.previous_element
    after = tail.next_element
    if before is not after:
        if before is not None:
            before.next_element = after
        if after is not None:
            after.previous_element = before
    self.previous_element = None
    tail.next_element = None

    self.parent = None

    # Same stitching for the sibling chain.
    older = self.previous_sibling
    younger = self.next_sibling
    if older is not younger:
        if older is not None:
            older.next_sibling = younger
        if younger is not None:
            younger.previous_sibling = older
    self.previous_sibling = self.next_sibling = None
    return self
def _last_descendant(self, is_initialized=True, accept_self=True):
    """Finds the last element beneath this object to be parsed.

    :param is_initialized: Has `setup` been called on this PageElement
        yet?
    :param accept_self: Is `self` an acceptable answer to the question?
    """
    last_child = self
    if is_initialized and self.next_sibling is not None:
        # Shortcut: whatever was parsed just before our next sibling.
        last_child = self.next_sibling.previous_element
    # Keep descending into the final child while there is one.
    while isinstance(last_child, Tag) and last_child.contents:
        last_child = last_child.contents[-1]
    if last_child is self and not accept_self:
        return None
    return last_child


# BS3: Not part of the API!
_lastRecursiveChild = _last_descendant
def insert(self, position, new_child):
    """Insert a new PageElement in the list of this PageElement's children.

    This works the same way as `list.insert`.

    :param position: The numeric position that should be occupied
       in `self.children` by the new PageElement.
    :param new_child: A PageElement.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
        # Plain strings are promoted to NavigableString first.
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # One BeautifulSoup object must never contain another, so
        # insert its children one at a time instead.
        for offset, subchild in enumerate(list(new_child.contents)):
            self.insert(position + offset, subchild)
        return

    position = min(position, len(self.contents))
    if getattr(new_child, 'parent', None) is not None:
        if new_child.parent is self:
            # The element is already one of our children. If it is
            # being moved further down the list, extracting it shifts
            # the target index down by one.
            if self.index(new_child) < position:
                position -= 1
        new_child.extract()

    new_child.parent = self
    if position == 0:
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        previous_child.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(False)

    if position >= len(self.contents):
        new_child.next_sibling = None

        # Climb the ancestors until one has a next sibling: that is
        # the element that comes next in the document. If none does,
        # the new child's last element ends the document (None).
        ancestor = self
        following = None
        while following is None and ancestor is not None:
            following = ancestor.next_sibling
            ancestor = ancestor.parent
        new_childs_last_element.next_element = following
    else:
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if next_child is not None:
            next_child.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = new_childs_last_element
    self.contents.insert(position, new_child)
def append(self, tag):
    """Add the given PageElement at the end of this one's contents.

    :param tag: A PageElement.
    """
    end = len(self.contents)
    self.insert(end, tag)
def extend(self, tags):
    """Appends the given PageElements to this one's contents.

    :param tags: A list of PageElements. If a single Tag is
        provided instead, this PageElement's contents will be extended
        with that Tag's contents.
    """
    items = tags.contents if isinstance(tags, Tag) else tags
    if isinstance(items, list):
        # Moving items around the tree may change their position in
        # the original list, so iterate over a copy.
        items = list(items)
    for item in items:
        self.append(item)
def insert_before(self, *args):
    """Makes the given element(s) the immediate predecessor of this one.

    All the elements will have the same parent, and the given elements
    will be immediately before this one.

    :param args: One or more PageElements.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")
    for predecessor in args:
        # Extract first so the index is not thrown off when the
        # element is already a sibling.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        parent.insert(parent.index(self), predecessor)
def insert_after(self, *args):
    """Makes the given element(s) the immediate successor of this one.

    The elements will have the same parent, and the given elements
    will be immediately after this one.

    :param args: One or more PageElements.
    """
    # All error checking happens before the tree is modified.
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    for offset, successor in enumerate(args):
        # Extract first so the index is not thrown off when the
        # element is already a sibling.
        if isinstance(successor, PageElement):
            successor.extract()
        # Each successor lands one slot further past this element.
        parent.insert(parent.index(self) + 1 + offset, successor)
def find_next(self, name=None, attrs={}, string=None, **kwargs):
    """Find the first PageElement that matches the given criteria and
    appears later in the document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_all_next
    return self._find_one(finder, name, attrs, string, **kwargs)


findNext = find_next  # BS3
def find_all_next(self, name=None, attrs={}, string=None, limit=None,
                  **kwargs):
    """Find all PageElements that match the given criteria and appear
    later in the document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet containing PageElements.
    """
    # One more stack level than the caller passed in (default 2).
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(name, attrs, string, limit, self.next_elements,
                          _stacklevel=depth, **kwargs)


findAllNext = find_all_next  # BS3
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
    """Find the closest sibling to this PageElement that matches the
    given criteria and appears later in the document.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_next_siblings
    return self._find_one(finder, name, attrs, string, **kwargs)


findNextSibling = find_next_sibling  # BS3
def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
                       **kwargs):
    """Find all siblings of this PageElement that match the given criteria
    and appear later in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # One more stack level than the caller passed in (default 2).
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(name, attrs, string, limit, self.next_siblings,
                          _stacklevel=depth, **kwargs)


findNextSiblings = find_next_siblings  # BS3
fetchNextSiblings = find_next_siblings  # BS2
def find_previous(self, name=None, attrs={}, string=None, **kwargs):
    """Look backwards in the document from this PageElement and find the
    first PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_all_previous
    return self._find_one(finder, name, attrs, string, **kwargs)


findPrevious = find_previous  # BS3
def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
                      **kwargs):
    """Look backwards in the document from this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # One more stack level than the caller passed in (default 2).
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(name, attrs, string, limit,
                          self.previous_elements,
                          _stacklevel=depth, **kwargs)


findAllPrevious = find_all_previous  # BS3
fetchPrevious = find_all_previous  # BS2
def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
    """Returns the closest sibling to this PageElement that matches the
    given criteria and appears earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_previous_siblings
    return self._find_one(finder, name, attrs, string, **kwargs)


findPreviousSibling = find_previous_sibling  # BS3
def find_previous_siblings(self, name=None, attrs={}, string=None,
                           limit=None, **kwargs):
    """Returns all siblings to this PageElement that match the
    given criteria and appear earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # One more stack level than the caller passed in (default 2).
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(name, attrs, string, limit,
                          self.previous_siblings,
                          _stacklevel=depth, **kwargs)


findPreviousSiblings = find_previous_siblings  # BS3
fetchPreviousSiblings = find_previous_siblings  # BS2
def find_parent(self, name=None, attrs={}, **kwargs):
    """Find the closest parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :kwargs: A dictionary of filters on attribute values.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # NOTE: _find_one can't be used here because find_parents takes a
    # different set of arguments (it has no `string`).
    matches = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None


findParent = find_parent  # BS3
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """Find all parents of this PageElement that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.

    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    _stacklevel = kwargs.pop('_stacklevel', 2)
    # Parents are never matched on string content, so the `string`
    # argument of _find_all is always None here.
    return self._find_all(name, attrs, None, limit, self.parents,
                          _stacklevel=_stacklevel+1, **kwargs)


findParents = find_parents  # BS3
fetchParents = find_parents  # BS2
@property
def next(self):
    """The PageElement, if any, parsed right after this one; the same
    value as `.next_element`.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.next_element


@property
def previous(self):
    """The PageElement, if any, parsed right before this one; the same
    value as `.previous_element`.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.previous_element
783 #These methods do the real heavy lifting.
def _find_one(self, method, name, attrs, string, **kwargs):
    """Call a find-all style `method` with a limit of 1 and return its
    first result, or None when it finds nothing.
    """
    hits = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
    return hits[0] if hits else None
def _find_all(self, name, attrs, string, limit, generator, **kwargs):
    """Iterates over a generator looking for things that match.

    :param name: A filter on tag name, or a SoupStrainer to use as-is.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: Yields the candidate PageElements to examine.
    :kwargs: Further attribute filters. `_stacklevel` is consumed for
        the deprecation warning; the deprecated `text` is treated as
        `string`.
    :return: A ResultSet of the matches.
    """
    _stacklevel = kwargs.pop('_stacklevel', 3)

    if string is None and 'text' in kwargs:
        string = kwargs.pop('text')
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning, stacklevel=_stacklevel
        )

    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, string, **kwargs)

    if string is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(':') == 1:
                # This is a name with a prefix. If this is a namespace-aware document,
                # we need to match the local name against tag.name. If not,
                # we need to match the fully-qualified name against tag.name.
                prefix, local_name = name.split(':', 1)
            else:
                prefix = None
                local_name = name
            # The name tests are grouped explicitly: `and` binds
            # tighter than `or`, so without the parentheses the second
            # alternative would bypass the isinstance(element, Tag)
            # guard.
            result = (element for element in generator
                      if isinstance(element, Tag)
                      and (
                          element.name == name
                          or (
                              element.name == local_name
                              and (prefix is None or element.prefix == prefix)
                          )
                      ))
            return ResultSet(strainer, result)

    results = ResultSet(strainer)
    for candidate in generator:
        # Falsy elements are skipped without consulting the strainer.
        if candidate:
            found = strainer.search(candidate)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
    return results
848 #These generators can be used to navigate starting from both
849 #NavigableStrings and Tags.
@property
def next_elements(self):
    """Every PageElement parsed after this one, following the
    `.next_element` links.

    :yield: A sequence of PageElements.
    """
    current = self.next_element
    while True:
        if current is None:
            return
        yield current
        current = current.next_element
@property
def next_siblings(self):
    """Every sibling of this PageElement that was parsed later,
    following the `.next_sibling` links.

    :yield: A sequence of PageElements.
    """
    current = self.next_sibling
    while True:
        if current is None:
            return
        yield current
        current = current.next_sibling
@property
def previous_elements(self):
    """Every PageElement parsed before this one, following the
    `.previous_element` links.

    :yield: A sequence of PageElements.
    """
    current = self.previous_element
    while True:
        if current is None:
            return
        yield current
        current = current.previous_element
@property
def previous_siblings(self):
    """Every sibling of this PageElement that was parsed earlier,
    following the `.previous_sibling` links.

    :yield: A sequence of PageElements.
    """
    current = self.previous_sibling
    while True:
        if current is None:
            return
        yield current
        current = current.previous_sibling
@property
def parents(self):
    """Every PageElement that is a parent of this one, nearest first,
    following the `.parent` links.

    :yield: A sequence of PageElements.
    """
    current = self.parent
    while True:
        if current is None:
            return
        yield current
        current = current.parent
@property
def decomposed(self):
    """Check whether a PageElement has been decomposed.

    Reads the `_decomposed` attribute; a missing or falsy value is
    reported as False.

    :rtype: bool
    """
    flag = getattr(self, '_decomposed', False)
    return flag if flag else False
# Method-style spellings of the generator properties, kept for
# backwards compatibility with BS3.
def nextGenerator(self):
    """Same as the `.next_elements` property."""
    return self.next_elements


def nextSiblingGenerator(self):
    """Same as the `.next_siblings` property."""
    return self.next_siblings


def previousGenerator(self):
    """Same as the `.previous_elements` property."""
    return self.previous_elements


def previousSiblingGenerator(self):
    """Same as the `.previous_siblings` property."""
    return self.previous_siblings


def parentGenerator(self):
    """Same as the `.parents` property."""
    return self.parents
class NavigableString(str, PageElement):
    """A Python Unicode string that lives inside a parse tree.

    Parsing the markup <b>penguin</b> produces one NavigableString
    holding the text "penguin".
    """

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        During unpickling this method receives the string encoded in
        DEFAULT_OUTPUT_ENCODING, so that encoding has to be handed to
        ``str.__new__`` for non-ASCII characters to be understood.
        """
        if isinstance(value, str):
            str_args = (value,)
        else:
            str_args = (value, DEFAULT_OUTPUT_ENCODING)
        instance = str.__new__(cls, *str_args)
        instance.setup()
        return instance

    def __deepcopy__(self, memo, recursive=False):
        """Return a string of the same class and content that is not
        connected to the parse tree.

        :param recursive: Ignored; present only so the signature
           matches Tag.__deepcopy__.
        """
        own_class = type(self)
        return own_class(self)

    def __copy__(self):
        """Copy this string. The copy is always a deep copy, since a
        given place in a parse tree can hold only one PageElement.
        """
        memo = {}
        return self.__deepcopy__(memo)

    def __getnewargs__(self):
        return (str(self),)

    def __getattr__(self, attr):
        """Make text.string return the text itself.

        This exists for backwards compatibility for Navigable*String,
        and for CData* it gives access to the string without the
        CData wrapper. Every other missing attribute raises
        AttributeError.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Pass the string through the given formatter and wrap the
        result in this class's PREFIX and SUFFIX.

        :param formatter: A Formatter object, or a string naming one of the standard formatters.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None: a NavigableString is not a Tag and has no .name.

        The property exists so that code like this works on a mixture
        of Tag and NavigableString objects without crashing:
         [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Refuse every attempt to give a NavigableString a name."""
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(self, strip=False, types=PageElement.default):
        """Yield this string if it is of an interesting type.

        This lets NavigableString offer conveniences such as
        get_text(), so that all PageElements share one
        text-extraction API.

        :param strip: If True, the string is stripped before being
           yielded.

        :param types: A NavigableString subclass or a tuple of them.
            If this string's exact type is not among them, nothing is
            yielded. By default the types considered are
            Tag.DEFAULT_INTERESTING_STRING_TYPES (NavigableString and
            CData), which excludes comments, processing instructions,
            etc.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # The default lives on Tag because it refers to subclasses
            # of this class that are defined later in the file.
            types = Tag.DEFAULT_INTERESTING_STRING_TYPES

        # Exact types are compared instead of using isinstance(),
        # because every one of these classes subclasses
        # NavigableString; a caller asking for NavigableString
        # probably does not want the specialized subclasses.
        own_type = type(self)
        if types is not None:
            if isinstance(types, type):
                # A single type was requested.
                wanted = own_type is types
            else:
                # A collection of types was requested.
                wanted = own_type in types
            if not wanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text
    strings = property(_all_strings)
class PreformattedString(NavigableString):
    """A NavigableString that bypasses the normal formatting rules.

    Abstract base for special kinds of strings such as comments (the
    Comment class) and CDATA blocks (the CData class).
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Wrap the raw string in the subclass-specific prefix and suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. When given, the string is
            passed through it purely to trigger side effects; the
            formatted result is discarded.

        :return: The unmodified string with PREFIX and SUFFIX added.
        """
        if formatter is not None:
            # Called for its side effects only.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
    """A CDATA block, rendered between '<![CDATA[' and ']]>'."""

    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction, rendered between '<?' and '>'."""

    PREFIX = '<?'
    SUFFIX = '>'
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction, rendered between '<?' and '?>'."""

    PREFIX = '<?'
    SUFFIX = '?>'
class Comment(PreformattedString):
    """An HTML or XML comment, rendered between '<!--' and '-->'."""

    PREFIX = '<!--'
    SUFFIX = '-->'
class Declaration(PreformattedString):
    """An XML declaration, rendered between '<?' and '?>'."""

    PREFIX = '<?'
    SUFFIX = '?>'
class Doctype(PreformattedString):
    """A document type declaration."""

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build the document type declaration matching a root element
        name, a public ID and a system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
          e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
          e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: A Doctype.
        """
        has_public = pub_id is not None
        has_system = system_id is not None

        declaration = name or ''
        if has_public:
            declaration += ' PUBLIC "%s"' % pub_id
            if has_system:
                # After PUBLIC the system ID is written without a keyword.
                declaration += ' "%s"' % system_id
        elif has_system:
            declaration += ' SYSTEM "%s"' % system_id

        return Doctype(declaration)
class Stylesheet(NavigableString):
    """A NavigableString holding a stylesheet (probably CSS).

    The separate class lets embedded stylesheets be told apart from
    textual content.
    """
class Script(NavigableString):
    """A NavigableString holding an executable script (probably
    Javascript).

    The separate class lets executable code be told apart from
    textual content.
    """
class TemplateString(NavigableString):
    """A NavigableString found inside an HTML template that is
    embedded in a larger document.

    The separate class lets such strings be told apart from the main
    body of the document.
    """
class RubyTextString(NavigableString):
    """A NavigableString holding the contents of the <rt> HTML
    element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element

    The separate class lets such strings be told apart from the
    strings they're annotating.
    """
class RubyParenthesisString(NavigableString):
    """A NavigableString holding the contents of the <rp> HTML
    element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
    """
1191class Tag(PageElement):
1192 """Represents an HTML or XML tag that is part of a parse tree, along
1193 with its attributes and contents.
1195 When Beautiful Soup parses the markup <b>penguin</b>, it will
1196 create a Tag object representing the <b> tag.
1197 """
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None,
             is_xml=None, sourceline=None, sourcepos=None,
             can_be_empty_element=None, cdata_list_attributes=None,
             preserve_whitespace_tags=None,
             interesting_string_types=None,
             namespaces=None
             ):
    """Basic constructor.

    :param parser: A BeautifulSoup object.
    :param builder: A TreeBuilder.
    :param name: The name of the tag.
    :param namespace: The URI of this Tag's XML namespace, if any.
    :param prefix: The prefix for this Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values.
    :param parent: The PageElement to use as this Tag's parent.
    :param previous: The PageElement that was parsed immediately before
        this tag.
    :param is_xml: If True, this is an XML tag. Otherwise, this is an
        HTML tag. Only consulted when no builder is given.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param can_be_empty_element: If True, this tag should be
        represented as <tag/>. If False, this tag should be represented
        as <tag></tag>. Only used when no builder is given.
    :param cdata_list_attributes: A list of attributes whose values should
        be treated as CDATA if they ever show up on this tag. Only
        used when no builder is given.
    :param preserve_whitespace_tags: A list of tag names whose contents
        should have their whitespace preserved. Only used when no
        builder is given.
    :param interesting_string_types: A NavigableString subclass or a
        tuple of them: the types of strings considered by methods
        like Tag.strings or Tag.get_text. Only used when no builder
        is given.
    :param namespaces: A dictionary mapping currently active
        namespace prefixes to URIs. This can be used later to
        construct CSS selectors.
    :raise ValueError: If `name` is None.
    """
    # Only the parser's class is remembered, never the parser itself:
    # that lets extracted chunks be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__

    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # Position information is recorded only if some was supplied and
    # the builder (when there is one) stores line numbers.
    may_store_position = not builder or builder.store_line_numbers
    if may_store_position and (
            sourceline is not None or sourcepos is not None):
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    if attrs is None:
        attrs = {}
    elif attrs and builder is not None and builder.cdata_list_attributes:
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        attrs = dict(attrs)

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    self.known_xml = builder.is_xml if builder else is_xml

    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # Without a TreeBuilder, fall back on the values passed in.
        # They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
        return

    # Set up any substitutions for this tag, such as the charset in
    # a META tag.
    builder.set_up_substitutions(self)

    # Ask the TreeBuilder whether this tag might be an empty-element
    # tag.
    self.can_be_empty_element = builder.can_be_empty_element(name)

    # The builder's whole data structure is stored rather than
    # asking about every tag: asking would mean building a new data
    # structure every time, and (unlike can_be_empty_element) this
    # almost never needs to be checked.
    self.cdata_list_attributes = builder.cdata_list_attributes

    # The tag names that might cause this tag to be treated as a
    # whitespace-preserved tag.
    self.preserve_whitespace_tags = builder.preserve_whitespace_tags

    if self.name in builder.string_containers:
        # The builder registers a special string container for tags
        # with this name; use it as the interesting string type.
        self.interesting_string_types = builder.string_containers[self.name]
    else:
        self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
# Backwards-compatible camelCase alias for the parser_class attribute.
parserClass = _alias("parser_class")  # BS3
def __deepcopy__(self, memo, recursive=True):
    """Return a new Tag that is unconnected to the parse tree and
    whose contents are copies of this Tag's contents.

    :param memo: Passed through to the __deepcopy__ method of every
        descendant.
    :param recursive: If False, only this Tag is cloned and the
        clone is left without contents.
    """
    clone = self._clone()
    if not recursive:
        return clone

    # Copy the descendants iteratively (no recursive function
    # calls). The last entry of `open_clones` is the clone that is
    # currently receiving children.
    open_clones = [clone]
    for event, element in self._event_stream(self.descendants):
        if event is Tag.END_ELEMENT_EVENT:
            # This Tag was just closed; stop appending to its clone.
            open_clones.pop()
            continue

        copied = element.__deepcopy__(memo, recursive=False)
        # Attach the copy to its parent's .contents
        open_clones[-1].append(copied)

        if event is Tag.START_ELEMENT_EVENT:
            # The upcoming elements are children of this Tag, so
            # they must be .appended to its clone.
            open_clones.append(copied)
    return clone
def __copy__(self):
    """Copy this Tag. The copy is always a deep copy, because a
    Tag's children can only have one parent at a time.
    """
    memo = {}
    return self.__deepcopy__(memo)
def _clone(self):
    """Create a new Tag just like this one, but with no
    contents and unattached to any parse tree.

    This is the first step in the deepcopy process.

    :return: A new object of the same class as this one.
    """
    # `builder`, `sourceline` and `sourcepos` are not guaranteed to
    # be instance attributes (the constructor in this file never
    # stores `builder` and only conditionally stores the other two).
    # Reading them with plain attribute access would fall through to
    # __getattr__, which performs a find() and could hand back a
    # descendant tag that happens to be named <builder>, <sourceline>
    # or <sourcepos>. Read them from the instance dictionary instead,
    # so a missing value is None.
    own_attributes = vars(self)
    clone = type(self)(
        None, own_attributes.get('builder'), self.name, self.namespace,
        self.prefix, self.attrs, is_xml=self._is_xml,
        sourceline=own_attributes.get('sourceline'),
        sourcepos=own_attributes.get('sourcepos'),
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags,
        interesting_string_types=self.interesting_string_types
    )
    # These may have been changed after construction, so copy the
    # current values over explicitly.
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    return clone
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag with contents is never an empty-element tag.

    For a tag without contents the answer is whatever
    `can_be_empty_element` says, which depends on the builder that
    created the tag. A builder with a designated list of
    empty-element tags only allows tags whose names are on that
    list; a builder without such a list treats every tag with no
    contents as an empty-element tag.
    """
    has_no_contents = len(self.contents) == 0
    return has_no_contents and self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    TODO It might make sense to have NavigableString.string return
    itself.

    :return: If this element has exactly one child and that child is
     a string, that string. If the only child is a tag, the
     'string' attribute of that tag, recursively. If this element
     has no children or more than one child, None.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    if not isinstance(only_child, NavigableString):
        return only_child.string
    return only_child

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`."""
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
def _all_strings(self, strip=False, types=PageElement.default):
    """Yield the descendant strings of certain classes, possibly
    stripping them.

    :param strip: If True, each string is stripped before being
        yielded, and strings that end up empty are skipped.

    :param types: A NavigableString subclass or a tuple of them.
        Strings whose exact type is not among them are ignored. By
        default, the types considered are the ones in
        self.interesting_string_types. If that's not specified,
        only NavigableString and CData objects will be
        considered. That means no comments, processing
        instructions, etc.

    :yield: A sequence of strings.
    """
    if types is self.default:
        types = self.interesting_string_types
    single_type = isinstance(types, type)

    for node in self.descendants:
        if types is None and not isinstance(node, NavigableString):
            continue
        node_type = type(node)
        if single_type:
            if node_type is not types:
                # Not a string of the one requested type.
                continue
        elif types is not None and node_type not in types:
            # Not a string of any of the requested types.
            continue
        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node
strings = property(_all_strings)
def decompose(self):
    """Recursively destroys this PageElement and its children.

    The element is first extracted from the tree. Then it and every
    element reachable from it through `next_element` are wiped out:
    their attributes are cleared, leaving only an empty `contents`
    list and the `_decomposed` marker.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    node = self
    while node is not None:
        # Grab the successor first: clearing __dict__ erases the link.
        following = node.next_element
        node.__dict__.clear()
        node.contents = []
        node._decomposed = True
        node = following
def clear(self, decompose=False):
    """Remove every child of this PageElement.

    :param decompose: If this is True, child Tags are removed with
        decompose() (a more destructive method) instead of
        extract(). Children that are not Tags are always removed
        with extract().
    """
    # Iterate over a snapshot, since removing a child changes
    # self.contents.
    for child in self.contents[:]:
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree.
    """
    def is_plain_string(node):
        # Strings that may be merged: NavigableStrings other than
        # PreformattedStrings.
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Record the first position of every adjacent pair that needs
    # consolidating, instead of copying self.contents: in most cases
    # very few strings are affected.
    to_merge = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # The last item in .contents has no right-hand neighbor
            # to merge with.
            continue
        following = self.contents[position + 1]
        if is_plain_string(current) and is_plain_string(following):
            to_merge.append(position)

    # Work through the recorded positions back to front, so that
    # removing items from .contents doesn't shift the positions
    # still to be processed.
    for position in reversed(to_merge):
        first = self.contents[position]
        second = self.contents[position + 1]
        second.extract()
        first.replace_with(NavigableString(first + second))
def index(self, element):
    """Find the index of a child by identity, not value.

    Unlike tag.contents.index(element), this never returns the
    position of a different child that merely compares equal.

    :param element: Look for this PageElement in `self.contents`.
    :raise ValueError: If `element` is not one of the children.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None
    )
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position
def get(self, key, default=None):
    """Look up the 'key' attribute of this tag.

    :return: The attribute's value, or `default` if the tag doesn't
        have that attribute.
    """
    attributes = self.attrs
    return attributes.get(key, default)
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: The value itself if it is already a list; otherwise a
        one-item list wrapping the value.
    """
    found = self.get(key, default)
    return found if isinstance(found, list) else [found]
def has_attr(self, key):
    """Does this PageElement have an attribute with the given name?"""
    attributes = self.attrs
    return key in attributes
def __hash__(self):
    """Hash a Tag by hashing its string rendering."""
    rendered = str(self)
    return rendered.__hash__()
def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the Tag,
    and throws an exception if it's not there."""
    attributes = self.attrs
    return attributes[key]
def __iter__(self):
    "Iterating over a Tag iterates over its contents."
    children = self.contents
    return iter(children)
def __len__(self):
    "The length of a Tag is the length of its list of contents."
    children = self.contents
    return len(children)
def __contains__(self, x):
    "`x in tag` checks whether x is among the Tag's contents."
    children = self.contents
    return x in children
def __bool__(self):
    """A Tag is always truthy, even when it has no contents.

    Without this, truthiness would be derived from __len__ and an
    empty Tag would look like None in a boolean test.
    """
    return True
def __setitem__(self, key, value):
    """Setting tag[key] sets the value of the 'key' attribute for the
    tag."""
    attributes = self.attrs
    attributes[key] = value
def __delitem__(self, key):
    """Deleting tag[key] removes the 'key' attribute from the tag.

    Deleting an attribute the tag doesn't have is not an error.
    """
    attributes = self.attrs
    attributes.pop(key, None)
def __call__(self, *args, **kwargs):
    """Calling a Tag like a function is the same as calling its
    find_all() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    matches = self.find_all(*args, **kwargs)
    return matches
1605 def __getattr__(self, tag):
1606 """Calling tag.subtag is the same as calling tag.find(name="subtag")"""
1607 #print("Getattr %s.%s" % (self.__class__, tag))
1608 if len(tag) > 3 and tag.endswith('Tag'):
1609 # BS3: soup.aTag -> "soup.find("a")
1610 tag_name = tag[:-3]
1611 warnings.warn(
1612 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
1613 name=tag_name
1614 ),
1615 DeprecationWarning, stacklevel=2
1616 )
1617 return self.find(tag_name)
1618 # We special case contents to avoid recursion.
1619 elif not tag.startswith("__") and not tag == "contents":
1620 return self.find(tag)
1621 raise AttributeError(
1622 "'%s' object has no attribute '%s'" % (self.__class__, tag))
def __eq__(self, other):
    """Returns true iff this Tag has the same name, the same attributes,
    and the same contents (recursively) as `other`."""
    if self is other:
        return True

    # `other` has to look like a Tag at all.
    for required in ('name', 'attrs', 'contents'):
        if not hasattr(other, required):
            return False

    if self.name != other.name:
        return False
    if self.attrs != other.attrs:
        return False
    if len(self) != len(other):
        return False

    # Same number of children: compare them pairwise.
    return not any(
        my_child != other.contents[position]
        for position, my_child in enumerate(self.contents)
    )
def __ne__(self, other):
    """Returns true iff this Tag is not identical to `other`,
    as defined in __eq__."""
    are_equal = self == other
    return not are_equal
def __repr__(self, encoding="unicode-escape"):
    """Renders this PageElement as a string.

    :param encoding: The encoding to use (Python 2 only).
        TODO: This is now ignored and a warning should be issued
        if a value is provided.
    :return: A (Unicode) string.
    """
    # __repr__ has to return a string object, i.e. Unicode, which is
    # exactly what decode() produces.
    rendered = self.decode()
    return rendered
def __unicode__(self):
    """Renders this PageElement as a Unicode string."""
    rendered = self.decode()
    return rendered

# str() and repr() both use the Unicode rendering.
__str__ = __repr__ = __unicode__
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        str.encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.
    """
    # Render to Unicode first, then encode the Unicode.
    rendered = self.decode(indent_level, encoding, formatter)
    return rendered.encode(encoding, errors)
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal",
           iterator=None):
    """Render this Tag and its contents as a Unicode string.

    The tree is walked through `_event_stream` rather than through
    recursive calls; each event is turned into one piece of text and
    the pieces are joined at the end.

    :param indent_level: If this is None, no pretty-printing
        whitespace is added. Otherwise it is the indentation level
        at which rendering starts; True is treated as 0.
    :param eventual_encoding: The encoding the output is destined
        for. decode() does not perform that encoding; the value is
        only passed along to `_format_tag`.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param iterator: An alternate iterator to use when traversing
        the tree; passed along to `_event_stream`.
    :return: A Unicode string.
    """
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # prettify() passes True to ask for pretty-printing starting at
    # the outermost level.
    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            piece = element._format_tag(
                eventual_encoding, formatter, opening=True
            )
        elif event is Tag.END_ELEMENT_EVENT:
            piece = element._format_tag(
                eventual_encoding, formatter, opening=False
            )
            # A closing tag is indented at the level of its opening
            # tag, so step back out before indenting it.
            if indent_level is not None:
                indent_level -= 1
        else:
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not element._should_pretty_print()):
                # We are about to enter string literal mode. Add
                # whitespace before this tag, but not after. We
                # will stay in string literal mode until this tag
                # is closed.
                indent_before = True
                indent_after = False
                string_literal_tag = element
        elif (event is Tag.END_ELEMENT_EVENT
              and element is string_literal_tag):
                # We are about to exit string literal mode by closing
                # the tag that sent us into that mode. Add whitespace
                # after this tag, but not before.
                indent_before = False
                indent_after = True
                string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if (indent_before or indent_after):
                # Strings are stripped before being indented, and a
                # string that is empty after stripping gets no
                # indentation at all.
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter,
                        indent_before, indent_after
                    )
            # Everything up to the matching closing tag is one level
            # deeper.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)
# Names for the different events yielded by _event_stream. Each one is
# a unique sentinel object.
# A Tag that is not an empty-element tag begins.
START_ELEMENT_EVENT = object()
# A Tag that was previously reported with START_ELEMENT_EVENT has closed.
END_ELEMENT_EVENT = object()
# A Tag whose is_empty_element is true; no end event follows it.
EMPTY_ELEMENT_EVENT = object()
# Any element that is not a Tag.
STRING_ELEMENT_EVENT = object()
def _event_stream(self, iterator=None):
    """Yield a sequence of events that can be used to reconstruct the DOM
    for this element.

    This lets us recreate the nested structure of this element
    (e.g. when formatting it as a string) without using recursive
    method calls.

    This is similar in concept to the SAX API, but it's a simpler
    interface designed for internal use. The events are different
    from SAX and the arguments associated with the events are Tags
    and other Beautiful Soup objects.

    :param iterator: An alternate iterator to use when traversing
     the tree. Defaults to `self.self_and_descendants`.
    :yield: (event, element) 2-tuples, where `event` is one of the
     Tag.*_EVENT sentinels.
    """
    # The Tags that have been started but not yet closed, innermost
    # last.
    tag_stack = []

    iterator = iterator or self.self_and_descendants

    for c in iterator:
        # If the parent of the element we're about to yield is not
        # the tag currently on the stack, it means that the tag on
        # the stack closed before this element appeared.
        #
        # This must be an identity check. Using != would go through
        # Tag.__eq__, which compares name, attributes and contents
        # recursively; that is both slower and not what is meant
        # here.
        while tag_stack and c.parent is not tag_stack[-1]:
            now_closed_tag = tag_stack.pop()
            yield Tag.END_ELEMENT_EVENT, now_closed_tag

        if isinstance(c, Tag):
            if c.is_empty_element:
                yield Tag.EMPTY_ELEMENT_EVENT, c
            else:
                yield Tag.START_ELEMENT_EVENT, c
                # Stays open until an element with a different
                # parent shows up, or the iterator is exhausted.
                tag_stack.append(c)
        else:
            yield Tag.STRING_ELEMENT_EVENT, c

    # Close whatever is still open, innermost first.
    while tag_stack:
        now_closed_tag = tag_stack.pop()
        yield Tag.END_ELEMENT_EVENT, now_closed_tag
def _indent_string(self, s, indent_level, formatter,
                   indent_before, indent_after):
    """Add indentation whitespace before and/or after a string.

    :param s: The string to amend with whitespace.
    :param indent_level: The indentation level; affects how much
        whitespace goes before the string.
    :param formatter: The Formatter whose `indent` string is
        repeated once per indentation level.
    :param indent_before: Whether or not to add whitespace
        before the string.
    :param indent_after: Whether or not to add whitespace
        (a newline) after the string.
    """
    if indent_before and indent_level:
        leading = formatter.indent * indent_level
    else:
        leading = ''
    trailing = "\n" if indent_after else ''
    return leading + s + trailing
def _format_tag(self, eventual_encoding, formatter, opening):
    """Render the opening or closing markup of this tag.

    :param eventual_encoding: The encoding the document is destined
        for; attribute values that are
        AttributeValueWithCharsetSubstitution objects are encoded
        with it unless it is None.
    :param formatter: The Formatter that supplies and formats the
        attributes.
    :param opening: True to render the opening tag (with
        attributes), False to render the closing tag.
    :return: A string such as '<a href="...">' or '</a>'.
    """
    # A tag starts with the < character (see below).

    # Then the / character, if this is a closing tag.
    closing_slash = '' if opening else '/'

    # Then an optional namespace prefix.
    prefix = self.prefix + ":" if self.prefix else ''

    # Then a list of attribute values, if this is an opening tag.
    attribute_string = ''
    if opening:
        rendered = []
        for key, val in formatter.attributes(self):
            if val is None:
                # An attribute without a value: the name alone.
                rendered.append(key)
                continue

            if isinstance(val, (list, tuple)):
                val = ' '.join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                    isinstance(val, AttributeValueWithCharsetSubstitution)
                    and eventual_encoding is not None
            ):
                val = val.encode(eventual_encoding)

            text = formatter.attribute_value(val)
            rendered.append(
                str(key) + '='
                + formatter.quoted_attribute_value(text))
        if rendered:
            attribute_string = ' ' + ' '.join(rendered)

    # Then an optional closing slash (for a void element in an
    # XML document).
    if self.is_empty_element:
        void_element_closing_slash = formatter.void_element_close_prefix or ''
    else:
        void_element_closing_slash = ''

    # Put it all together.
    return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
1896 def _should_pretty_print(self, indent_level=1):
1897 """Should this tag be pretty-printed?
1899 Most of them should, but some (such as <pre> in HTML
1900 documents) should not.
1901 """
1902 return (
1903 indent_level is not None
1904 and (
1905 not self.preserve_whitespace_tags
1906 or self.name not in self.preserve_whitespace_tags
1907 )
1908 )
def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is
        None, the result comes from decode(); otherwise it comes
        from encode() called with this encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding==None) or a bytestring
        (otherwise).
    """
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Render the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: Whatever decode() returns when asked to iterate over
        this element's descendants.
    """
    # Rendering is delegated to decode(), restricted to the
    # descendants of this element.
    contents_iterator = self.descendants
    return self.decode(
        indent_level, eventual_encoding, formatter,
        iterator=contents_iterator)
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Render the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many levels. (The formatter decides what a
        'level' means in terms of spaces or other characters
        output.) Used internally in recursive calls while
        pretty-printing.

    :param encoding: The bytestring will be in this encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A bytestring.
    """
    # Render to text first, then encode the text in one step.
    rendered = self.decode_contents(indent_level, encoding, formatter)
    return rendered.encode(encoding)
# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Forwards to encode_contents(). indentLevel is only honoured
    when prettyPrint is true; otherwise None is passed as the
    indent level.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)
#Soup methods

def find(self, name=None, attrs={}, recursive=True, string=None,
         **kwargs):
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: Forwarded to find_all() as its `string` argument.
    :kwargs: A dictionary of filters on attribute values.
    :return: The first result of find_all(), or None when
        find_all() returns nothing.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Ask find_all() for at most one result. The _stacklevel value
    # is forwarded so it can be adjusted for this extra call frame.
    matches = self.find_all(name, attrs, recursive, string, 1,
                            _stacklevel=3, **kwargs)
    return matches[0] if matches else None
findChild = find #BS2
def find_all(self, name=None, attrs={}, recursive=True, string=None,
             limit=None, **kwargs):
    """Look in the children of this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param string: Forwarded to _find_all() as its third argument.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values. A
        '_stacklevel' entry, if present, is removed, incremented by
        one and forwarded as the `_stacklevel` keyword.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    stacklevel = kwargs.pop('_stacklevel', 2)
    # Search everything below this element, or just the direct
    # children when recursion is turned off.
    candidates = self.descendants if recursive else self.children
    return self._find_all(name, attrs, string, limit, candidates,
                          _stacklevel=stacklevel + 1, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
#Generator methods
@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :return: An iterator over self.contents.
    """
    # Hand back an explicit iterator to make the purpose of the
    # property clear.
    # XXX This seems to be untested.
    direct_children = self.contents
    return iter(direct_children)
@property
def self_and_descendants(self):
    """Iterate over this PageElement followed by everything that
    self.descendants yields.

    This element itself is left out when it is marked as hidden.

    :yield: A sequence of PageElements.
    """
    if not self.hidden:
        yield self
    yield from self.descendants
@property
def descendants(self):
    """Iterate over everything beneath this PageElement.

    Starts at the first entry of self.contents and follows
    next_element links, stopping before the element that comes
    after this element's last descendant.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # The element right after our last descendant marks the end of
    # the walk.
    sentinel = self._last_descendant().next_element
    node = self.contents[0]
    while node is not sentinel:
        yield node
        node = node.next_element
# CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    Delegates to the select_one() method of the object returned by
    self.css.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param kwargs: Keyword arguments to be passed into Soup Sieve's
       soupsieve.select() method.

    :return: A Tag.
    :rtype: bs4.element.Tag
    """
    css_api = self.css
    return css_api.select_one(selector, namespaces, **kwargs)
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library, by way of the select() method
    of the object returned by self.css.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
       soupsieve.select() method.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)
@property
def css(self):
    """Return an interface to the CSS selector API.

    A new CSS object wrapping this element is built on every access.
    """
    selector_api = CSS(self)
    return selector_api
# Old names for backwards compatibility
def childGenerator(self):
    """Deprecated generator; returns the same thing as the
    `children` attribute."""
    direct_children = self.children
    return direct_children
def recursiveChildGenerator(self):
    """Deprecated generator; returns the same thing as the
    `descendants` attribute."""
    everything_below = self.descendants
    return everything_below
def has_key(self, key):
    """Deprecated method. This was kind of misleading because
    has_key() looked at attributes while the `in` operator looks at
    contents.

    has_key() is gone in Python 3, anyway.

    :param key: The attribute name, forwarded to has_attr().
    :return: Whatever has_attr(key) returns. A DeprecationWarning is
        issued on every call.
    """
    deprecation_notice = 'has_key is deprecated. Use has_attr(key) instead.'
    # stacklevel=2 attributes the warning to the caller of has_key().
    warnings.warn(deprecation_notice, DeprecationWarning, stacklevel=2)
    return self.has_attr(key)
# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, string=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. Any
            non-dict value is treated as a filter on the 'class'
            attribute. The dictionary passed in is never modified.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The
            deprecated 'text' keyword is accepted as an alias for
            `string` (only when `string` is None), and 'class_' is
            accepted as a spelling of 'class'.
        """
        if string is None and 'text' in kwargs:
            string = kwargs.pop('text')
            warnings.warn(
                "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
                DeprecationWarning, stacklevel=2
            )

        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Work on a copy so the caller's dictionary (or the
                # shared default) is left untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in list(attrs.items())
        }
        self.string = self._normalize_search_value(string)

        # DEPRECATED but just in case someone is checking this.
        self.text = self.string

    def _normalize_search_value(self, value):
        """Convert a search criterion into a form _matches() can use.

        :param value: A criterion passed to the constructor, or a
            piece of markup about to be compared.
        :return: `value` itself if it is a string, a callable, an
            object with a `match` attribute, a boolean or None; the
            UTF-8 decoding of a bytestring; a list of normalized
            items for any other iterable; otherwise str(value).
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, Callable)
                or hasattr(value, 'match')
                or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                        and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        # The unicode(str()) thing is so this will do the same thing on Python 2
        # and Python 3.
        return str(str(value))

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The string criterion rendered as text when one is
            set (and truthy); otherwise "<name>|<attrs>".
        """
        if self.string:
            # self.string may be a regular expression, a list, a
            # callable or True, and __str__ must return a real
            # string, so convert explicitly.
            return str(self.string)
        else:
            return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or an
            existing Tag (in which case the Tag is also used as the
            source of attribute values).
        :param markup_attrs: A dictionary of attributes as found in some
            markup, or an iterable of (key, value) pairs.

        :return: A true value if the prospective tag would match this
            SoupStrainer (the Tag that was passed in, or else
            `markup_name`); a false value otherwise (None, or False
            when the fast name comparison rules the tag out).
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Optimization for a very common case where the user is
            # searching for a tag with one specific name, and we're
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                return False

        # A callable name filter is invoked with (name, attrs) only
        # when we were not handed an actual Tag.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            # Build a lookup table out of the
                            # (key, value) pairs.
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        # A name/attribute match still has to satisfy the string
        # criterion, if there is one.
        if found and self.string and not self._matches(found.string, self.string):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or a false value when nothing
            matches.
        :raise Exception: If `markup` is not an iterable, a Tag, a
            NavigableString or a str.
        """
        # print('looking for %s in %s' % (self, markup))
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.string or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.string):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Test one piece of markup against one search criterion.

        :param markup: A Tag, a string, a list or tuple of strings (a
            multi-valued attribute), or None.
        :param match_against: True, a callable, a string, an object
            with a `search` method, an iterable of any of these, or
            a false value.
        :param already_tried: Items of an iterable criterion that
            have already been tested, so they are not tested twice.
        :return: A true value on a match, a false value otherwise.
            Not necessarily a bool: a callable's or regular
            expression's own result is returned as-is.
        """
        # print(u"Matching %s against %s" % (markup, match_against))
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
                and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                else:
                    already_tried.add(key)
                    if self._matches(original_markup, item, already_tried):
                        return True
            # No item in the iterable matched.
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False

        if not match and isinstance(match_against, str):
            # Exact string match
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
                and isinstance(original_markup, Tag)
                and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match
class ResultSet(list):
    """A list of search results that also remembers the SoupStrainer
    that created it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        Python only calls this for attributes that normal lookup
        failed to find, so every call ends in an AttributeError.
        """
        explanation = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )
        raise AttributeError(explanation)