Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/soupsieve/css_match.py: 17%
959 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
1"""CSS matcher."""
2from __future__ import annotations
3from datetime import datetime
4from . import util
5import re
6from . import css_types as ct
7import unicodedata
8import bs4 # type: ignore[import]
9from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401
# Finds a single character that is not CSS whitespace (used for "is this text empty?" checks)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Captures each run of non-whitespace characters (used to split a class string)
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Combinators that look backwards (ancestors / previous siblings)
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Combinators used inside `:has()`; the leading `:` marks them as forward looking
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Well known namespace URIs
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Flag groups
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a `dir` attribute value to its direction flag (`auto` has no fixed direction)
DIR_MAP = {'ltr': ct.SEL_DIR_LTR, 'rtl': ct.SEL_DIR_RTL, 'auto': 0}

# Patterns for the values of the supported `input` types
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})'
    r'T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)

# Collapses wildcard subtags in a language range down to a single `-`
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar facts used when validating date-like input values
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
59class _FakeParent:
60 """
61 Fake parent class.
63 When we have a fragment with no `BeautifulSoup` document object,
64 we can't evaluate `nth` selectors properly. Create a temporary
65 fake parent so we can traverse the root element as a child.
66 """
68 def __init__(self, element: bs4.Tag) -> None:
69 """Initialize."""
71 self.contents = [element]
73 def __len__(self) -> bs4.PageElement:
74 """Length."""
76 return len(self.contents)
79class _DocumentNav:
80 """Navigate a Beautiful Soup document."""
82 @classmethod
83 def assert_valid_input(cls, tag: Any) -> None:
84 """Check if valid input tag or document."""
86 # Fail on unexpected types.
87 if not cls.is_tag(tag):
88 raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))
90 @staticmethod
91 def is_doc(obj: bs4.Tag) -> bool:
92 """Is `BeautifulSoup` object."""
93 return isinstance(obj, bs4.BeautifulSoup)
95 @staticmethod
96 def is_tag(obj: bs4.PageElement) -> bool:
97 """Is tag."""
98 return isinstance(obj, bs4.Tag)
100 @staticmethod
101 def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover
102 """Is declaration."""
103 return isinstance(obj, bs4.Declaration)
105 @staticmethod
106 def is_cdata(obj: bs4.PageElement) -> bool:
107 """Is CDATA."""
108 return isinstance(obj, bs4.CData)
110 @staticmethod
111 def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover
112 """Is processing instruction."""
113 return isinstance(obj, bs4.ProcessingInstruction)
115 @staticmethod
116 def is_navigable_string(obj: bs4.PageElement) -> bool:
117 """Is navigable string."""
118 return isinstance(obj, bs4.NavigableString)
120 @staticmethod
121 def is_special_string(obj: bs4.PageElement) -> bool:
122 """Is special string."""
123 return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
125 @classmethod
126 def is_content_string(cls, obj: bs4.PageElement) -> bool:
127 """Check if node is content string."""
129 return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
131 @staticmethod
132 def create_fake_parent(el: bs4.Tag) -> _FakeParent:
133 """Create fake parent for a given element."""
135 return _FakeParent(el)
137 @staticmethod
138 def is_xml_tree(el: bs4.Tag) -> bool:
139 """Check if element (or document) is from a XML tree."""
141 return bool(el._is_xml)
143 def is_iframe(self, el: bs4.Tag) -> bool:
144 """Check if element is an `iframe`."""
146 return bool(
147 ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and
148 self.is_html_tag(el) # type: ignore[attr-defined]
149 )
151 def is_root(self, el: bs4.Tag) -> bool:
152 """
153 Return whether element is a root element.
155 We check that the element is the root of the tree (which we have already pre-calculated),
156 and we check if it is the root element under an `iframe`.
157 """
159 root = self.root and self.root is el # type: ignore[attr-defined]
160 if not root:
161 parent = self.get_parent(el)
162 root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]
163 return root
165 def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]:
166 """Get contents or contents in reverse."""
167 if not no_iframe or not self.is_iframe(el):
168 for content in el.contents:
169 yield content
171 def get_children(
172 self,
173 el: bs4.Tag,
174 start: int | None = None,
175 reverse: bool = False,
176 tags: bool = True,
177 no_iframe: bool = False
178 ) -> Iterator[bs4.PageElement]:
179 """Get children."""
181 if not no_iframe or not self.is_iframe(el):
182 last = len(el.contents) - 1
183 if start is None:
184 index = last if reverse else 0
185 else:
186 index = start
187 end = -1 if reverse else last + 1
188 incr = -1 if reverse else 1
190 if 0 <= index <= last:
191 while index != end:
192 node = el.contents[index]
193 index += incr
194 if not tags or self.is_tag(node):
195 yield node
197 def get_descendants(
198 self,
199 el: bs4.Tag,
200 tags: bool = True,
201 no_iframe: bool = False
202 ) -> Iterator[bs4.PageElement]:
203 """Get descendants."""
205 if not no_iframe or not self.is_iframe(el):
206 next_good = None
207 for child in el.descendants:
209 if next_good is not None:
210 if child is not next_good:
211 continue
212 next_good = None
214 is_tag = self.is_tag(child)
216 if no_iframe and is_tag and self.is_iframe(child):
217 if child.next_sibling is not None:
218 next_good = child.next_sibling
219 else:
220 last_child = child
221 while self.is_tag(last_child) and last_child.contents:
222 last_child = last_child.contents[-1]
223 next_good = last_child.next_element
224 yield child
225 if next_good is None:
226 break
227 # Coverage isn't seeing this even though it's executed
228 continue # pragma: no cover
230 if not tags or is_tag:
231 yield child
233 def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag:
234 """Get parent."""
236 parent = el.parent
237 if no_iframe and parent is not None and self.is_iframe(parent):
238 parent = None
239 return parent
241 @staticmethod
242 def get_tag_name(el: bs4.Tag) -> str | None:
243 """Get tag."""
245 return cast('str | None', el.name)
247 @staticmethod
248 def get_prefix_name(el: bs4.Tag) -> str | None:
249 """Get prefix."""
251 return cast('str | None', el.prefix)
253 @staticmethod
254 def get_uri(el: bs4.Tag) -> str | None:
255 """Get namespace `URI`."""
257 return cast('str | None', el.namespace)
259 @classmethod
260 def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
261 """Get next sibling tag."""
263 sibling = el.next_sibling
264 while tags and not cls.is_tag(sibling) and sibling is not None:
265 sibling = sibling.next_sibling
266 return sibling
268 @classmethod
269 def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
270 """Get previous sibling tag."""
272 sibling = el.previous_sibling
273 while tags and not cls.is_tag(sibling) and sibling is not None:
274 sibling = sibling.previous_sibling
275 return sibling
277 @staticmethod
278 def has_html_ns(el: bs4.Tag) -> bool:
279 """
280 Check if element has an HTML namespace.
282 This is a bit different than whether a element is treated as having an HTML namespace,
283 like we do in the case of `is_html_tag`.
284 """
286 ns = getattr(el, 'namespace') if el else None
287 return bool(ns and ns == NS_XHTML)
289 @staticmethod
290 def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]:
291 """Return namespace and attribute name without the prefix."""
293 return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
295 @classmethod
296 def normalize_value(cls, value: Any) -> str | Sequence[str]:
297 """Normalize the value to be a string or list of strings."""
299 # Treat `None` as empty string.
300 if value is None:
301 return ''
303 # Pass through strings
304 if (isinstance(value, str)):
305 return value
307 # If it's a byte string, convert it to Unicode, treating it as UTF-8.
308 if isinstance(value, bytes):
309 return value.decode("utf8")
311 # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
312 if isinstance(value, Sequence):
313 new_value = []
314 for v in value:
315 if not isinstance(v, (str, bytes)) and isinstance(v, Sequence):
316 # This is most certainly a user error and will crash and burn later.
317 # To keep things working, we'll do what we do with all objects,
318 # And convert them to strings.
319 new_value.append(str(v))
320 else:
321 # Convert the child to a string
322 new_value.append(cast(str, cls.normalize_value(v)))
323 return new_value
325 # Try and make anything else a string
326 return str(value)
328 @classmethod
329 def get_attribute_by_name(
330 cls,
331 el: bs4.Tag,
332 name: str,
333 default: str | Sequence[str] | None = None
334 ) -> str | Sequence[str] | None:
335 """Get attribute by name."""
337 value = default
338 if el._is_xml:
339 try:
340 value = cls.normalize_value(el.attrs[name])
341 except KeyError:
342 pass
343 else:
344 for k, v in el.attrs.items():
345 if util.lower(k) == name:
346 value = cls.normalize_value(v)
347 break
348 return value
350 @classmethod
351 def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]:
352 """Iterate attributes."""
354 for k, v in el.attrs.items():
355 yield k, cls.normalize_value(v)
357 @classmethod
358 def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
359 """Get classes."""
361 classes = cls.get_attribute_by_name(el, 'class', [])
362 if isinstance(classes, str):
363 classes = RE_NOT_WS.findall(classes)
364 return cast(Sequence[str], classes)
366 def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
367 """Get text."""
369 return ''.join(
370 [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
371 )
373 def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
374 """Get Own Text."""
376 return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
class Inputs:
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year: int, month: int, day: int) -> bool:
        """Validate day against the length of the given month (leap years included)."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year: int, week: int) -> bool:
        """
        Validate week.

        The upper bound is the ISO week number of December 31st of `year`; when
        that day already belongs to week 1 of the following year, 53 is used.
        """

        # The Gregorian calendar repeats exactly every 400 years (146097 days, a whole
        # number of weeks), so an equivalent year inside the range `datetime` supports
        # has the same weekday layout. This avoids the `ValueError` that building and
        # parsing a date string raised for years below 1000 or above 9999.
        equivalent_year = 2000 + year % 400
        max_week = datetime(equivalent_year, 12, 31).isocalendar()[1]
        if max_week == 1:
            max_week = 53
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month: int) -> bool:
        """Validate month."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year: int) -> bool:
        """Validate year."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour: int) -> bool:
        """Validate hour."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes: int) -> bool:
        """Validate minutes."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
        """
        Parse the input value.

        Returns a tuple of the numeric components for the given input type, or
        `None` when `value` is `None`, does not match the expected format, fails
        validation, or `itype` is not one of the handled types.
        """

        parsed = None  # type: tuple[float, ...] | None
        if value is None:
            return value
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = (float(m.group('value')),)
        return parsed
482class CSSMatch(_DocumentNav):
483 """Perform CSS matching."""
def __init__(
    self,
    selectors: ct.SelectorList,
    scope: bs4.Tag,
    namespaces: ct.Namespaces | None,
    flags: int
) -> None:
    """
    Initialize.

    Validates `scope`, stores the selectors, namespaces and flags, and works
    out the root of the tree that `scope` lives in along with whether that
    tree is treated as XML and/or HTML.
    """

    self.assert_valid_input(scope)

    self.selectors = selectors
    self.flags = flags
    self.tag = scope
    self.namespaces = {} if namespaces is None else namespaces  # type: ct.Namespaces | dict[str, str]
    self.iframe_restrict = False

    # Caches filled in lazily by the individual matchers
    self.cached_meta_lang = []  # type: list[tuple[str, str]]
    self.cached_default_forms = []  # type: list[tuple[bs4.Tag, bs4.Tag]]
    self.cached_indeterminate_forms = []  # type: list[tuple[bs4.Tag, str, bool]]

    # Climb to the top most ancestor of the scope
    top = scope
    ancestor = self.get_parent(top)
    while ancestor:
        top = ancestor
        ancestor = self.get_parent(top)

    # The root is the first child tag of a document object, otherwise the top most node itself
    if self.is_doc(top):
        root = next(self.get_children(top), None)
    else:
        root = top

    self.root = root
    self.scope = root if scope is top else scope
    self.has_html_namespace = self.has_html_ns(root)

    # A document can be both XML and HTML (XHTML)
    self.is_xml = self.is_xml_tree(top)
    self.is_html = not self.is_xml or self.has_html_namespace
526 def supports_namespaces(self) -> bool:
527 """Check if namespaces are supported in the HTML type."""
529 return self.is_xml or self.has_html_namespace
531 def get_tag_ns(self, el: bs4.Tag) -> str:
532 """Get tag namespace."""
534 if self.supports_namespaces():
535 namespace = ''
536 ns = self.get_uri(el)
537 if ns:
538 namespace = ns
539 else:
540 namespace = NS_XHTML
541 return namespace
def is_html_tag(self, el: bs4.Tag) -> bool:
    """Report whether the namespace the element is treated as having is XHTML."""

    namespace = self.get_tag_ns(el)
    return namespace == NS_XHTML
548 def get_tag(self, el: bs4.Tag) -> str | None:
549 """Get tag."""
551 name = self.get_tag_name(el)
552 return util.lower(name) if name is not None and not self.is_xml else name
554 def get_prefix(self, el: bs4.Tag) -> str | None:
555 """Get prefix."""
557 prefix = self.get_prefix_name(el)
558 return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
def find_bidi(self, el: bs4.Tag) -> int | None:
    """
    Work out directionality from the element's text.

    Returns the direction flag for the first strongly directional character
    found, or `None` when no such character exists.
    """

    for node in self.get_children(el, tags=False):

        if self.is_tag(node):
            # Certain elements, non-HTML elements, and elements that declare
            # their own recognized `dir` are not analyzed.
            dir_attr = util.lower(self.get_attribute_by_name(node, 'dir', ''))
            has_own_direction = DIR_MAP.get(dir_attr, None) is not None
            excluded = (
                self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or
                not self.is_html_tag(node) or
                has_own_direction
            )
            if not excluded:
                # Look at the text nested inside this child
                nested = self.find_bidi(node)
                if nested is not None:
                    return nested

        elif not self.is_special_string(node):
            # A regular text node: inspect it character by character
            for char in node:
                category = unicodedata.bidirectional(char)
                if category == 'L':
                    return ct.SEL_DIR_LTR
                if category in ('AL', 'R'):
                    return ct.SEL_DIR_RTL

    return None
def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:
    """
    Decide whether a language tag satisfies a language range.

    Both values are compared case insensitively, subtag by subtag. Subtags
    of the tag that the range does not mention may be skipped over, unless
    the skipped subtag is a single character.
    """

    range_parts = RE_WILD_STRIP.sub('-', lang_range).lower().split('-')
    tag_parts = lang_tag.lower().split('-')
    range_count = len(range_parts)
    tag_count = len(tag_parts)
    primary_range = range_parts[0]
    primary_tag = tag_parts[0]

    # Empty specified language should match unspecified language attributes
    if range_count == 1 and tag_count == 1 and not primary_range and primary_range == primary_tag:
        return True

    # Primary tag needs to match; a wildcard does not match a lone empty tag
    if primary_range == '*':
        if tag_count == 1 and not primary_tag:
            return False
    elif primary_range != primary_tag:
        return False

    range_pos = 1
    tag_pos = 1

    # Match until we run out of ranges
    while range_pos < range_count:
        # Ran out of subtags, but we still have ranges
        if tag_pos >= tag_count:
            return False

        wanted = range_parts[range_pos]
        current = tag_parts[tag_pos]

        # Empty range
        if not wanted:
            return False

        if current == wanted:
            # Matched range
            range_pos += 1
        elif len(current) == 1:
            # Implicit wildcard cannot match singletons
            return False

        # Matched explicitly or implicitly, so grab next subtag
        tag_pos += 1

    return True
652 def match_attribute_name(
653 self,
654 el: bs4.Tag,
655 attr: str,
656 prefix: str | None
657 ) -> str | Sequence[str] | None:
658 """Match attribute name and return value if it exists."""
660 value = None
661 if self.supports_namespaces():
662 value = None
663 # If we have not defined namespaces, we can't very well find them, so don't bother trying.
664 if prefix:
665 ns = self.namespaces.get(prefix)
666 if ns is None and prefix != '*':
667 return None
668 else:
669 ns = None
671 for k, v in self.iter_attributes(el):
673 # Get attribute parts
674 namespace, name = self.split_namespace(el, k)
676 # Can't match a prefix attribute as we haven't specified one to match
677 # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
678 if ns is None:
679 if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
680 value = v
681 break
682 # Coverage is not finding this even though it is executed.
683 # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
684 # Ignore the false positive message.
685 continue # pragma: no cover
687 # We can't match our desired prefix attribute as the attribute doesn't have a prefix
688 if namespace is None or ns != namespace and prefix != '*':
689 continue
691 # The attribute doesn't match.
692 if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
693 continue
695 value = v
696 break
697 else:
698 for k, v in self.iter_attributes(el):
699 if util.lower(attr) != util.lower(k):
700 continue
701 value = v
702 break
703 return value
705 def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
706 """Match the namespace of the element."""
708 match = True
709 namespace = self.get_tag_ns(el)
710 default_namespace = self.namespaces.get('')
711 tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix)
712 # We must match the default namespace if one is not provided
713 if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
714 match = False
715 # If we specified `|tag`, we must not have a namespace.
716 elif (tag.prefix is not None and tag.prefix == '' and namespace):
717 match = False
718 # Verify prefix matches
719 elif (
720 tag.prefix and
721 tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
722 ):
723 match = False
724 return match
726 def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
727 """Match attributes."""
729 match = True
730 if attributes:
731 for a in attributes:
732 temp = self.match_attribute_name(el, a.attribute, a.prefix)
733 pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
734 if temp is None:
735 match = False
736 break
737 value = temp if isinstance(temp, str) else ' '.join(temp)
738 if pattern is None:
739 continue
740 elif pattern.match(value) is None:
741 match = False
742 break
743 return match
745 def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
746 """Match tag name."""
748 name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
749 return not (
750 name is not None and
751 name not in (self.get_tag(el), '*')
752 )
754 def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
755 """Match the tag."""
757 match = True
758 if tag is not None:
759 # Verify namespace
760 if not self.match_namespace(el, tag):
761 match = False
762 if not self.match_tagname(el, tag):
763 match = False
764 return match
def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
    """Match a backward looking relationship (ancestor, parent, or preceding sibling)."""

    head = relation[0]

    # I don't think this can ever happen, but it makes `mypy` happy
    if isinstance(head, ct.SelectorNull):  # pragma: no cover
        return False

    found = False
    kind = head.rel_type

    if kind == REL_PARENT:
        # Any ancestor may satisfy the relation
        ancestor = self.get_parent(el, no_iframe=self.iframe_restrict)
        while not found and ancestor:
            found = self.match_selectors(ancestor, relation)
            ancestor = self.get_parent(ancestor, no_iframe=self.iframe_restrict)
    elif kind == REL_CLOSE_PARENT:
        # Only the direct parent may satisfy the relation
        direct = self.get_parent(el, no_iframe=self.iframe_restrict)
        if direct:
            found = self.match_selectors(direct, relation)
    elif kind == REL_SIBLING:
        # Any preceding sibling tag may satisfy the relation
        earlier = self.get_previous(el)
        while not found and earlier:
            found = self.match_selectors(earlier, relation)
            earlier = self.get_previous(earlier)
    elif kind == REL_CLOSE_SIBLING:
        # Only the closest preceding sibling tag may satisfy the relation
        earlier = self.get_previous(el)
        if earlier and self.is_tag(earlier):
            found = self.match_selectors(earlier, relation)

    return found
def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
    """Match the relation against the children of `parent` (all descendants when `recursive`)."""

    walker = (
        self.get_descendants if recursive else self.get_children
    )  # type: Callable[..., Iterator[bs4.Tag]]

    match = False
    for candidate in walker(parent, no_iframe=self.iframe_restrict):
        match = self.match_selectors(candidate, relation)
        if match:
            break
    return match
def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
    """Match a forward looking relationship (descendant, child, or following sibling)."""

    head = relation[0]

    # I don't think this can ever happen, but it makes `mypy` happy
    if isinstance(head, ct.SelectorNull):  # pragma: no cover
        return False

    found = False
    kind = head.rel_type

    if kind == REL_HAS_PARENT:
        # Any descendant may satisfy the relation
        found = self.match_future_child(el, relation, True)
    elif kind == REL_HAS_CLOSE_PARENT:
        # Only direct children may satisfy the relation
        found = self.match_future_child(el, relation)
    elif kind == REL_HAS_SIBLING:
        # Any following sibling tag may satisfy the relation
        later = self.get_next(el)
        while not found and later:
            found = self.match_selectors(later, relation)
            later = self.get_next(later)
    elif kind == REL_HAS_CLOSE_SIBLING:
        # Only the closest following sibling tag may satisfy the relation
        later = self.get_next(el)
        if later and self.is_tag(later):
            found = self.match_selectors(later, relation)

    return found
def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
    """
    Match the element's relationship to other elements.

    Relation types starting with `:` are forward looking; all others look backwards.
    """

    head = relation[0]
    if isinstance(head, ct.SelectorNull) or head.rel_type is None:
        return False

    if head.rel_type.startswith(':'):
        return self.match_future_relations(el, relation)
    return self.match_past_relations(el, relation)
846 def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
847 """Match element's ID."""
849 found = True
850 for i in ids:
851 if i != self.get_attribute_by_name(el, 'id', ''):
852 found = False
853 break
854 return found
856 def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
857 """Match element's classes."""
859 current_classes = self.get_classes(el)
860 found = True
861 for c in classes:
862 if c not in current_classes:
863 found = False
864 break
865 return found
def match_root(self, el: bs4.Tag) -> bool:
    """
    Match element as root.

    Besides being a root element, the element must have no sibling, before
    or after it, that is a tag, CDATA, or a content string with anything
    left after stripping.
    """

    is_root = self.is_root(el)
    if not is_root:
        return is_root

    # Scan the preceding siblings first, then the following ones
    for advance in (self.get_previous, self.get_next):
        sibling = advance(el, tags=False)
        while sibling is not None:
            if (
                self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
                self.is_cdata(sibling)
            ):
                return False
            sibling = advance(sibling, tags=False)

    return is_root
893 def match_scope(self, el: bs4.Tag) -> bool:
894 """Match element as scope."""
896 return self.scope is el
898 def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
899 """Match tag type for `nth` matches."""
901 return (
902 (self.get_tag(child) == self.get_tag(el)) and
903 (self.get_tag_ns(child) == self.get_tag_ns(el))
904 )
def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool:
    """
    Match `nth` elements.

    `nth` is iterated, and every entry must match for the result to be `True`
    (an empty `nth` yields `True`). For each entry the candidate positions
    `a * count + b` (or just `a` when the entry's `n` is falsy) are walked,
    counting qualifying siblings from the front, or from the back when the
    entry's `last` is set, until `el` is reached.
    """
    # NOTE(review): `nth` is annotated as `bs4.Tag`, but the code iterates it and reads
    # `a`, `b`, `n`, `last`, `of_type` and `selectors` from each item - confirm the intended type.

    matched = True

    for n in nth:
        matched = False
        # An `of S` selector list must match the element itself before anything is counted.
        if n.selectors and not self.match_selectors(el, n.selectors):
            break
        parent = self.get_parent(el)
        if parent is None:
            # No real parent: wrap the element so it can still be walked as a child.
            parent = self.create_fake_parent(el)
        last = n.last
        last_index = len(parent) - 1
        # Start from the end of the parent's contents when counting from the back.
        index = last_index if last else 0
        relative_index = 0
        a = n.a
        b = n.b
        var = n.n
        count = 0
        count_incr = 1
        factor = -1 if last else 1
        idx = last_idx = a * count + b if var else a

        # We can only adjust bounds within a variable index
        if var:
            # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
            # Otherwise, increment to try to get in bounds.
            adjust = None
            while idx < 1 or idx > last_index:
                if idx < 0:
                    diff_low = 0 - idx
                    if adjust is not None and adjust == 1:
                        break
                    adjust = -1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = 0 - idx
                    if diff >= diff_low:
                        break
                else:
                    diff_high = idx - last_index
                    if adjust is not None and adjust == -1:
                        break
                    adjust = 1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = idx - last_index
                    if diff >= diff_high:
                        break
                    diff_high = diff

            # If a < 0, our count is working backwards, so floor the index by increasing the count.
            # Find the count that yields the lowest, in bound value and use that.
            # Lastly reverse count increment so that we'll increase our index.
            lowest = count
            if a < 0:
                while idx >= 1:
                    lowest = count
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                count_incr = -1
            count = lowest
            idx = last_idx = a * count + b if var else a

        # Evaluate elements while our calculated nth index is still in range
        while 1 <= idx <= last_index + 1:
            child = None
            # Evaluate while our child index is still in range.
            for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                # `index` tracks the position to resume from on the next pass of the outer loop.
                index += factor
                if not self.is_tag(child):
                    continue
                # Handle `of S` in `nth-child`
                if n.selectors and not self.match_selectors(child, n.selectors):
                    continue
                # Handle `of-type`
                if n.of_type and not self.match_nth_tag_type(el, child):
                    continue
                relative_index += 1
                if relative_index == idx:
                    if child is el:
                        matched = True
                    else:
                        # Another element holds this position; move on to the next candidate index.
                        break
                if child is el:
                    break
            # Once the element itself has been reached the outcome for this entry is settled.
            if child is el:
                break
            last_idx = idx
            count += count_incr
            if count < 0:
                # Count is counting down and has now ventured into invalid territory.
                break
            idx = a * count + b if var else a
            # The index no longer changes, so no further positions can be produced.
            if last_idx == idx:
                break
        if not matched:
            break
    return matched
def match_empty(self, el: bs4.Tag) -> bool:
    """
    Check if element is empty.

    An element is empty when it has no child tags and none of its content
    strings contain a non-whitespace character.
    """

    for child in self.get_children(el, tags=False):
        if self.is_tag(child):
            return False
        if self.is_content_string(child) and RE_NOT_EMPTY.search(child):
            return False
    return True
1020 def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
1021 """Match selectors."""
1023 match = True
1024 for sel in selectors:
1025 if not self.match_selectors(el, sel):
1026 match = False
1027 return match
1029 def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
1030 """Match element if it contains text."""
1032 match = True
1033 content = None # type: str | Sequence[str] | None
1034 for contain_list in contains:
1035 if content is None:
1036 if contain_list.own:
1037 content = self.get_own_text(el, no_iframe=self.is_html)
1038 else:
1039 content = self.get_text(el, no_iframe=self.is_html)
1040 found = False
1041 for text in contain_list.text:
1042 if contain_list.own:
1043 for c in content:
1044 if text in c:
1045 found = True
1046 break
1047 if found:
1048 break
1049 else:
1050 if text in content:
1051 found = True
1052 break
1053 if not found:
1054 match = False
1055 return match
1057 def match_default(self, el: bs4.Tag) -> bool:
1058 """Match default."""
1060 match = False
1062 # Find this input's form
1063 form = None
1064 parent = self.get_parent(el, no_iframe=True)
1065 while parent and form is None:
1066 if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
1067 form = parent
1068 else:
1069 parent = self.get_parent(parent, no_iframe=True)
1071 # Look in form cache to see if we've already located its default button
1072 found_form = False
1073 for f, t in self.cached_default_forms:
1074 if f is form:
1075 found_form = True
1076 if t is el:
1077 match = True
1078 break
1080 # We didn't have the form cached, so look for its default button
1081 if not found_form:
1082 for child in self.get_descendants(form, no_iframe=True):
1083 name = self.get_tag(child)
1084 # Can't do nested forms (haven't figured out why we never hit this)
1085 if name == 'form': # pragma: no cover
1086 break
1087 if name in ('input', 'button'):
1088 v = self.get_attribute_by_name(child, 'type', '')
1089 if v and util.lower(v) == 'submit':
1090 self.cached_default_forms.append((form, child))
1091 if el is child:
1092 match = True
1093 break
1094 return match
def match_indeterminate(self, el: bs4.Tag) -> bool:
    """
    Match indeterminate.

    `el` matches when no *other* `input` with `type="radio"`, the same `name`
    as `el` and a `checked` attribute exists under the same form. Results are
    remembered per (form, name) pair in `cached_indeterminate_forms`.
    """

    group = cast(str, self.get_attribute_by_name(el, 'name'))

    def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
        """Find this input's form, falling back to the topmost ancestor when there is none."""

        node = self.get_parent(el, no_iframe=True)
        while True:
            if self.get_tag(node) == 'form' and self.is_html_tag(node):
                return node
            above = self.get_parent(node, no_iframe=True)
            if above is None:
                # No enclosing form: the last ancestor reached acts as the grouping container.
                return node
            node = above

    form = get_parent_form(el)

    # A previous evaluation for this form and radio group decides the result immediately.
    for cached_form, cached_name, cached_result in self.cached_indeterminate_forms:
        if cached_form is form and cached_name == group:
            return cached_result is True

    # Not cached: look for another checked radio button of the same group in this form.
    sibling_checked = False
    for node in self.get_descendants(form, no_iframe=True):
        if node is el or self.get_tag(node) != 'input':
            continue
        is_radio = False
        is_checked = False
        has_name = False
        for k, v in self.iter_attributes(node):
            key = util.lower(k)
            if key == 'type' and util.lower(v) == 'radio':
                is_radio = True
            elif key == 'name' and v == group:
                has_name = True
            elif key == 'checked':
                is_checked = True
            # Only count the radio if it belongs to the very same form as `el`.
            if is_radio and is_checked and has_name and get_parent_form(node) is form:
                sibling_checked = True
                break
        if sibling_checked:
            break

    result = not sibling_checked
    self.cached_indeterminate_forms.append((form, group, result))
    return result
def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
    """
    Match languages.

    The element's language is resolved in three steps, stopping at the first that yields a value:

    1. Walk from `el` up through its ancestors looking for a non-empty `lang`
       (or namespaced `xml:lang`) attribute.
    2. Reuse a language previously cached for the same root in `cached_meta_lang`.
    3. For HTML documents, descend `html` -> `head` and look for a `meta` element
       whose `http-equiv` is `content-language` and that has a non-empty `content`.

    Returns `True` only when a language was determined and, for every pattern group
    in `langs`, at least one pattern passes `extended_language_filter`. Returns
    `False` when no language could be determined or `langs` is empty.
    """

    match = False
    has_ns = self.supports_namespaces()
    root = self.root
    has_html_namespace = self.has_html_namespace

    # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
    parent = el
    found_lang = None
    last = None
    # An empty attribute value is falsy, so the walk continues past it to the ancestors.
    while not found_lang:
        has_html_ns = self.has_html_ns(parent)
        for k, v in self.iter_attributes(parent):
            attr_ns, attr = self.split_namespace(parent, k)
            if (
                ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                (
                    has_ns and not has_html_ns and attr_ns == NS_XML and
                    (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                )
            ):
                found_lang = v
                break
        last = parent
        parent = self.get_parent(parent, no_iframe=self.is_html)

        # Ran out of ancestors: treat the last node visited as the root and resume from it below.
        if parent is None:
            root = last
            has_html_namespace = self.has_html_ns(root)
            parent = last
            break

    # Use cached meta language.
    if found_lang is None and self.cached_meta_lang:
        # There is no `break` here, so if several entries share this root the last one wins.
        for cache in self.cached_meta_lang:
            if root is cache[0]:
                found_lang = cache[1]

    # If we couldn't find a language, and the document is HTML, look to meta to determine language.
    if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
        # Find head
        found = False
        # Descend one level per tag name, rebinding `parent` to the match each time.
        for tag in ('html', 'head'):
            found = False
            for child in self.get_children(parent, no_iframe=self.is_html):
                if self.get_tag(child) == tag and self.is_html_tag(child):
                    found = True
                    parent = child
                    break
            if not found:  # pragma: no cover
                break

        # Search meta tags
        if found:
            for child in parent:
                # NOTE(review): the HTML check is applied to `parent` (the head), not `child` -- confirm intended.
                if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                    c_lang = False
                    content = None
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                            c_lang = True
                        if util.lower(k) == 'content':
                            content = v
                        # Both attributes seen (in either order): accept and cache the language for this root.
                        if c_lang and content:
                            found_lang = content
                            self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
                            break
                if found_lang is not None:
                    break
            # Cache an empty string so later lookups for this root skip the meta search.
            if found_lang is None:
                self.cached_meta_lang.append((cast(str, root), ''))

    # If we determined a language, compare.
    if found_lang is not None:
        # Every group in `langs` must have at least one pattern that passes the filter.
        for patterns in langs:
            match = False
            for pattern in patterns:
                if self.extended_language_filter(pattern, cast(str, found_lang)):
                    match = True
            if not match:
                break

    return match
def match_dir(self, el: bs4.Tag, directionality: int) -> bool:
    """
    Check directionality.

    Resolve the direction of `el` and compare it with `directionality`.
    An explicit `dir` of `ltr`/`rtl` wins; `dir="auto"` is resolved from the
    element's text (or value for text-like inputs); otherwise the direction
    is inherited by recursing into the parent. The root, and `tel` inputs
    without a recognized `dir`, default to left to right.
    """

    # Requesting both directions at once can never be satisfied.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    # Ran off the top of the tree, or not an HTML element.
    if el is None or not self.is_html_tag(el):
        return False

    wants_ltr = ct.SEL_DIR_LTR == directionality

    # `ltr`/`rtl` map to their flags, `auto` maps to 0 and anything else yields `None`.
    direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
    if direction is not None and direction != 0:
        return direction == directionality

    # The document element with no recognized direction is assumed left to right.
    is_root = self.is_root(el)
    if is_root and direction is None:
        return wants_ltr

    # A telephone input with no recognized direction is assumed left to right.
    name = self.get_tag(el)
    is_input = name == 'input'
    is_textarea = name == 'textarea'
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
    if is_input and itype == 'tel' and direction is None:
        return wants_ltr

    # `auto` on text-like inputs and text areas: the first strong character decides.
    is_text_input = is_input and itype in ('text', 'search', 'tel', 'url', 'email')
    if direction == 0 and (is_text_input or is_textarea):
        if is_textarea:
            value = ''.join(
                [node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node)]
            )
        else:
            value = cast(str, self.get_attribute_by_name(el, 'value', ''))
        if value:
            for char in value:
                bidi = unicodedata.bidirectional(char)
                if bidi == 'L':
                    return wants_ltr
                if bidi in ('AL', 'R'):
                    return ct.SEL_DIR_RTL == directionality
            # No strong character found, assume left to right
            return wants_ltr
        if is_root:
            return wants_ltr
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # `auto` on anything else, and `bdi` without a recognized direction: inspect the content.
    if (name == 'bdi' and direction is None) or direction == 0:
        detected = self.find_bidi(el)
        if detected is not None:
            return detected == directionality
        if is_root:
            return wants_ltr

    # Nothing decided locally, so the parent's direction applies.
    return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
def match_range(self, el: bs4.Tag, condition: int) -> bool:
    """
    Match range.

    Behavior is modeled after what we see in browsers. Browsers seem to evaluate
    if the value is out of range, and if not, it is in range. So a missing value
    will not evaluate out of range; therefore, value is in range. Personally, I
    feel like this should evaluate as neither in or out of range.

    `condition` selects which answer is reported: with `SEL_IN_RANGE` set the
    result is "not out of range", otherwise it is "out of range". When neither
    `min` nor `max` parses to a value, nothing matches.
    """

    itype = util.lower(self.get_attribute_by_name(el, 'type'))
    mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
    mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))

    # There is no valid min or max, so we cannot evaluate a range
    if mn is None and mx is None:
        return False

    value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))

    if value is None:
        out_of_range = False
    elif itype == "time" and mn is not None and mx is not None and mn > mx:
        # Time is periodic, so this is a reversed/discontinuous range:
        # only values strictly between `max` and `min` fall outside of it.
        out_of_range = value < mn and value > mx
    elif itype in ("date", "datetime-local", "month", "week", "number", "range", "time"):
        out_of_range = (mn is not None and value < mn) or (mx is not None and value > mx)
    else:
        out_of_range = False

    if condition & ct.SEL_IN_RANGE:
        return not out_of_range
    return out_of_range
def match_defined(self, el: bs4.Tag) -> bool:
    """
    Match defined.

    `:defined` is related to custom elements in a browser. Here an element is
    reported as defined when it has a tag name and at least one of these holds:

    - the name has no hyphen (it is not a custom element name),
    - the name contains a colon,
    - the element has a prefix.

    This of course requires the parser to provide us with the proper prefix info;
    if it doesn't, there is nothing we can do.
    """

    name = self.get_tag(el)
    if name is None:
        return False
    return '-' not in name or ':' in name or self.get_prefix(el) is not None
def match_placeholder_shown(self, el: bs4.Tag) -> bool:
    """
    Match placeholder shown according to HTML spec.

    - The element matches when its text is empty. A single newline does not count as content.
    """

    return self.get_text(el) in ('', '\n')
def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
    """
    Check if element matches one of the selectors.

    Each selector in `selectors` is tried in order; the first one whose every
    condition holds decides the result. For a normal list that result is `True`;
    for a negated list (`selectors.is_not`) it is `False`, and the result is
    `True` only if the last selector tried failed. An empty list never matches.

    Selector lists flagged `is_html` are evaluated with the `html` namespace and
    iframe restriction forced on, and are skipped entirely for non-HTML documents.
    """

    match = False
    is_not = selectors.is_not
    is_html = selectors.is_html

    # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
    if is_html:
        namespaces = self.namespaces
        iframe_restrict = self.iframe_restrict
        self.namespaces = {'html': NS_XHTML}
        self.iframe_restrict = True

    # The overrides above must not outlive this call, even if a sub-matcher raises,
    # so restoration happens in `finally`.
    try:
        if not is_html or self.is_html:
            for selector in selectors:
                match = is_not
                # We have a un-matchable situation (like `:focus` as you can't focus an element in this environment)
                if isinstance(selector, ct.SelectorNull):
                    continue
                # Verify tag matches
                if not self.match_tag(el, selector.tag):
                    continue
                # Verify tag is defined
                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                    continue
                # Verify element is root
                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                    continue
                # Verify element is scope
                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                    continue
                # Verify element has placeholder shown
                if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                    continue
                # Verify `nth` matches
                if not self.match_nth(el, selector.nth):
                    continue
                # Verify element is empty
                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                    continue
                # Verify id matches
                if selector.ids and not self.match_id(el, selector.ids):
                    continue
                # Verify classes match
                if selector.classes and not self.match_classes(el, selector.classes):
                    continue
                # Verify attribute(s) match
                if not self.match_attributes(el, selector.attributes):
                    continue
                # Verify ranges
                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                    continue
                # Verify language patterns
                if selector.lang and not self.match_lang(el, selector.lang):
                    continue
                # Verify pseudo selector patterns
                if selector.selectors and not self.match_subselectors(el, selector.selectors):
                    continue
                # Verify relationship selectors
                if selector.relation and not self.match_relations(el, selector.relation):
                    continue
                # Validate that the current default selector match corresponds to the first submit button in the form
                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                    continue
                # Validate that the unset radio button is among radio buttons with the same name in a form that are
                # also not set.
                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                    continue
                # Validate element directionality
                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                    continue
                # Validate that the tag contains the specified text.
                if selector.contains and not self.match_contains(el, selector.contains):
                    continue
                # Every condition held: this selector decides the outcome.
                match = not is_not
                break
    finally:
        # Restore actual namespaces being used for external selector lists
        if is_html:
            self.namespaces = namespaces
            self.iframe_restrict = iframe_restrict

    return match
def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
    """
    Match all tags under the targeted tag.

    Lazily yields every matching descendant of `self.tag` in traversal order.
    A `limit` below 1 means "no limit"; otherwise iteration stops as soon as
    `limit` matches have been produced.
    """

    remaining = limit if limit >= 1 else None

    for node in self.get_descendants(self.tag):
        if not self.match(node):
            continue
        yield node
        if remaining is None:
            continue
        remaining -= 1
        if remaining < 1:
            return
def closest(self) -> bs4.Tag | None:
    """
    Match closest ancestor.

    Starting with `self.tag` itself and moving up through its parents, return
    the first element that matches, or `None` when nothing on the way up does.
    """

    node = self.tag
    while node is not None:
        if self.match(node):
            return node
        node = self.get_parent(node)
    return None
def filter(self) -> list[bs4.Tag]:  # noqa A001
    """
    Filter tag's children.

    Return the direct contents of `self.tag` that are not navigable strings
    and that match, in their original order.
    """

    kept = []
    for node in self.get_contents(self.tag):
        if self.is_navigable_string(node):
            continue
        if self.match(node):
            kept.append(node)
    return kept
def match(self, el: bs4.Tag) -> bool:
    """
    Match.

    The document object itself never matches; anything else must be a tag
    and satisfy `self.selectors`.
    """

    if self.is_doc(el):
        return False
    return self.is_tag(el) and self.match_selectors(el, self.selectors)
class SoupSieve(ct.Immutable):
    """
    Compiled Soup Sieve selector matching object.

    Holds a parsed selector list together with the namespaces, custom selectors
    and flags it was compiled with. Every operation builds a `CSSMatch` for the
    tag it is given and delegates to it.
    """

    pattern: str
    selectors: ct.SelectorList
    namespaces: ct.Namespaces | None
    custom: dict[str, str]
    flags: int

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(
        self,
        pattern: str,
        selectors: ct.SelectorList,
        namespaces: ct.Namespaces | None,
        custom: ct.CustomSelectors | None,
        flags: int
    ) -> None:
        """Initialize."""

        super().__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def match(self, tag: bs4.Tag) -> bool:
        """Match."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)

    def closest(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Match closest ancestor; `None` when neither `tag` nor any ancestor matches."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()

    def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]:  # noqa A001
        """
        Filter.

        `CSSMatch` can cache certain searches for tags of the same document,
        so if we are given a tag, all tags are from the same document,
        and we can take advantage of the optimization.

        Any other kind of iterable could have tags from different documents or detached tags,
        so for those, we use a new `CSSMatch` for each item in the iterable.
        """

        if CSSMatch.is_tag(iterable):
            return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()
        return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]

    def select_one(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Select a single tag; `None` when nothing under `tag` matches."""

        tags = self.select(tag, limit=1)
        return tags[0] if tags else None

    def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
        """Select the specified tags."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
        """Iterate the specified tags."""

        yield from CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit)

    def __repr__(self) -> str:  # pragma: no cover
        """Representation."""

        return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format(
            self.pattern,
            self.namespaces,
            self.custom,
            self.flags
        )

    __str__ = __repr__
# Hand `SoupSieve` to `css_types.pickle_register` (presumably so compiled selectors can be pickled -- confirm
# against `css_types`).
ct.pickle_register(SoupSieve)