Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/soupsieve/css_match.py: 58%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""CSS matcher."""
2from __future__ import annotations
3from datetime import datetime
4from . import util
5import re
6from . import css_types as ct
7import unicodedata
8import bs4
9from typing import Iterator, Iterable, Any, Callable, Sequence, Any, cast # noqa: F401, F811
11# Empty tag pattern (whitespace okay)
12RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
14RE_NOT_WS = re.compile('[^ \t\r\n\f]+')
16# Relationships
17REL_PARENT = ' '
18REL_CLOSE_PARENT = '>'
19REL_SIBLING = '~'
20REL_CLOSE_SIBLING = '+'
22# Relationships for :has() (forward looking)
23REL_HAS_PARENT = ': '
24REL_HAS_CLOSE_PARENT = ':>'
25REL_HAS_SIBLING = ':~'
26REL_HAS_CLOSE_SIBLING = ':+'
28NS_XHTML = 'http://www.w3.org/1999/xhtml'
29NS_XML = 'http://www.w3.org/XML/1998/namespace'
31DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
32RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE
34DIR_MAP = {
35 'ltr': ct.SEL_DIR_LTR,
36 'rtl': ct.SEL_DIR_RTL,
37 'auto': 0
38}
40RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
41RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
42RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
43RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
44RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
45RE_DATETIME = re.compile(
46 r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
47)
48RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')
50MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November
51FEB = 2
52SHORT_MONTH = 30
53LONG_MONTH = 31
54FEB_MONTH = 28
55FEB_LEAP_MONTH = 29
56DAYS_IN_WEEK = 7
59class _FakeParent:
60 """
61 Fake parent class.
63 When we have a fragment with no `BeautifulSoup` document object,
64 we can't evaluate `nth` selectors properly. Create a temporary
65 fake parent so we can traverse the root element as a child.
66 """
68 def __init__(self, element: bs4.Tag) -> None:
69 """Initialize."""
71 self.contents = [element]
73 def __len__(self) -> int:
74 """Length."""
76 return len(self.contents)
79class _DocumentNav:
80 """Navigate a Beautiful Soup document."""
82 @classmethod
83 def assert_valid_input(cls, tag: Any) -> None:
84 """Check if valid input tag or document."""
86 # Fail on unexpected types.
87 if not cls.is_tag(tag):
88 raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}")
90 @staticmethod
91 def is_doc(obj: bs4.element.PageElement | None) -> bool:
92 """Is `BeautifulSoup` object."""
93 return isinstance(obj, bs4.BeautifulSoup)
95 @staticmethod
96 def is_tag(obj: bs4.element.PageElement | None) -> bool:
97 """Is tag."""
98 return isinstance(obj, bs4.Tag)
100 @staticmethod
101 def is_declaration(obj: bs4.element.PageElement | None) -> bool: # pragma: no cover
102 """Is declaration."""
103 return isinstance(obj, bs4.Declaration)
105 @staticmethod
106 def is_cdata(obj: bs4.element.PageElement | None) -> bool:
107 """Is CDATA."""
108 return isinstance(obj, bs4.CData)
110 @staticmethod
111 def is_processing_instruction(obj: bs4.element.PageElement | None) -> bool: # pragma: no cover
112 """Is processing instruction."""
113 return isinstance(obj, bs4.ProcessingInstruction)
115 @staticmethod
116 def is_navigable_string(obj: bs4.element.PageElement | None) -> bool:
117 """Is navigable string."""
118 return isinstance(obj, bs4.element.NavigableString)
120 @staticmethod
121 def is_special_string(obj: bs4.element.PageElement | None) -> bool:
122 """Is special string."""
123 return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
125 @classmethod
126 def is_content_string(cls, obj: bs4.element.PageElement | None) -> bool:
127 """Check if node is content string."""
129 return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
131 @staticmethod
132 def create_fake_parent(el: bs4.Tag) -> _FakeParent:
133 """Create fake parent for a given element."""
135 return _FakeParent(el)
137 @staticmethod
138 def is_xml_tree(el: bs4.Tag | None) -> bool:
139 """Check if element (or document) is from a XML tree."""
141 return el is not None and bool(el._is_xml)
143 def is_iframe(self, el: bs4.Tag | None) -> bool:
144 """Check if element is an `iframe`."""
146 if el is None: # pragma: no cover
147 return False
149 return bool(
150 ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and
151 self.is_html_tag(el) # type: ignore[attr-defined]
152 )
154 def is_root(self, el: bs4.Tag) -> bool:
155 """
156 Return whether element is a root element.
158 We check that the element is the root of the tree (which we have already pre-calculated),
159 and we check if it is the root element under an `iframe`.
160 """
162 root = self.root and self.root is el # type: ignore[attr-defined]
163 if not root:
164 parent = self.get_parent(el)
165 root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]
166 return root
168 def get_contents(self, el: bs4.Tag | None, no_iframe: bool = False) -> Iterator[bs4.element.PageElement]:
169 """Get contents or contents in reverse."""
171 if el is not None:
172 if not no_iframe or not self.is_iframe(el):
173 yield from el.contents
175 def get_tag_children(
176 self,
177 el: bs4.Tag | None,
178 start: int | None = None,
179 reverse: bool = False,
180 no_iframe: bool = False
181 ) -> Iterator[bs4.Tag]:
182 """Get tag children."""
184 return self.get_children(el, start, reverse, True, no_iframe) # type: ignore[return-value]
186 def get_children(
187 self,
188 el: bs4.Tag | None,
189 start: int | None = None,
190 reverse: bool = False,
191 tags: bool = False,
192 no_iframe: bool = False
193 ) -> Iterator[bs4.element.PageElement]:
194 """Get children."""
196 if el is not None and (not no_iframe or not self.is_iframe(el)):
197 last = len(el.contents) - 1
198 if start is None:
199 index = last if reverse else 0
200 else:
201 index = start
202 end = -1 if reverse else last + 1
203 incr = -1 if reverse else 1
205 if 0 <= index <= last:
206 while index != end:
207 node = el.contents[index]
208 index += incr
209 if not tags or self.is_tag(node):
210 yield node
212 def get_tag_descendants(
213 self,
214 el: bs4.Tag | None,
215 no_iframe: bool = False
216 ) -> Iterator[bs4.Tag]:
217 """Specifically get tag descendants."""
219 yield from self.get_descendants(el, tags=True, no_iframe=no_iframe) # type: ignore[misc]
221 def get_descendants(
222 self,
223 el: bs4.Tag | None,
224 tags: bool = False,
225 no_iframe: bool = False
226 ) -> Iterator[bs4.element.PageElement]:
227 """Get descendants."""
229 if el is not None and (not no_iframe or not self.is_iframe(el)):
230 next_good = None
231 for child in el.descendants:
233 if next_good is not None:
234 if child is not next_good:
235 continue
236 next_good = None
238 if isinstance(child, bs4.Tag):
239 if no_iframe and self.is_iframe(child):
240 if child.next_sibling is not None:
241 next_good = child.next_sibling
242 else:
243 last_child = child # type: bs4.element.PageElement
244 while isinstance(last_child, bs4.Tag) and last_child.contents:
245 last_child = last_child.contents[-1]
246 next_good = last_child.next_element
247 yield child
248 if next_good is None:
249 break
250 # Coverage isn't seeing this even though it's executed
251 continue # pragma: no cover
252 yield child
254 elif not tags:
255 yield child
257 def get_parent(self, el: bs4.Tag | None, no_iframe: bool = False) -> bs4.Tag | None:
258 """Get parent."""
260 parent = el.parent if el is not None else None
261 if no_iframe and parent is not None and self.is_iframe(parent): # pragma: no cover
262 parent = None
263 return parent
265 @staticmethod
266 def get_tag_name(el: bs4.Tag | None) -> str | None:
267 """Get tag."""
269 return el.name if el is not None else None
271 @staticmethod
272 def get_prefix_name(el: bs4.Tag) -> str | None:
273 """Get prefix."""
275 return el.prefix
277 @staticmethod
278 def get_uri(el: bs4.Tag | None) -> str | None:
279 """Get namespace `URI`."""
281 return el.namespace if el is not None else None
283 @classmethod
284 def get_next_tag(cls, el: bs4.Tag) -> bs4.Tag | None:
285 """Get next sibling tag."""
287 return cls.get_next(el, tags=True) # type: ignore[return-value]
289 @classmethod
290 def get_next(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None:
291 """Get next sibling tag."""
293 sibling = el.next_sibling
294 while tags and not isinstance(sibling, bs4.Tag) and sibling is not None:
295 sibling = sibling.next_sibling
297 if tags and not isinstance(sibling, bs4.Tag):
298 sibling = None
300 return sibling
302 @classmethod
303 def get_previous_tag(cls, el: bs4.Tag, tags: bool = True) -> bs4.Tag | None:
304 """Get previous sibling tag."""
306 return cls.get_previous(el, True) # type: ignore[return-value]
308 @classmethod
309 def get_previous(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None:
310 """Get previous sibling tag."""
312 sibling = el.previous_sibling
313 while tags and not isinstance(sibling, bs4.Tag) and sibling is not None:
314 sibling = sibling.previous_sibling
316 if tags and not isinstance(sibling, bs4.Tag):
317 sibling = None
319 return sibling
321 @staticmethod
322 def has_html_ns(el: bs4.Tag | None) -> bool:
323 """
324 Check if element has an HTML namespace.
326 This is a bit different than whether a element is treated as having an HTML namespace,
327 like we do in the case of `is_html_tag`.
328 """
330 ns = getattr(el, 'namespace') if el is not None else None # noqa: B009
331 return bool(ns and ns == NS_XHTML)
333 @staticmethod
334 def split_namespace(el: bs4.Tag | None, attr_name: str) -> tuple[str | None, str | None]:
335 """Return namespace and attribute name without the prefix."""
337 if el is None: # pragma: no cover
338 return None, None
340 return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
342 @classmethod
343 def normalize_value(cls, value: Any) -> str | Sequence[str]:
344 """Normalize the value to be a string or list of strings."""
346 # Treat `None` as empty string.
347 if value is None:
348 return ''
350 # Pass through strings
351 if (isinstance(value, str)):
352 return value
354 # If it's a byte string, convert it to Unicode, treating it as UTF-8.
355 if isinstance(value, bytes):
356 return value.decode("utf8")
358 # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
359 if isinstance(value, Sequence):
360 new_value = []
361 for v in value:
362 if not isinstance(v, (str, bytes)) and isinstance(v, Sequence):
363 # This is most certainly a user error and will crash and burn later.
364 # To keep things working, we'll do what we do with all objects,
365 # And convert them to strings.
366 new_value.append(str(v))
367 else:
368 # Convert the child to a string
369 new_value.append(cast(str, cls.normalize_value(v)))
370 return new_value
372 # Try and make anything else a string
373 return str(value)
375 @classmethod
376 def get_attribute_by_name(
377 cls,
378 el: bs4.Tag,
379 name: str,
380 default: str | Sequence[str] | None = None
381 ) -> str | Sequence[str] | None:
382 """Get attribute by name."""
384 value = default
385 if el._is_xml:
386 try:
387 value = cls.normalize_value(el.attrs[name])
388 except KeyError:
389 pass
390 else:
391 for k, v in el.attrs.items():
392 if util.lower(k) == name:
393 value = cls.normalize_value(v)
394 break
395 return value
397 @classmethod
398 def iter_attributes(cls, el: bs4.Tag | None) -> Iterator[tuple[str, str | Sequence[str] | None]]:
399 """Iterate attributes."""
401 if el is not None:
402 for k, v in el.attrs.items():
403 yield k, cls.normalize_value(v)
405 @classmethod
406 def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
407 """Get classes."""
409 classes = cls.get_attribute_by_name(el, 'class', [])
410 if isinstance(classes, str):
411 classes = RE_NOT_WS.findall(classes)
412 return cast(Sequence[str], classes)
414 def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
415 """Get text."""
417 return ''.join(
418 [
419 node for node in self.get_descendants(el, no_iframe=no_iframe) # type: ignore[misc]
420 if self.is_content_string(node)
421 ]
422 )
424 def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
425 """Get Own Text."""
427 return [
428 node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node) # type: ignore[misc]
429 ]
432class Inputs:
433 """Class for parsing and validating input items."""
435 @staticmethod
436 def validate_day(year: int, month: int, day: int) -> bool:
437 """Validate day."""
439 max_days = LONG_MONTH
440 if month == FEB:
441 max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
442 elif month in MONTHS_30:
443 max_days = SHORT_MONTH
444 return 1 <= day <= max_days
446 @staticmethod
447 def validate_week(year: int, week: int) -> bool:
448 """Validate week."""
450 # Validate an ISO week number for `year`.
451 #
452 # Per ISO 8601 rules, the last ISO week of a year is the week
453 # containing Dec 28. Using Dec 28 guarantees we obtain the
454 # correct ISO week-number for the final week of `year`, even in
455 # years where Dec 31 falls in ISO week 01 of the following year.
456 #
457 # Example: if Dec 31 is a Thursday the year's last ISO week will
458 # be week 53; if Dec 31 is a Monday and that week is counted as
459 # week 1 of the next year, Dec 28 still belongs to the final
460 # week of the current ISO year and yields the correct max week.
461 max_week = datetime(year, 12, 28).isocalendar()[1]
462 return 1 <= week <= max_week
464 @staticmethod
465 def validate_month(month: int) -> bool:
466 """Validate month."""
468 return 1 <= month <= 12
470 @staticmethod
471 def validate_year(year: int) -> bool:
472 """Validate year."""
474 return 1 <= year
476 @staticmethod
477 def validate_hour(hour: int) -> bool:
478 """Validate hour."""
480 return 0 <= hour <= 23
482 @staticmethod
483 def validate_minutes(minutes: int) -> bool:
484 """Validate minutes."""
486 return 0 <= minutes <= 59
488 @classmethod
489 def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
490 """Parse the input value."""
492 parsed = None # type: tuple[float, ...] | None
493 if value is None:
494 return value
495 if itype == "date":
496 m = RE_DATE.match(value)
497 if m:
498 year = int(m.group('year'), 10)
499 month = int(m.group('month'), 10)
500 day = int(m.group('day'), 10)
501 if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
502 parsed = (year, month, day)
503 elif itype == "month":
504 m = RE_MONTH.match(value)
505 if m:
506 year = int(m.group('year'), 10)
507 month = int(m.group('month'), 10)
508 if cls.validate_year(year) and cls.validate_month(month):
509 parsed = (year, month)
510 elif itype == "week":
511 m = RE_WEEK.match(value)
512 if m:
513 year = int(m.group('year'), 10)
514 week = int(m.group('week'), 10)
515 if cls.validate_year(year) and cls.validate_week(year, week):
516 parsed = (year, week)
517 elif itype == "time":
518 m = RE_TIME.match(value)
519 if m:
520 hour = int(m.group('hour'), 10)
521 minutes = int(m.group('minutes'), 10)
522 if cls.validate_hour(hour) and cls.validate_minutes(minutes):
523 parsed = (hour, minutes)
524 elif itype == "datetime-local":
525 m = RE_DATETIME.match(value)
526 if m:
527 year = int(m.group('year'), 10)
528 month = int(m.group('month'), 10)
529 day = int(m.group('day'), 10)
530 hour = int(m.group('hour'), 10)
531 minutes = int(m.group('minutes'), 10)
532 if (
533 cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
534 cls.validate_hour(hour) and cls.validate_minutes(minutes)
535 ):
536 parsed = (year, month, day, hour, minutes)
537 elif itype in ("number", "range"):
538 m = RE_NUM.match(value)
539 if m:
540 parsed = (float(m.group('value')),)
541 return parsed
544class CSSMatch(_DocumentNav):
545 """Perform CSS matching."""
547 def __init__(
548 self,
549 selectors: ct.SelectorList,
550 scope: bs4.Tag | None,
551 namespaces: ct.Namespaces | None,
552 flags: int
553 ) -> None:
554 """Initialize."""
556 self.assert_valid_input(scope)
557 self.tag = scope
558 self.cached_meta_lang = [] # type: list[tuple[str, str]]
559 self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]]
560 self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]]
561 self.selectors = selectors
562 self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str]
563 self.flags = flags
564 self.iframe_restrict = False
566 # Find the root element for the whole tree
567 doc = scope
568 parent = self.get_parent(doc)
569 while parent:
570 doc = parent
571 parent = self.get_parent(doc)
572 root = None # type: bs4.Tag | None
573 if not self.is_doc(doc):
574 root = doc
575 else:
576 for child in self.get_tag_children(doc):
577 root = child
578 break
580 self.root = root
581 self.scope = scope if scope is not doc else root
582 self.has_html_namespace = self.has_html_ns(root)
584 # A document can be both XML and HTML (XHTML)
585 self.is_xml = self.is_xml_tree(doc)
586 self.is_html = not self.is_xml or self.has_html_namespace
588 def supports_namespaces(self) -> bool:
589 """Check if namespaces are supported in the HTML type."""
591 return self.is_xml or self.has_html_namespace
593 def get_tag_ns(self, el: bs4.Tag | None) -> str:
594 """Get tag namespace."""
596 namespace = ''
597 if el is None: # pragma: no cover
598 return namespace
600 if self.supports_namespaces():
601 ns = self.get_uri(el)
602 if ns:
603 namespace = ns
604 else:
605 namespace = NS_XHTML
606 return namespace
608 def is_html_tag(self, el: bs4.Tag | None) -> bool:
609 """Check if tag is in HTML namespace."""
611 return self.get_tag_ns(el) == NS_XHTML
613 def get_tag(self, el: bs4.Tag | None) -> str | None:
614 """Get tag."""
616 name = self.get_tag_name(el)
617 return util.lower(name) if name is not None and not self.is_xml else name
619 def get_prefix(self, el: bs4.Tag) -> str | None:
620 """Get prefix."""
622 prefix = self.get_prefix_name(el)
623 return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
625 def find_bidi(self, el: bs4.Tag) -> int | None:
626 """Get directionality from element text."""
628 for node in self.get_children(el):
630 # Analyze child text nodes
631 if self.is_tag(node):
633 # Avoid analyzing certain elements specified in the specification.
634 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) # type: ignore[arg-type]
635 name = self.get_tag(node) # type: ignore[arg-type]
636 if (
637 (name and name in ('bdi', 'script', 'style', 'textarea', 'iframe')) or
638 not self.is_html_tag(node) or # type: ignore[arg-type]
639 direction is not None
640 ):
641 continue # pragma: no cover
643 # Check directionality of this node's text
644 value = self.find_bidi(node) # type: ignore[arg-type]
645 if value is not None:
646 return value
648 # Direction could not be determined
649 continue # pragma: no cover
651 # Skip `doctype` comments, etc.
652 if self.is_special_string(node):
653 continue
655 # Analyze text nodes for directionality.
656 for c in node: # type: ignore[attr-defined]
657 bidi = unicodedata.bidirectional(c)
658 if bidi in ('AL', 'R', 'L'):
659 return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
660 return None
662 def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:
663 """Filter the language tags."""
665 match = True
666 lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
667 ranges = lang_range.split('-')
668 subtags = lang_tag.lower().split('-')
669 length = len(ranges)
670 slength = len(subtags)
671 rindex = 0
672 sindex = 0
673 r = ranges[rindex]
674 s = subtags[sindex]
676 # Empty specified language should match unspecified language attributes
677 if length == 1 and slength == 1 and not r and r == s:
678 return True
680 # Primary tag needs to match
681 if (r != '*' and r != s) or (r == '*' and slength == 1 and not s):
682 match = False
684 rindex += 1
685 sindex += 1
687 # Match until we run out of ranges
688 while match and rindex < length:
689 r = ranges[rindex]
690 try:
691 s = subtags[sindex]
692 except IndexError:
693 # Ran out of subtags,
694 # but we still have ranges
695 match = False
696 continue
698 # Empty range
699 if not r:
700 match = False
701 continue
703 # Matched range
704 elif s == r:
705 rindex += 1
707 # Implicit wildcard cannot match
708 # singletons
709 elif len(s) == 1:
710 match = False
711 continue
713 # Implicitly matched, so grab next subtag
714 sindex += 1
716 return match
718 def match_attribute_name(
719 self,
720 el: bs4.Tag,
721 attr: str,
722 prefix: str | None
723 ) -> str | Sequence[str] | None:
724 """Match attribute name and return value if it exists."""
726 value = None
727 if self.supports_namespaces():
728 value = None
729 # If we have not defined namespaces, we can't very well find them, so don't bother trying.
730 if prefix:
731 ns = self.namespaces.get(prefix)
732 if ns is None and prefix != '*':
733 return None
734 else:
735 ns = None
737 for k, v in self.iter_attributes(el):
739 # Get attribute parts
740 namespace, name = self.split_namespace(el, k)
742 # Can't match a prefix attribute as we haven't specified one to match
743 # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
744 if ns is None:
745 if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
746 value = v
747 break
748 # Coverage is not finding this even though it is executed.
749 # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
750 # Ignore the false positive message.
751 continue # pragma: no cover
753 # We can't match our desired prefix attribute as the attribute doesn't have a prefix
754 if namespace is None or (ns != namespace and prefix != '*'):
755 continue
757 # The attribute doesn't match.
758 if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
759 continue
761 value = v
762 break
763 else:
764 for k, v in self.iter_attributes(el):
765 if util.lower(attr) != util.lower(k):
766 continue
767 value = v
768 break
769 return value
771 def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
772 """Match the namespace of the element."""
774 match = True
775 namespace = self.get_tag_ns(el)
776 default_namespace = self.namespaces.get('')
777 tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix)
778 # We must match the default namespace if one is not provided
779 if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
780 match = False
781 # If we specified `|tag`, we must not have a namespace.
782 elif (tag.prefix is not None and tag.prefix == '' and namespace):
783 match = False
784 # Verify prefix matches
785 elif (
786 tag.prefix and
787 tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
788 ):
789 match = False
790 return match
792 def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
793 """Match attributes."""
795 match = True
796 if attributes:
797 for a in attributes:
798 temp = self.match_attribute_name(el, a.attribute, a.prefix)
799 pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
800 if temp is None:
801 match = False
802 break
803 value = temp if isinstance(temp, str) else ' '.join(temp)
804 if pattern is None:
805 continue
806 elif pattern.match(value) is None:
807 match = False
808 break
809 return match
811 def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
812 """Match tag name."""
814 name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
815 return not (
816 name is not None and
817 name not in (self.get_tag(el), '*')
818 )
820 def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
821 """Match the tag."""
823 match = True
824 if tag is not None:
825 # Verify namespace
826 if not self.match_namespace(el, tag):
827 match = False
828 if not self.match_tagname(el, tag):
829 match = False
830 return match
832 def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
833 """Match past relationship."""
835 found = False
836 # I don't think this can ever happen, but it makes `mypy` happy
837 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
838 return found
840 if relation[0].rel_type == REL_PARENT:
841 parent = self.get_parent(el, no_iframe=self.iframe_restrict)
842 while not found and parent:
843 found = self.match_selectors(parent, relation)
844 parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
845 elif relation[0].rel_type == REL_CLOSE_PARENT:
846 parent = self.get_parent(el, no_iframe=self.iframe_restrict)
847 if parent:
848 found = self.match_selectors(parent, relation)
849 elif relation[0].rel_type == REL_SIBLING:
850 sibling = self.get_previous_tag(el)
851 while not found and sibling:
852 found = self.match_selectors(sibling, relation)
853 sibling = self.get_previous_tag(sibling)
854 elif relation[0].rel_type == REL_CLOSE_SIBLING:
855 sibling = self.get_previous_tag(el)
856 if sibling and self.is_tag(sibling):
857 found = self.match_selectors(sibling, relation)
858 return found
860 def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
861 """Match future child."""
863 match = False
864 if recursive:
865 children = self.get_tag_descendants # type: Callable[..., Iterator[bs4.Tag]]
866 else:
867 children = self.get_tag_children
868 for child in children(parent, no_iframe=self.iframe_restrict):
869 match = self.match_selectors(child, relation)
870 if match:
871 break
872 return match
874 def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
875 """Match future relationship."""
877 found = False
878 # I don't think this can ever happen, but it makes `mypy` happy
879 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
880 return found
882 if relation[0].rel_type == REL_HAS_PARENT:
883 found = self.match_future_child(el, relation, True)
884 elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
885 found = self.match_future_child(el, relation)
886 elif relation[0].rel_type == REL_HAS_SIBLING:
887 sibling = self.get_next_tag(el)
888 while not found and sibling:
889 found = self.match_selectors(sibling, relation)
890 sibling = self.get_next_tag(sibling)
891 elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
892 sibling = self.get_next_tag(el)
893 if sibling and self.is_tag(sibling):
894 found = self.match_selectors(sibling, relation)
895 return found
897 def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
898 """Match relationship to other elements."""
900 found = False
902 if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None:
903 return found
905 if relation[0].rel_type.startswith(':'):
906 found = self.match_future_relations(el, relation)
907 else:
908 found = self.match_past_relations(el, relation)
910 return found
912 def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
913 """Match element's ID."""
915 found = True
916 for i in ids:
917 if i != self.get_attribute_by_name(el, 'id', ''):
918 found = False
919 break
920 return found
922 def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
923 """Match element's classes."""
925 current_classes = self.get_classes(el)
926 found = True
927 for c in classes:
928 if c not in current_classes:
929 found = False
930 break
931 return found
933 def match_root(self, el: bs4.Tag) -> bool:
934 """Match element as root."""
936 is_root = self.is_root(el)
937 if is_root:
938 sibling = self.get_previous(el) # type: Any
939 while is_root and sibling is not None:
940 if (
941 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
942 self.is_cdata(sibling)
943 ):
944 is_root = False
945 else:
946 sibling = self.get_previous(sibling)
947 if is_root:
948 sibling = self.get_next(el)
949 while is_root and sibling is not None:
950 if (
951 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
952 self.is_cdata(sibling)
953 ):
954 is_root = False
955 else:
956 sibling = self.get_next(sibling)
957 return is_root
959 def match_scope(self, el: bs4.Tag) -> bool:
960 """Match element as scope."""
962 return self.scope is el
964 def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
965 """Match tag type for `nth` matches."""
967 return (
968 (self.get_tag(child) == self.get_tag(el)) and
969 (self.get_tag_ns(child) == self.get_tag_ns(el))
970 )
972 def match_nth(self, el: bs4.Tag, nth: tuple[ct.SelectorNth, ...]) -> bool:
973 """Match `nth` elements."""
975 matched = True
977 for n in nth:
978 matched = False
979 if n.selectors and not self.match_selectors(el, n.selectors):
980 break
981 parent = self.get_parent(el) # type: bs4.Tag | None
982 if parent is None:
983 parent = cast('bs4.Tag', self.create_fake_parent(el))
984 last = n.last
985 last_index = len(parent) - 1
986 index = last_index if last else 0
987 relative_index = 0
988 a = n.a
989 b = n.b
990 var = n.n
991 count = 0
992 count_incr = 1
993 factor = -1 if last else 1
994 idx = last_idx = a * count + b if var else a
996 # We can only adjust bounds within a variable index
997 if var:
998 # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
999 # Otherwise, increment to try to get in bounds.
1000 adjust = None
1001 while idx < 1 or idx > last_index:
1002 if idx < 0:
1003 diff_low = 0 - idx
1004 if adjust is not None and adjust == 1:
1005 break
1006 adjust = -1
1007 count += count_incr
1008 idx = last_idx = a * count + b if var else a
1009 diff = 0 - idx
1010 if diff >= diff_low:
1011 break
1012 else:
1013 diff_high = idx - last_index
1014 if adjust is not None and adjust == -1:
1015 break
1016 adjust = 1
1017 count += count_incr
1018 idx = last_idx = a * count + b if var else a
1019 diff = idx - last_index
1020 if diff >= diff_high:
1021 break
1022 diff_high = diff
1024 # If a < 0, our count is working backwards, so floor the index by increasing the count.
1025 # Find the count that yields the lowest, in bound value and use that.
1026 # Lastly reverse count increment so that we'll increase our index.
1027 lowest = count
1028 if a < 0:
1029 while idx >= 1:
1030 lowest = count
1031 count += count_incr
1032 idx = last_idx = a * count + b if var else a
1033 count_incr = -1
1034 count = lowest
1035 idx = last_idx = a * count + b if var else a
1037 # Evaluate elements while our calculated nth index is still in range
1038 while 1 <= idx <= last_index + 1:
1039 child = None # type: bs4.element.PageElement | None
1040 # Evaluate while our child index is still in range.
1041 for child in self.get_children(parent, start=index, reverse=factor < 0):
1042 index += factor
1043 if not isinstance(child, bs4.Tag):
1044 continue
1045 # Handle `of S` in `nth-child`
1046 if n.selectors and not self.match_selectors(child, n.selectors):
1047 continue
1048 # Handle `of-type`
1049 if n.of_type and not self.match_nth_tag_type(el, child):
1050 continue
1051 relative_index += 1
1052 if relative_index == idx:
1053 if child is el:
1054 matched = True
1055 else:
1056 break
1057 if child is el:
1058 break
1059 if child is el:
1060 break
1061 last_idx = idx
1062 count += count_incr
1063 if count < 0:
1064 # Count is counting down and has now ventured into invalid territory.
1065 break
1066 idx = a * count + b if var else a
1067 if last_idx == idx:
1068 break
1069 if not matched:
1070 break
1071 return matched
1073 def match_empty(self, el: bs4.Tag) -> bool:
1074 """Check if element is empty (if requested)."""
1076 is_empty = True
1077 for child in self.get_children(el):
1078 if self.is_tag(child):
1079 is_empty = False
1080 break
1081 elif self.is_content_string(child) and RE_NOT_EMPTY.search(child): # type: ignore[call-overload]
1082 is_empty = False
1083 break
1084 return is_empty
1086 def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
1087 """Match selectors."""
1089 match = True
1090 for sel in selectors:
1091 if not self.match_selectors(el, sel):
1092 match = False
1093 return match
1095 def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
1096 """Match element if it contains text."""
1098 match = True
1099 content = None # type: str | Sequence[str] | None
1100 for contain_list in contains:
1101 if content is None:
1102 if contain_list.own:
1103 content = self.get_own_text(el, no_iframe=self.is_html)
1104 else:
1105 content = self.get_text(el, no_iframe=self.is_html)
1106 found = False
1107 for text in contain_list.text:
1108 if contain_list.own:
1109 for c in content:
1110 if text in c:
1111 found = True
1112 break
1113 if found:
1114 break
1115 else:
1116 if text in content:
1117 found = True
1118 break
1119 if not found:
1120 match = False
1121 return match
1123 def match_default(self, el: bs4.Tag) -> bool:
1124 """Match default."""
1126 match = False
1128 # Find this input's form
1129 form = None # type: bs4.Tag | None
1130 parent = self.get_parent(el, no_iframe=True)
1131 while parent and form is None:
1132 if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
1133 form = parent
1134 else:
1135 parent = self.get_parent(parent, no_iframe=True)
1137 if form is not None:
1138 # Look in form cache to see if we've already located its default button
1139 found_form = False
1140 for f, t in self.cached_default_forms:
1141 if f is form:
1142 found_form = True
1143 if t is el:
1144 match = True
1145 break
1147 # We didn't have the form cached, so look for its default button
1148 if not found_form:
1149 for child in self.get_tag_descendants(form, no_iframe=True):
1150 name = self.get_tag(child)
1151 # Can't do nested forms (haven't figured out why we never hit this)
1152 if name == 'form': # pragma: no cover
1153 break
1154 if name in ('input', 'button'):
1155 v = self.get_attribute_by_name(child, 'type', '')
1156 if v and util.lower(v) == 'submit':
1157 self.cached_default_forms.append((form, child))
1158 if el is child:
1159 match = True
1160 break
1161 return match
1163 def match_indeterminate(self, el: bs4.Tag) -> bool:
1164 """Match default."""
1166 match = False
1167 name = cast(str, self.get_attribute_by_name(el, 'name'))
1169 def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
1170 """Find this input's form."""
1171 form = None
1172 parent = self.get_parent(el, no_iframe=True)
1173 while form is None:
1174 if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
1175 form = parent
1176 break
1177 last_parent = parent
1178 parent = self.get_parent(parent, no_iframe=True)
1179 if parent is None:
1180 form = last_parent
1181 break
1182 return form
1184 form = get_parent_form(el)
1186 # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
1187 if form is not None:
1188 found_form = False
1189 for f, n, i in self.cached_indeterminate_forms:
1190 if f is form and n == name:
1191 found_form = True
1192 if i is True:
1193 match = True
1194 break
1196 # We didn't have the form cached, so validate that the radio button is indeterminate
1197 if not found_form:
1198 checked = False
1199 for child in self.get_tag_descendants(form, no_iframe=True):
1200 if child is el:
1201 continue
1202 tag_name = self.get_tag(child)
1203 if tag_name == 'input':
1204 is_radio = False
1205 check = False
1206 has_name = False
1207 for k, v in self.iter_attributes(child):
1208 if util.lower(k) == 'type' and util.lower(v) == 'radio':
1209 is_radio = True
1210 elif util.lower(k) == 'name' and v == name:
1211 has_name = True
1212 elif util.lower(k) == 'checked':
1213 check = True
1214 if is_radio and check and has_name and get_parent_form(child) is form:
1215 checked = True
1216 break
1217 if checked:
1218 break
1219 if not checked:
1220 match = True
1221 self.cached_indeterminate_forms.append((form, name, match))
1223 return match
1225 def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
1226 """Match languages."""
1228 match = False
1229 has_ns = self.supports_namespaces()
1230 root = self.root
1231 has_html_namespace = self.has_html_namespace
1233 # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
1234 parent = el # type: bs4.Tag | None
1235 found_lang = None
1236 last = None
1237 while not found_lang:
1238 has_html_ns = self.has_html_ns(parent)
1239 for k, v in self.iter_attributes(parent):
1240 attr_ns, attr = self.split_namespace(parent, k)
1241 if (
1242 ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
1243 (
1244 has_ns and not has_html_ns and attr_ns == NS_XML and
1245 (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
1246 )
1247 ):
1248 found_lang = v
1249 break
1250 last = parent
1251 parent = self.get_parent(parent, no_iframe=self.is_html)
1253 if parent is None:
1254 root = last
1255 has_html_namespace = self.has_html_ns(root)
1256 parent = last
1257 break
1259 # Use cached meta language.
1260 if found_lang is None and self.cached_meta_lang:
1261 for cache in self.cached_meta_lang:
1262 if root is not None and cast(str, root) is cache[0]:
1263 found_lang = cache[1]
1265 # If we couldn't find a language, and the document is HTML, look to meta to determine language.
1266 if found_lang is None and (not self.is_xml or (has_html_namespace and root and root.name == 'html')):
1267 # Find head
1268 found = False
1269 for tag in ('html', 'head'):
1270 found = False
1271 for child in self.get_tag_children(parent, no_iframe=self.is_html):
1272 if self.get_tag(child) == tag and self.is_html_tag(child):
1273 found = True
1274 parent = child
1275 break
1276 if not found: # pragma: no cover
1277 break
1279 # Search meta tags
1280 if found and parent is not None:
1281 for child2 in parent:
1282 if isinstance(child2, bs4.Tag) and self.get_tag(child2) == 'meta' and self.is_html_tag(parent):
1283 c_lang = False
1284 content = None
1285 for k, v in self.iter_attributes(child2):
1286 if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
1287 c_lang = True
1288 if util.lower(k) == 'content':
1289 content = v
1290 if c_lang and content:
1291 found_lang = content
1292 self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
1293 break
1294 if found_lang is not None:
1295 break
1296 if found_lang is None:
1297 self.cached_meta_lang.append((cast(str, root), ''))
1299 # If we determined a language, compare.
1300 if found_lang is not None:
1301 for patterns in langs:
1302 match = False
1303 for pattern in patterns:
1304 if self.extended_language_filter(pattern, cast(str, found_lang)):
1305 match = True
1306 if not match:
1307 break
1309 return match
1311 def match_dir(self, el: bs4.Tag | None, directionality: int) -> bool:
1312 """Check directionality."""
1314 # If we have to match both left and right, we can't match either.
1315 if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
1316 return False
1318 if el is None or not self.is_html_tag(el):
1319 return False
1321 # Element has defined direction of left to right or right to left
1322 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
1323 if direction not in (None, 0):
1324 return direction == directionality
1326 # Element is the document element (the root) and no direction assigned, assume left to right.
1327 is_root = self.is_root(el)
1328 if is_root and direction is None:
1329 return ct.SEL_DIR_LTR == directionality
1331 # If `input[type=telephone]` and no direction is assigned, assume left to right.
1332 name = self.get_tag(el)
1333 is_input = name == 'input'
1334 is_textarea = name == 'textarea'
1335 is_bdi = name == 'bdi'
1336 itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
1337 if is_input and itype == 'tel' and direction is None:
1338 return ct.SEL_DIR_LTR == directionality
1340 # Auto handling for text inputs
1341 if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
1342 if is_textarea:
1343 value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node)) # type: ignore[misc]
1344 else:
1345 value = cast(str, self.get_attribute_by_name(el, 'value', ''))
1346 if value:
1347 for c in value:
1348 bidi = unicodedata.bidirectional(c)
1349 if bidi in ('AL', 'R', 'L'):
1350 direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
1351 return direction == directionality
1352 # Assume left to right
1353 return ct.SEL_DIR_LTR == directionality
1354 elif is_root:
1355 return ct.SEL_DIR_LTR == directionality
1356 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1358 # Auto handling for `bdi` and other non text inputs.
1359 if (is_bdi and direction is None) or direction == 0:
1360 direction = self.find_bidi(el)
1361 if direction is not None:
1362 return direction == directionality
1363 elif is_root:
1364 return ct.SEL_DIR_LTR == directionality
1365 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1367 # Match parents direction
1368 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1370 def match_range(self, el: bs4.Tag, condition: int) -> bool:
1371 """
1372 Match range.
1374 Behavior is modeled after what we see in browsers. Browsers seem to evaluate
1375 if the value is out of range, and if not, it is in range. So a missing value
1376 will not evaluate out of range; therefore, value is in range. Personally, I
1377 feel like this should evaluate as neither in or out of range.
1378 """
1380 out_of_range = False
1382 itype = util.lower(self.get_attribute_by_name(el, 'type'))
1383 mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
1384 mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))
1386 # There is no valid min or max, so we cannot evaluate a range
1387 if mn is None and mx is None:
1388 return False
1390 value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))
1391 if value is not None:
1392 if itype in ("date", "datetime-local", "month", "week", "number", "range"):
1393 if mn is not None and value < mn:
1394 out_of_range = True
1395 if not out_of_range and mx is not None and value > mx:
1396 out_of_range = True
1397 elif itype == "time":
1398 if mn is not None and mx is not None and mn > mx:
1399 # Time is periodic, so this is a reversed/discontinuous range
1400 if value < mn and value > mx:
1401 out_of_range = True
1402 else:
1403 if mn is not None and value < mn:
1404 out_of_range = True
1405 if not out_of_range and mx is not None and value > mx:
1406 out_of_range = True
1408 return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
1410 def match_defined(self, el: bs4.Tag) -> bool:
1411 """
1412 Match defined.
1414 `:defined` is related to custom elements in a browser.
1416 - If the document is XML (not XHTML), all tags will match.
1417 - Tags that are not custom (don't have a hyphen) are marked defined.
1418 - If the tag has a prefix (without or without a namespace), it will not match.
1420 This is of course requires the parser to provide us with the proper prefix and namespace info,
1421 if it doesn't, there is nothing we can do.
1422 """
1424 name = self.get_tag(el)
1425 return (
1426 name is not None and (
1427 name.find('-') == -1 or
1428 name.find(':') != -1 or
1429 self.get_prefix(el) is not None
1430 )
1431 )
1433 def match_placeholder_shown(self, el: bs4.Tag) -> bool:
1434 """
1435 Match placeholder shown according to HTML spec.
1437 - text area should be checked if they have content. A single newline does not count as content.
1439 """
1441 match = False
1442 content = self.get_text(el)
1443 if content in ('', '\n'):
1444 match = True
1446 return match
1448 def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
1449 """Check if element matches one of the selectors."""
1451 match = False
1452 is_not = selectors.is_not
1453 is_html = selectors.is_html
1455 # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
1456 if is_html:
1457 namespaces = self.namespaces
1458 iframe_restrict = self.iframe_restrict
1459 self.namespaces = {'html': NS_XHTML}
1460 self.iframe_restrict = True
1462 if not is_html or self.is_html:
1463 for selector in selectors:
1464 match = is_not
1465 # We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
1466 if isinstance(selector, ct.SelectorNull):
1467 continue
1468 # Verify tag matches
1469 if not self.match_tag(el, selector.tag):
1470 continue
1471 # Verify tag is defined
1472 if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
1473 continue
1474 # Verify element is root
1475 if selector.flags & ct.SEL_ROOT and not self.match_root(el):
1476 continue
1477 # Verify element is scope
1478 if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
1479 continue
1480 # Verify element has placeholder shown
1481 if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
1482 continue
1483 # Verify `nth` matches
1484 if not self.match_nth(el, selector.nth):
1485 continue
1486 if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
1487 continue
1488 # Verify id matches
1489 if selector.ids and not self.match_id(el, selector.ids):
1490 continue
1491 # Verify classes match
1492 if selector.classes and not self.match_classes(el, selector.classes):
1493 continue
1494 # Verify attribute(s) match
1495 if not self.match_attributes(el, selector.attributes):
1496 continue
1497 # Verify ranges
1498 if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
1499 continue
1500 # Verify language patterns
1501 if selector.lang and not self.match_lang(el, selector.lang):
1502 continue
1503 # Verify pseudo selector patterns
1504 if selector.selectors and not self.match_subselectors(el, selector.selectors):
1505 continue
1506 # Verify relationship selectors
1507 if selector.relation and not self.match_relations(el, selector.relation):
1508 continue
1509 # Validate that the current default selector match corresponds to the first submit button in the form
1510 if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
1511 continue
1512 # Validate that the unset radio button is among radio buttons with the same name in a form that are
1513 # also not set.
1514 if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
1515 continue
1516 # Validate element directionality
1517 if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
1518 continue
1519 # Validate that the tag contains the specified text.
1520 if selector.contains and not self.match_contains(el, selector.contains):
1521 continue
1522 match = not is_not
1523 break
1525 # Restore actual namespaces being used for external selector lists
1526 if is_html:
1527 self.namespaces = namespaces
1528 self.iframe_restrict = iframe_restrict
1530 return match
1532 def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
1533 """Match all tags under the targeted tag."""
1535 lim = None if limit < 1 else limit
1537 for child in self.get_tag_descendants(self.tag):
1538 if self.match(child):
1539 yield child
1540 if lim is not None:
1541 lim -= 1
1542 if lim < 1:
1543 break
1545 def closest(self) -> bs4.Tag | None:
1546 """Match closest ancestor."""
1548 current = self.tag # type: bs4.Tag | None
1549 closest = None
1550 while closest is None and current is not None:
1551 if self.match(current):
1552 closest = current
1553 else:
1554 current = self.get_parent(current)
1555 return closest
1557 def filter(self) -> list[bs4.Tag]: # noqa A001
1558 """Filter tag's children."""
1560 return [
1561 tag for tag in self.get_contents(self.tag)
1562 if isinstance(tag, bs4.Tag) and self.match(tag)
1563 ]
1565 def match(self, el: bs4.Tag) -> bool:
1566 """Match."""
1568 return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
1571class SoupSieve(ct.Immutable):
1572 """Compiled Soup Sieve selector matching object."""
1574 pattern: str
1575 selectors: ct.SelectorList
1576 namespaces: ct.Namespaces | None
1577 custom: dict[str, str]
1578 flags: int
1580 __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")
1582 def __init__(
1583 self,
1584 pattern: str,
1585 selectors: ct.SelectorList,
1586 namespaces: ct.Namespaces | None,
1587 custom: ct.CustomSelectors | None,
1588 flags: int
1589 ):
1590 """Initialize."""
1592 super().__init__(
1593 pattern=pattern,
1594 selectors=selectors,
1595 namespaces=namespaces,
1596 custom=custom,
1597 flags=flags
1598 )
1600 def match(self, tag: bs4.Tag) -> bool:
1601 """Match."""
1603 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)
1605 def closest(self, tag: bs4.Tag) -> bs4.Tag | None:
1606 """Match closest ancestor."""
1608 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()
1610 def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001
1611 """
1612 Filter.
1614 `CSSMatch` can cache certain searches for tags of the same document,
1615 so if we are given a tag, all tags are from the same document,
1616 and we can take advantage of the optimization.
1618 Any other kind of iterable could have tags from different documents or detached tags,
1619 so for those, we use a new `CSSMatch` for each item in the iterable.
1620 """
1622 if isinstance(iterable, bs4.Tag):
1623 return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()
1624 else:
1625 return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
1627 def select_one(self, tag: bs4.Tag) -> bs4.Tag | None:
1628 """Select a single tag."""
1630 tags = self.select(tag, limit=1)
1631 return tags[0] if tags else None
1633 def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
1634 """Select the specified tags."""
1636 return list(self.iselect(tag, limit))
1638 def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
1639 """Iterate the specified tags."""
1641 yield from CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit)
1643 def __repr__(self) -> str: # pragma: no cover
1644 """Representation."""
1646 return (
1647 f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, "
1648 f"custom={self.custom!r}, flags={self.flags!r})"
1649 )
1651 __str__ = __repr__
1654ct.pickle_register(SoupSieve)