1"""CSS matcher."""
2from __future__ import annotations
3from datetime import datetime
4from . import util
5import re
6from . import css_types as ct
7import unicodedata
8import bs4 # type: ignore[import-untyped]
9from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401
10
11# Empty tag pattern (whitespace okay)
12RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
13
14RE_NOT_WS = re.compile('[^ \t\r\n\f]+')
15
16# Relationships
17REL_PARENT = ' '
18REL_CLOSE_PARENT = '>'
19REL_SIBLING = '~'
20REL_CLOSE_SIBLING = '+'
21
22# Relationships for :has() (forward looking)
23REL_HAS_PARENT = ': '
24REL_HAS_CLOSE_PARENT = ':>'
25REL_HAS_SIBLING = ':~'
26REL_HAS_CLOSE_SIBLING = ':+'
27
28NS_XHTML = 'http://www.w3.org/1999/xhtml'
29NS_XML = 'http://www.w3.org/XML/1998/namespace'
30
31DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
32RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE
33
34DIR_MAP = {
35 'ltr': ct.SEL_DIR_LTR,
36 'rtl': ct.SEL_DIR_RTL,
37 'auto': 0
38}
39
40RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
41RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
42RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
43RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
44RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
45RE_DATETIME = re.compile(
46 r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
47)
48RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')
49
50MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November
51FEB = 2
52SHORT_MONTH = 30
53LONG_MONTH = 31
54FEB_MONTH = 28
55FEB_LEAP_MONTH = 29
56DAYS_IN_WEEK = 7
57
58
59class _FakeParent:
60 """
61 Fake parent class.
62
63 When we have a fragment with no `BeautifulSoup` document object,
64 we can't evaluate `nth` selectors properly. Create a temporary
65 fake parent so we can traverse the root element as a child.
66 """
67
68 def __init__(self, element: bs4.Tag) -> None:
69 """Initialize."""
70
71 self.contents = [element]
72
73 def __len__(self) -> bs4.PageElement:
74 """Length."""
75
76 return len(self.contents)
77
78
79class _DocumentNav:
80 """Navigate a Beautiful Soup document."""
81
82 @classmethod
83 def assert_valid_input(cls, tag: Any) -> None:
84 """Check if valid input tag or document."""
85
86 # Fail on unexpected types.
87 if not cls.is_tag(tag):
88 raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}")
89
90 @staticmethod
91 def is_doc(obj: bs4.Tag) -> bool:
92 """Is `BeautifulSoup` object."""
93 return isinstance(obj, bs4.BeautifulSoup)
94
95 @staticmethod
96 def is_tag(obj: bs4.PageElement) -> bool:
97 """Is tag."""
98 return isinstance(obj, bs4.Tag)
99
100 @staticmethod
101 def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover
102 """Is declaration."""
103 return isinstance(obj, bs4.Declaration)
104
105 @staticmethod
106 def is_cdata(obj: bs4.PageElement) -> bool:
107 """Is CDATA."""
108 return isinstance(obj, bs4.CData)
109
110 @staticmethod
111 def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover
112 """Is processing instruction."""
113 return isinstance(obj, bs4.ProcessingInstruction)
114
115 @staticmethod
116 def is_navigable_string(obj: bs4.PageElement) -> bool:
117 """Is navigable string."""
118 return isinstance(obj, bs4.NavigableString)
119
120 @staticmethod
121 def is_special_string(obj: bs4.PageElement) -> bool:
122 """Is special string."""
123 return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
124
125 @classmethod
126 def is_content_string(cls, obj: bs4.PageElement) -> bool:
127 """Check if node is content string."""
128
129 return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
130
131 @staticmethod
132 def create_fake_parent(el: bs4.Tag) -> _FakeParent:
133 """Create fake parent for a given element."""
134
135 return _FakeParent(el)
136
137 @staticmethod
138 def is_xml_tree(el: bs4.Tag) -> bool:
139 """Check if element (or document) is from a XML tree."""
140
141 return bool(el._is_xml)
142
143 def is_iframe(self, el: bs4.Tag) -> bool:
144 """Check if element is an `iframe`."""
145
146 return bool(
147 ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and
148 self.is_html_tag(el) # type: ignore[attr-defined]
149 )
150
151 def is_root(self, el: bs4.Tag) -> bool:
152 """
153 Return whether element is a root element.
154
155 We check that the element is the root of the tree (which we have already pre-calculated),
156 and we check if it is the root element under an `iframe`.
157 """
158
159 root = self.root and self.root is el # type: ignore[attr-defined]
160 if not root:
161 parent = self.get_parent(el)
162 root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]
163 return root
164
165 def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]:
166 """Get contents or contents in reverse."""
167 if not no_iframe or not self.is_iframe(el):
168 yield from el.contents
169
170 def get_children(
171 self,
172 el: bs4.Tag,
173 start: int | None = None,
174 reverse: bool = False,
175 tags: bool = True,
176 no_iframe: bool = False
177 ) -> Iterator[bs4.PageElement]:
178 """Get children."""
179
180 if not no_iframe or not self.is_iframe(el):
181 last = len(el.contents) - 1
182 if start is None:
183 index = last if reverse else 0
184 else:
185 index = start
186 end = -1 if reverse else last + 1
187 incr = -1 if reverse else 1
188
189 if 0 <= index <= last:
190 while index != end:
191 node = el.contents[index]
192 index += incr
193 if not tags or self.is_tag(node):
194 yield node
195
196 def get_descendants(
197 self,
198 el: bs4.Tag,
199 tags: bool = True,
200 no_iframe: bool = False
201 ) -> Iterator[bs4.PageElement]:
202 """Get descendants."""
203
204 if not no_iframe or not self.is_iframe(el):
205 next_good = None
206 for child in el.descendants:
207
208 if next_good is not None:
209 if child is not next_good:
210 continue
211 next_good = None
212
213 is_tag = self.is_tag(child)
214
215 if no_iframe and is_tag and self.is_iframe(child):
216 if child.next_sibling is not None:
217 next_good = child.next_sibling
218 else:
219 last_child = child
220 while self.is_tag(last_child) and last_child.contents:
221 last_child = last_child.contents[-1]
222 next_good = last_child.next_element
223 yield child
224 if next_good is None:
225 break
226 # Coverage isn't seeing this even though it's executed
227 continue # pragma: no cover
228
229 if not tags or is_tag:
230 yield child
231
232 def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag:
233 """Get parent."""
234
235 parent = el.parent
236 if no_iframe and parent is not None and self.is_iframe(parent):
237 parent = None
238 return parent
239
240 @staticmethod
241 def get_tag_name(el: bs4.Tag) -> str | None:
242 """Get tag."""
243
244 return cast('str | None', el.name)
245
246 @staticmethod
247 def get_prefix_name(el: bs4.Tag) -> str | None:
248 """Get prefix."""
249
250 return cast('str | None', el.prefix)
251
252 @staticmethod
253 def get_uri(el: bs4.Tag) -> str | None:
254 """Get namespace `URI`."""
255
256 return cast('str | None', el.namespace)
257
258 @classmethod
259 def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
260 """Get next sibling tag."""
261
262 sibling = el.next_sibling
263 while tags and not cls.is_tag(sibling) and sibling is not None:
264 sibling = sibling.next_sibling
265 return sibling
266
267 @classmethod
268 def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
269 """Get previous sibling tag."""
270
271 sibling = el.previous_sibling
272 while tags and not cls.is_tag(sibling) and sibling is not None:
273 sibling = sibling.previous_sibling
274 return sibling
275
276 @staticmethod
277 def has_html_ns(el: bs4.Tag) -> bool:
278 """
279 Check if element has an HTML namespace.
280
281 This is a bit different than whether a element is treated as having an HTML namespace,
282 like we do in the case of `is_html_tag`.
283 """
284
285 ns = getattr(el, 'namespace') if el else None # noqa: B009
286 return bool(ns and ns == NS_XHTML)
287
288 @staticmethod
289 def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]:
290 """Return namespace and attribute name without the prefix."""
291
292 return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
293
294 @classmethod
295 def normalize_value(cls, value: Any) -> str | Sequence[str]:
296 """Normalize the value to be a string or list of strings."""
297
298 # Treat `None` as empty string.
299 if value is None:
300 return ''
301
302 # Pass through strings
303 if (isinstance(value, str)):
304 return value
305
306 # If it's a byte string, convert it to Unicode, treating it as UTF-8.
307 if isinstance(value, bytes):
308 return value.decode("utf8")
309
310 # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
311 if isinstance(value, Sequence):
312 new_value = []
313 for v in value:
314 if not isinstance(v, (str, bytes)) and isinstance(v, Sequence):
315 # This is most certainly a user error and will crash and burn later.
316 # To keep things working, we'll do what we do with all objects,
317 # And convert them to strings.
318 new_value.append(str(v))
319 else:
320 # Convert the child to a string
321 new_value.append(cast(str, cls.normalize_value(v)))
322 return new_value
323
324 # Try and make anything else a string
325 return str(value)
326
327 @classmethod
328 def get_attribute_by_name(
329 cls,
330 el: bs4.Tag,
331 name: str,
332 default: str | Sequence[str] | None = None
333 ) -> str | Sequence[str] | None:
334 """Get attribute by name."""
335
336 value = default
337 if el._is_xml:
338 try:
339 value = cls.normalize_value(el.attrs[name])
340 except KeyError:
341 pass
342 else:
343 for k, v in el.attrs.items():
344 if util.lower(k) == name:
345 value = cls.normalize_value(v)
346 break
347 return value
348
349 @classmethod
350 def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]:
351 """Iterate attributes."""
352
353 for k, v in el.attrs.items():
354 yield k, cls.normalize_value(v)
355
356 @classmethod
357 def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
358 """Get classes."""
359
360 classes = cls.get_attribute_by_name(el, 'class', [])
361 if isinstance(classes, str):
362 classes = RE_NOT_WS.findall(classes)
363 return cast(Sequence[str], classes)
364
365 def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
366 """Get text."""
367
368 return ''.join(
369 [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
370 )
371
372 def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
373 """Get Own Text."""
374
375 return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
376
377
378class Inputs:
379 """Class for parsing and validating input items."""
380
381 @staticmethod
382 def validate_day(year: int, month: int, day: int) -> bool:
383 """Validate day."""
384
385 max_days = LONG_MONTH
386 if month == FEB:
387 max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
388 elif month in MONTHS_30:
389 max_days = SHORT_MONTH
390 return 1 <= day <= max_days
391
392 @staticmethod
393 def validate_week(year: int, week: int) -> bool:
394 """Validate week."""
395
396 max_week = datetime.strptime(f"{12}-{31}-{year}", "%m-%d-%Y").isocalendar()[1]
397 if max_week == 1:
398 max_week = 53
399 return 1 <= week <= max_week
400
401 @staticmethod
402 def validate_month(month: int) -> bool:
403 """Validate month."""
404
405 return 1 <= month <= 12
406
407 @staticmethod
408 def validate_year(year: int) -> bool:
409 """Validate year."""
410
411 return 1 <= year
412
413 @staticmethod
414 def validate_hour(hour: int) -> bool:
415 """Validate hour."""
416
417 return 0 <= hour <= 23
418
419 @staticmethod
420 def validate_minutes(minutes: int) -> bool:
421 """Validate minutes."""
422
423 return 0 <= minutes <= 59
424
425 @classmethod
426 def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
427 """Parse the input value."""
428
429 parsed = None # type: tuple[float, ...] | None
430 if value is None:
431 return value
432 if itype == "date":
433 m = RE_DATE.match(value)
434 if m:
435 year = int(m.group('year'), 10)
436 month = int(m.group('month'), 10)
437 day = int(m.group('day'), 10)
438 if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
439 parsed = (year, month, day)
440 elif itype == "month":
441 m = RE_MONTH.match(value)
442 if m:
443 year = int(m.group('year'), 10)
444 month = int(m.group('month'), 10)
445 if cls.validate_year(year) and cls.validate_month(month):
446 parsed = (year, month)
447 elif itype == "week":
448 m = RE_WEEK.match(value)
449 if m:
450 year = int(m.group('year'), 10)
451 week = int(m.group('week'), 10)
452 if cls.validate_year(year) and cls.validate_week(year, week):
453 parsed = (year, week)
454 elif itype == "time":
455 m = RE_TIME.match(value)
456 if m:
457 hour = int(m.group('hour'), 10)
458 minutes = int(m.group('minutes'), 10)
459 if cls.validate_hour(hour) and cls.validate_minutes(minutes):
460 parsed = (hour, minutes)
461 elif itype == "datetime-local":
462 m = RE_DATETIME.match(value)
463 if m:
464 year = int(m.group('year'), 10)
465 month = int(m.group('month'), 10)
466 day = int(m.group('day'), 10)
467 hour = int(m.group('hour'), 10)
468 minutes = int(m.group('minutes'), 10)
469 if (
470 cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
471 cls.validate_hour(hour) and cls.validate_minutes(minutes)
472 ):
473 parsed = (year, month, day, hour, minutes)
474 elif itype in ("number", "range"):
475 m = RE_NUM.match(value)
476 if m:
477 parsed = (float(m.group('value')),)
478 return parsed
479
480
481class CSSMatch(_DocumentNav):
482 """Perform CSS matching."""
483
484 def __init__(
485 self,
486 selectors: ct.SelectorList,
487 scope: bs4.Tag,
488 namespaces: ct.Namespaces | None,
489 flags: int
490 ) -> None:
491 """Initialize."""
492
493 self.assert_valid_input(scope)
494 self.tag = scope
495 self.cached_meta_lang = [] # type: list[tuple[str, str]]
496 self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]]
497 self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]]
498 self.selectors = selectors
499 self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str]
500 self.flags = flags
501 self.iframe_restrict = False
502
503 # Find the root element for the whole tree
504 doc = scope
505 parent = self.get_parent(doc)
506 while parent:
507 doc = parent
508 parent = self.get_parent(doc)
509 root = None
510 if not self.is_doc(doc):
511 root = doc
512 else:
513 for child in self.get_children(doc):
514 root = child
515 break
516
517 self.root = root
518 self.scope = scope if scope is not doc else root
519 self.has_html_namespace = self.has_html_ns(root)
520
521 # A document can be both XML and HTML (XHTML)
522 self.is_xml = self.is_xml_tree(doc)
523 self.is_html = not self.is_xml or self.has_html_namespace
524
525 def supports_namespaces(self) -> bool:
526 """Check if namespaces are supported in the HTML type."""
527
528 return self.is_xml or self.has_html_namespace
529
530 def get_tag_ns(self, el: bs4.Tag) -> str:
531 """Get tag namespace."""
532
533 if self.supports_namespaces():
534 namespace = ''
535 ns = self.get_uri(el)
536 if ns:
537 namespace = ns
538 else:
539 namespace = NS_XHTML
540 return namespace
541
542 def is_html_tag(self, el: bs4.Tag) -> bool:
543 """Check if tag is in HTML namespace."""
544
545 return self.get_tag_ns(el) == NS_XHTML
546
547 def get_tag(self, el: bs4.Tag) -> str | None:
548 """Get tag."""
549
550 name = self.get_tag_name(el)
551 return util.lower(name) if name is not None and not self.is_xml else name
552
553 def get_prefix(self, el: bs4.Tag) -> str | None:
554 """Get prefix."""
555
556 prefix = self.get_prefix_name(el)
557 return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
558
559 def find_bidi(self, el: bs4.Tag) -> int | None:
560 """Get directionality from element text."""
561
562 for node in self.get_children(el, tags=False):
563
564 # Analyze child text nodes
565 if self.is_tag(node):
566
567 # Avoid analyzing certain elements specified in the specification.
568 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
569 if (
570 self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or
571 not self.is_html_tag(node) or
572 direction is not None
573 ):
574 continue # pragma: no cover
575
576 # Check directionality of this node's text
577 value = self.find_bidi(node)
578 if value is not None:
579 return value
580
581 # Direction could not be determined
582 continue # pragma: no cover
583
584 # Skip `doctype` comments, etc.
585 if self.is_special_string(node):
586 continue
587
588 # Analyze text nodes for directionality.
589 for c in node:
590 bidi = unicodedata.bidirectional(c)
591 if bidi in ('AL', 'R', 'L'):
592 return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
593 return None
594
595 def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:
596 """Filter the language tags."""
597
598 match = True
599 lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
600 ranges = lang_range.split('-')
601 subtags = lang_tag.lower().split('-')
602 length = len(ranges)
603 slength = len(subtags)
604 rindex = 0
605 sindex = 0
606 r = ranges[rindex]
607 s = subtags[sindex]
608
609 # Empty specified language should match unspecified language attributes
610 if length == 1 and slength == 1 and not r and r == s:
611 return True
612
613 # Primary tag needs to match
614 if (r != '*' and r != s) or (r == '*' and slength == 1 and not s):
615 match = False
616
617 rindex += 1
618 sindex += 1
619
620 # Match until we run out of ranges
621 while match and rindex < length:
622 r = ranges[rindex]
623 try:
624 s = subtags[sindex]
625 except IndexError:
626 # Ran out of subtags,
627 # but we still have ranges
628 match = False
629 continue
630
631 # Empty range
632 if not r:
633 match = False
634 continue
635
636 # Matched range
637 elif s == r:
638 rindex += 1
639
640 # Implicit wildcard cannot match
641 # singletons
642 elif len(s) == 1:
643 match = False
644 continue
645
646 # Implicitly matched, so grab next subtag
647 sindex += 1
648
649 return match
650
651 def match_attribute_name(
652 self,
653 el: bs4.Tag,
654 attr: str,
655 prefix: str | None
656 ) -> str | Sequence[str] | None:
657 """Match attribute name and return value if it exists."""
658
659 value = None
660 if self.supports_namespaces():
661 value = None
662 # If we have not defined namespaces, we can't very well find them, so don't bother trying.
663 if prefix:
664 ns = self.namespaces.get(prefix)
665 if ns is None and prefix != '*':
666 return None
667 else:
668 ns = None
669
670 for k, v in self.iter_attributes(el):
671
672 # Get attribute parts
673 namespace, name = self.split_namespace(el, k)
674
675 # Can't match a prefix attribute as we haven't specified one to match
676 # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
677 if ns is None:
678 if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
679 value = v
680 break
681 # Coverage is not finding this even though it is executed.
682 # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
683 # Ignore the false positive message.
684 continue # pragma: no cover
685
686 # We can't match our desired prefix attribute as the attribute doesn't have a prefix
687 if namespace is None or ns != namespace and prefix != '*':
688 continue
689
690 # The attribute doesn't match.
691 if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
692 continue
693
694 value = v
695 break
696 else:
697 for k, v in self.iter_attributes(el):
698 if util.lower(attr) != util.lower(k):
699 continue
700 value = v
701 break
702 return value
703
704 def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
705 """Match the namespace of the element."""
706
707 match = True
708 namespace = self.get_tag_ns(el)
709 default_namespace = self.namespaces.get('')
710 tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix)
711 # We must match the default namespace if one is not provided
712 if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
713 match = False
714 # If we specified `|tag`, we must not have a namespace.
715 elif (tag.prefix is not None and tag.prefix == '' and namespace):
716 match = False
717 # Verify prefix matches
718 elif (
719 tag.prefix and
720 tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
721 ):
722 match = False
723 return match
724
725 def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
726 """Match attributes."""
727
728 match = True
729 if attributes:
730 for a in attributes:
731 temp = self.match_attribute_name(el, a.attribute, a.prefix)
732 pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
733 if temp is None:
734 match = False
735 break
736 value = temp if isinstance(temp, str) else ' '.join(temp)
737 if pattern is None:
738 continue
739 elif pattern.match(value) is None:
740 match = False
741 break
742 return match
743
744 def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
745 """Match tag name."""
746
747 name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
748 return not (
749 name is not None and
750 name not in (self.get_tag(el), '*')
751 )
752
753 def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
754 """Match the tag."""
755
756 match = True
757 if tag is not None:
758 # Verify namespace
759 if not self.match_namespace(el, tag):
760 match = False
761 if not self.match_tagname(el, tag):
762 match = False
763 return match
764
765 def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
766 """Match past relationship."""
767
768 found = False
769 # I don't think this can ever happen, but it makes `mypy` happy
770 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
771 return found
772
773 if relation[0].rel_type == REL_PARENT:
774 parent = self.get_parent(el, no_iframe=self.iframe_restrict)
775 while not found and parent:
776 found = self.match_selectors(parent, relation)
777 parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
778 elif relation[0].rel_type == REL_CLOSE_PARENT:
779 parent = self.get_parent(el, no_iframe=self.iframe_restrict)
780 if parent:
781 found = self.match_selectors(parent, relation)
782 elif relation[0].rel_type == REL_SIBLING:
783 sibling = self.get_previous(el)
784 while not found and sibling:
785 found = self.match_selectors(sibling, relation)
786 sibling = self.get_previous(sibling)
787 elif relation[0].rel_type == REL_CLOSE_SIBLING:
788 sibling = self.get_previous(el)
789 if sibling and self.is_tag(sibling):
790 found = self.match_selectors(sibling, relation)
791 return found
792
793 def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
794 """Match future child."""
795
796 match = False
797 if recursive:
798 children = self.get_descendants # type: Callable[..., Iterator[bs4.Tag]]
799 else:
800 children = self.get_children
801 for child in children(parent, no_iframe=self.iframe_restrict):
802 match = self.match_selectors(child, relation)
803 if match:
804 break
805 return match
806
807 def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
808 """Match future relationship."""
809
810 found = False
811 # I don't think this can ever happen, but it makes `mypy` happy
812 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
813 return found
814
815 if relation[0].rel_type == REL_HAS_PARENT:
816 found = self.match_future_child(el, relation, True)
817 elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
818 found = self.match_future_child(el, relation)
819 elif relation[0].rel_type == REL_HAS_SIBLING:
820 sibling = self.get_next(el)
821 while not found and sibling:
822 found = self.match_selectors(sibling, relation)
823 sibling = self.get_next(sibling)
824 elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
825 sibling = self.get_next(el)
826 if sibling and self.is_tag(sibling):
827 found = self.match_selectors(sibling, relation)
828 return found
829
830 def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
831 """Match relationship to other elements."""
832
833 found = False
834
835 if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None:
836 return found
837
838 if relation[0].rel_type.startswith(':'):
839 found = self.match_future_relations(el, relation)
840 else:
841 found = self.match_past_relations(el, relation)
842
843 return found
844
845 def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
846 """Match element's ID."""
847
848 found = True
849 for i in ids:
850 if i != self.get_attribute_by_name(el, 'id', ''):
851 found = False
852 break
853 return found
854
855 def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
856 """Match element's classes."""
857
858 current_classes = self.get_classes(el)
859 found = True
860 for c in classes:
861 if c not in current_classes:
862 found = False
863 break
864 return found
865
866 def match_root(self, el: bs4.Tag) -> bool:
867 """Match element as root."""
868
869 is_root = self.is_root(el)
870 if is_root:
871 sibling = self.get_previous(el, tags=False)
872 while is_root and sibling is not None:
873 if (
874 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
875 self.is_cdata(sibling)
876 ):
877 is_root = False
878 else:
879 sibling = self.get_previous(sibling, tags=False)
880 if is_root:
881 sibling = self.get_next(el, tags=False)
882 while is_root and sibling is not None:
883 if (
884 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
885 self.is_cdata(sibling)
886 ):
887 is_root = False
888 else:
889 sibling = self.get_next(sibling, tags=False)
890 return is_root
891
892 def match_scope(self, el: bs4.Tag) -> bool:
893 """Match element as scope."""
894
895 return self.scope is el
896
897 def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
898 """Match tag type for `nth` matches."""
899
900 return (
901 (self.get_tag(child) == self.get_tag(el)) and
902 (self.get_tag_ns(child) == self.get_tag_ns(el))
903 )
904
905 def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool:
906 """Match `nth` elements."""
907
908 matched = True
909
910 for n in nth:
911 matched = False
912 if n.selectors and not self.match_selectors(el, n.selectors):
913 break
914 parent = self.get_parent(el)
915 if parent is None:
916 parent = self.create_fake_parent(el)
917 last = n.last
918 last_index = len(parent) - 1
919 index = last_index if last else 0
920 relative_index = 0
921 a = n.a
922 b = n.b
923 var = n.n
924 count = 0
925 count_incr = 1
926 factor = -1 if last else 1
927 idx = last_idx = a * count + b if var else a
928
929 # We can only adjust bounds within a variable index
930 if var:
931 # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
932 # Otherwise, increment to try to get in bounds.
933 adjust = None
934 while idx < 1 or idx > last_index:
935 if idx < 0:
936 diff_low = 0 - idx
937 if adjust is not None and adjust == 1:
938 break
939 adjust = -1
940 count += count_incr
941 idx = last_idx = a * count + b if var else a
942 diff = 0 - idx
943 if diff >= diff_low:
944 break
945 else:
946 diff_high = idx - last_index
947 if adjust is not None and adjust == -1:
948 break
949 adjust = 1
950 count += count_incr
951 idx = last_idx = a * count + b if var else a
952 diff = idx - last_index
953 if diff >= diff_high:
954 break
955 diff_high = diff
956
957 # If a < 0, our count is working backwards, so floor the index by increasing the count.
958 # Find the count that yields the lowest, in bound value and use that.
959 # Lastly reverse count increment so that we'll increase our index.
960 lowest = count
961 if a < 0:
962 while idx >= 1:
963 lowest = count
964 count += count_incr
965 idx = last_idx = a * count + b if var else a
966 count_incr = -1
967 count = lowest
968 idx = last_idx = a * count + b if var else a
969
970 # Evaluate elements while our calculated nth index is still in range
971 while 1 <= idx <= last_index + 1:
972 child = None
973 # Evaluate while our child index is still in range.
974 for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
975 index += factor
976 if not self.is_tag(child):
977 continue
978 # Handle `of S` in `nth-child`
979 if n.selectors and not self.match_selectors(child, n.selectors):
980 continue
981 # Handle `of-type`
982 if n.of_type and not self.match_nth_tag_type(el, child):
983 continue
984 relative_index += 1
985 if relative_index == idx:
986 if child is el:
987 matched = True
988 else:
989 break
990 if child is el:
991 break
992 if child is el:
993 break
994 last_idx = idx
995 count += count_incr
996 if count < 0:
997 # Count is counting down and has now ventured into invalid territory.
998 break
999 idx = a * count + b if var else a
1000 if last_idx == idx:
1001 break
1002 if not matched:
1003 break
1004 return matched
1005
1006 def match_empty(self, el: bs4.Tag) -> bool:
1007 """Check if element is empty (if requested)."""
1008
1009 is_empty = True
1010 for child in self.get_children(el, tags=False):
1011 if self.is_tag(child):
1012 is_empty = False
1013 break
1014 elif self.is_content_string(child) and RE_NOT_EMPTY.search(child):
1015 is_empty = False
1016 break
1017 return is_empty
1018
1019 def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
1020 """Match selectors."""
1021
1022 match = True
1023 for sel in selectors:
1024 if not self.match_selectors(el, sel):
1025 match = False
1026 return match
1027
1028 def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
1029 """Match element if it contains text."""
1030
1031 match = True
1032 content = None # type: str | Sequence[str] | None
1033 for contain_list in contains:
1034 if content is None:
1035 if contain_list.own:
1036 content = self.get_own_text(el, no_iframe=self.is_html)
1037 else:
1038 content = self.get_text(el, no_iframe=self.is_html)
1039 found = False
1040 for text in contain_list.text:
1041 if contain_list.own:
1042 for c in content:
1043 if text in c:
1044 found = True
1045 break
1046 if found:
1047 break
1048 else:
1049 if text in content:
1050 found = True
1051 break
1052 if not found:
1053 match = False
1054 return match
1055
1056 def match_default(self, el: bs4.Tag) -> bool:
1057 """Match default."""
1058
1059 match = False
1060
1061 # Find this input's form
1062 form = None
1063 parent = self.get_parent(el, no_iframe=True)
1064 while parent and form is None:
1065 if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
1066 form = parent
1067 else:
1068 parent = self.get_parent(parent, no_iframe=True)
1069
1070 # Look in form cache to see if we've already located its default button
1071 found_form = False
1072 for f, t in self.cached_default_forms:
1073 if f is form:
1074 found_form = True
1075 if t is el:
1076 match = True
1077 break
1078
1079 # We didn't have the form cached, so look for its default button
1080 if not found_form:
1081 for child in self.get_descendants(form, no_iframe=True):
1082 name = self.get_tag(child)
1083 # Can't do nested forms (haven't figured out why we never hit this)
1084 if name == 'form': # pragma: no cover
1085 break
1086 if name in ('input', 'button'):
1087 v = self.get_attribute_by_name(child, 'type', '')
1088 if v and util.lower(v) == 'submit':
1089 self.cached_default_forms.append((form, child))
1090 if el is child:
1091 match = True
1092 break
1093 return match
1094
1095 def match_indeterminate(self, el: bs4.Tag) -> bool:
1096 """Match default."""
1097
1098 match = False
1099 name = cast(str, self.get_attribute_by_name(el, 'name'))
1100
1101 def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
1102 """Find this input's form."""
1103 form = None
1104 parent = self.get_parent(el, no_iframe=True)
1105 while form is None:
1106 if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
1107 form = parent
1108 break
1109 last_parent = parent
1110 parent = self.get_parent(parent, no_iframe=True)
1111 if parent is None:
1112 form = last_parent
1113 break
1114 return form
1115
1116 form = get_parent_form(el)
1117
1118 # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
1119 found_form = False
1120 for f, n, i in self.cached_indeterminate_forms:
1121 if f is form and n == name:
1122 found_form = True
1123 if i is True:
1124 match = True
1125 break
1126
1127 # We didn't have the form cached, so validate that the radio button is indeterminate
1128 if not found_form:
1129 checked = False
1130 for child in self.get_descendants(form, no_iframe=True):
1131 if child is el:
1132 continue
1133 tag_name = self.get_tag(child)
1134 if tag_name == 'input':
1135 is_radio = False
1136 check = False
1137 has_name = False
1138 for k, v in self.iter_attributes(child):
1139 if util.lower(k) == 'type' and util.lower(v) == 'radio':
1140 is_radio = True
1141 elif util.lower(k) == 'name' and v == name:
1142 has_name = True
1143 elif util.lower(k) == 'checked':
1144 check = True
1145 if is_radio and check and has_name and get_parent_form(child) is form:
1146 checked = True
1147 break
1148 if checked:
1149 break
1150 if not checked:
1151 match = True
1152 self.cached_indeterminate_forms.append((form, name, match))
1153
1154 return match
1155
1156 def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
1157 """Match languages."""
1158
1159 match = False
1160 has_ns = self.supports_namespaces()
1161 root = self.root
1162 has_html_namespace = self.has_html_namespace
1163
1164 # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
1165 parent = el
1166 found_lang = None
1167 last = None
1168 while not found_lang:
1169 has_html_ns = self.has_html_ns(parent)
1170 for k, v in self.iter_attributes(parent):
1171 attr_ns, attr = self.split_namespace(parent, k)
1172 if (
1173 ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
1174 (
1175 has_ns and not has_html_ns and attr_ns == NS_XML and
1176 (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
1177 )
1178 ):
1179 found_lang = v
1180 break
1181 last = parent
1182 parent = self.get_parent(parent, no_iframe=self.is_html)
1183
1184 if parent is None:
1185 root = last
1186 has_html_namespace = self.has_html_ns(root)
1187 parent = last
1188 break
1189
1190 # Use cached meta language.
1191 if found_lang is None and self.cached_meta_lang:
1192 for cache in self.cached_meta_lang:
1193 if root is cache[0]:
1194 found_lang = cache[1]
1195
1196 # If we couldn't find a language, and the document is HTML, look to meta to determine language.
1197 if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
1198 # Find head
1199 found = False
1200 for tag in ('html', 'head'):
1201 found = False
1202 for child in self.get_children(parent, no_iframe=self.is_html):
1203 if self.get_tag(child) == tag and self.is_html_tag(child):
1204 found = True
1205 parent = child
1206 break
1207 if not found: # pragma: no cover
1208 break
1209
1210 # Search meta tags
1211 if found:
1212 for child in parent:
1213 if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
1214 c_lang = False
1215 content = None
1216 for k, v in self.iter_attributes(child):
1217 if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
1218 c_lang = True
1219 if util.lower(k) == 'content':
1220 content = v
1221 if c_lang and content:
1222 found_lang = content
1223 self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
1224 break
1225 if found_lang is not None:
1226 break
1227 if found_lang is None:
1228 self.cached_meta_lang.append((cast(str, root), ''))
1229
1230 # If we determined a language, compare.
1231 if found_lang is not None:
1232 for patterns in langs:
1233 match = False
1234 for pattern in patterns:
1235 if self.extended_language_filter(pattern, cast(str, found_lang)):
1236 match = True
1237 if not match:
1238 break
1239
1240 return match
1241
1242 def match_dir(self, el: bs4.Tag, directionality: int) -> bool:
1243 """Check directionality."""
1244
1245 # If we have to match both left and right, we can't match either.
1246 if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
1247 return False
1248
1249 if el is None or not self.is_html_tag(el):
1250 return False
1251
1252 # Element has defined direction of left to right or right to left
1253 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
1254 if direction not in (None, 0):
1255 return direction == directionality
1256
1257 # Element is the document element (the root) and no direction assigned, assume left to right.
1258 is_root = self.is_root(el)
1259 if is_root and direction is None:
1260 return ct.SEL_DIR_LTR == directionality
1261
1262 # If `input[type=telephone]` and no direction is assigned, assume left to right.
1263 name = self.get_tag(el)
1264 is_input = name == 'input'
1265 is_textarea = name == 'textarea'
1266 is_bdi = name == 'bdi'
1267 itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
1268 if is_input and itype == 'tel' and direction is None:
1269 return ct.SEL_DIR_LTR == directionality
1270
1271 # Auto handling for text inputs
1272 if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
1273 if is_textarea:
1274 value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node))
1275 else:
1276 value = cast(str, self.get_attribute_by_name(el, 'value', ''))
1277 if value:
1278 for c in value:
1279 bidi = unicodedata.bidirectional(c)
1280 if bidi in ('AL', 'R', 'L'):
1281 direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
1282 return direction == directionality
1283 # Assume left to right
1284 return ct.SEL_DIR_LTR == directionality
1285 elif is_root:
1286 return ct.SEL_DIR_LTR == directionality
1287 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1288
1289 # Auto handling for `bdi` and other non text inputs.
1290 if (is_bdi and direction is None) or direction == 0:
1291 direction = self.find_bidi(el)
1292 if direction is not None:
1293 return direction == directionality
1294 elif is_root:
1295 return ct.SEL_DIR_LTR == directionality
1296 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1297
1298 # Match parents direction
1299 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1300
1301 def match_range(self, el: bs4.Tag, condition: int) -> bool:
1302 """
1303 Match range.
1304
1305 Behavior is modeled after what we see in browsers. Browsers seem to evaluate
1306 if the value is out of range, and if not, it is in range. So a missing value
1307 will not evaluate out of range; therefore, value is in range. Personally, I
1308 feel like this should evaluate as neither in or out of range.
1309 """
1310
1311 out_of_range = False
1312
1313 itype = util.lower(self.get_attribute_by_name(el, 'type'))
1314 mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
1315 mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))
1316
1317 # There is no valid min or max, so we cannot evaluate a range
1318 if mn is None and mx is None:
1319 return False
1320
1321 value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))
1322 if value is not None:
1323 if itype in ("date", "datetime-local", "month", "week", "number", "range"):
1324 if mn is not None and value < mn:
1325 out_of_range = True
1326 if not out_of_range and mx is not None and value > mx:
1327 out_of_range = True
1328 elif itype == "time":
1329 if mn is not None and mx is not None and mn > mx:
1330 # Time is periodic, so this is a reversed/discontinuous range
1331 if value < mn and value > mx:
1332 out_of_range = True
1333 else:
1334 if mn is not None and value < mn:
1335 out_of_range = True
1336 if not out_of_range and mx is not None and value > mx:
1337 out_of_range = True
1338
1339 return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
1340
1341 def match_defined(self, el: bs4.Tag) -> bool:
1342 """
1343 Match defined.
1344
1345 `:defined` is related to custom elements in a browser.
1346
1347 - If the document is XML (not XHTML), all tags will match.
1348 - Tags that are not custom (don't have a hyphen) are marked defined.
1349 - If the tag has a prefix (without or without a namespace), it will not match.
1350
1351 This is of course requires the parser to provide us with the proper prefix and namespace info,
1352 if it doesn't, there is nothing we can do.
1353 """
1354
1355 name = self.get_tag(el)
1356 return (
1357 name is not None and (
1358 name.find('-') == -1 or
1359 name.find(':') != -1 or
1360 self.get_prefix(el) is not None
1361 )
1362 )
1363
1364 def match_placeholder_shown(self, el: bs4.Tag) -> bool:
1365 """
1366 Match placeholder shown according to HTML spec.
1367
1368 - text area should be checked if they have content. A single newline does not count as content.
1369
1370 """
1371
1372 match = False
1373 content = self.get_text(el)
1374 if content in ('', '\n'):
1375 match = True
1376
1377 return match
1378
1379 def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
1380 """Check if element matches one of the selectors."""
1381
1382 match = False
1383 is_not = selectors.is_not
1384 is_html = selectors.is_html
1385
1386 # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
1387 if is_html:
1388 namespaces = self.namespaces
1389 iframe_restrict = self.iframe_restrict
1390 self.namespaces = {'html': NS_XHTML}
1391 self.iframe_restrict = True
1392
1393 if not is_html or self.is_html:
1394 for selector in selectors:
1395 match = is_not
1396 # We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
1397 if isinstance(selector, ct.SelectorNull):
1398 continue
1399 # Verify tag matches
1400 if not self.match_tag(el, selector.tag):
1401 continue
1402 # Verify tag is defined
1403 if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
1404 continue
1405 # Verify element is root
1406 if selector.flags & ct.SEL_ROOT and not self.match_root(el):
1407 continue
1408 # Verify element is scope
1409 if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
1410 continue
1411 # Verify element has placeholder shown
1412 if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
1413 continue
1414 # Verify `nth` matches
1415 if not self.match_nth(el, selector.nth):
1416 continue
1417 if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
1418 continue
1419 # Verify id matches
1420 if selector.ids and not self.match_id(el, selector.ids):
1421 continue
1422 # Verify classes match
1423 if selector.classes and not self.match_classes(el, selector.classes):
1424 continue
1425 # Verify attribute(s) match
1426 if not self.match_attributes(el, selector.attributes):
1427 continue
1428 # Verify ranges
1429 if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
1430 continue
1431 # Verify language patterns
1432 if selector.lang and not self.match_lang(el, selector.lang):
1433 continue
1434 # Verify pseudo selector patterns
1435 if selector.selectors and not self.match_subselectors(el, selector.selectors):
1436 continue
1437 # Verify relationship selectors
1438 if selector.relation and not self.match_relations(el, selector.relation):
1439 continue
1440 # Validate that the current default selector match corresponds to the first submit button in the form
1441 if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
1442 continue
1443 # Validate that the unset radio button is among radio buttons with the same name in a form that are
1444 # also not set.
1445 if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
1446 continue
1447 # Validate element directionality
1448 if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
1449 continue
1450 # Validate that the tag contains the specified text.
1451 if selector.contains and not self.match_contains(el, selector.contains):
1452 continue
1453 match = not is_not
1454 break
1455
1456 # Restore actual namespaces being used for external selector lists
1457 if is_html:
1458 self.namespaces = namespaces
1459 self.iframe_restrict = iframe_restrict
1460
1461 return match
1462
1463 def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
1464 """Match all tags under the targeted tag."""
1465
1466 lim = None if limit < 1 else limit
1467
1468 for child in self.get_descendants(self.tag):
1469 if self.match(child):
1470 yield child
1471 if lim is not None:
1472 lim -= 1
1473 if lim < 1:
1474 break
1475
1476 def closest(self) -> bs4.Tag | None:
1477 """Match closest ancestor."""
1478
1479 current = self.tag
1480 closest = None
1481 while closest is None and current is not None:
1482 if self.match(current):
1483 closest = current
1484 else:
1485 current = self.get_parent(current)
1486 return closest
1487
1488 def filter(self) -> list[bs4.Tag]: # noqa A001
1489 """Filter tag's children."""
1490
1491 return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]
1492
1493 def match(self, el: bs4.Tag) -> bool:
1494 """Match."""
1495
1496 return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
1497
1498
1499class SoupSieve(ct.Immutable):
1500 """Compiled Soup Sieve selector matching object."""
1501
1502 pattern: str
1503 selectors: ct.SelectorList
1504 namespaces: ct.Namespaces | None
1505 custom: dict[str, str]
1506 flags: int
1507
1508 __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")
1509
1510 def __init__(
1511 self,
1512 pattern: str,
1513 selectors: ct.SelectorList,
1514 namespaces: ct.Namespaces | None,
1515 custom: ct.CustomSelectors | None,
1516 flags: int
1517 ):
1518 """Initialize."""
1519
1520 super().__init__(
1521 pattern=pattern,
1522 selectors=selectors,
1523 namespaces=namespaces,
1524 custom=custom,
1525 flags=flags
1526 )
1527
1528 def match(self, tag: bs4.Tag) -> bool:
1529 """Match."""
1530
1531 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)
1532
1533 def closest(self, tag: bs4.Tag) -> bs4.Tag:
1534 """Match closest ancestor."""
1535
1536 return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()
1537
1538 def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001
1539 """
1540 Filter.
1541
1542 `CSSMatch` can cache certain searches for tags of the same document,
1543 so if we are given a tag, all tags are from the same document,
1544 and we can take advantage of the optimization.
1545
1546 Any other kind of iterable could have tags from different documents or detached tags,
1547 so for those, we use a new `CSSMatch` for each item in the iterable.
1548 """
1549
1550 if CSSMatch.is_tag(iterable):
1551 return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()
1552 else:
1553 return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
1554
1555 def select_one(self, tag: bs4.Tag) -> bs4.Tag:
1556 """Select a single tag."""
1557
1558 tags = self.select(tag, limit=1)
1559 return tags[0] if tags else None
1560
1561 def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
1562 """Select the specified tags."""
1563
1564 return list(self.iselect(tag, limit))
1565
1566 def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
1567 """Iterate the specified tags."""
1568
1569 yield from CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit)
1570
1571 def __repr__(self) -> str: # pragma: no cover
1572 """Representation."""
1573
1574 return (
1575 f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, "
1576 f"custom={self.custom!r}, flags={self.flags!r})"
1577 )
1578
1579 __str__ = __repr__
1580
1581
1582ct.pickle_register(SoupSieve)