1"""CSS matcher."""
2from __future__ import annotations
3from datetime import datetime
4from . import util
5import re
6from . import css_types as ct
7import unicodedata
8import bs4
9from typing import Iterator, Iterable, Any, Callable, Sequence, Any, cast # noqa: F401, F811
10
# Empty tag pattern (whitespace okay): matches one character that is not
# a space, tab, carriage return, line feed, or form feed.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Matches a run of non-whitespace characters; used to split a `class`
# attribute string into individual class names.
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships (look back at ancestors and preceding siblings)
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking).
# These are the same combinators prefixed with `:`; `match_relations` uses
# that leading `:` to dispatch to the forward looking matcher.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs for XHTML and for the reserved `xml` prefix.
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined selector flag masks.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lowercased `dir` attribute value to its direction flag (`auto` maps to no flag).
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Patterns used by `Inputs.parse_value` to recognize input values.
# Years require at least four digits; every other field is exactly two digits.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
# Applied to a language range in `extended_language_filter`, where every match is replaced with `-`.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar constants used when validating days.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
57
58
class _FakeParent:
    """
    Stand-in parent for a root element that has no real parent.

    A fragment without a `BeautifulSoup` document object gives the root
    element no parent to index into, which breaks `nth` selector evaluation.
    This object exposes just enough (`contents` and a length) for the root
    element to be traversed as if it were a child.
    """

    def __init__(self, element: bs4.Tag) -> None:
        """Wrap `element` as the sole child of this fake parent."""

        # The wrapped element is the one and only child.
        self.contents = [element]

    def __len__(self) -> int:
        """Return the number of children held in `contents`."""

        children = self.contents
        return len(children)
77
78
class _DocumentNav:
    """Helpers for inspecting and walking a Beautiful Soup tree."""

    @classmethod
    def assert_valid_input(cls, tag: Any) -> None:
        """Raise `TypeError` unless `tag` is a valid input tag or document."""

        if cls.is_tag(tag):
            return

        # Fail on unexpected types.
        raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}")

    @staticmethod
    def is_doc(obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is a `BeautifulSoup` document object."""
        return isinstance(obj, bs4.BeautifulSoup)

    @staticmethod
    def is_tag(obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is a `Tag`."""
        return isinstance(obj, bs4.Tag)

    @staticmethod
    def is_declaration(obj: bs4.element.PageElement | None) -> bool:  # pragma: no cover
        """Return whether `obj` is a declaration."""
        return isinstance(obj, bs4.Declaration)

    @staticmethod
    def is_cdata(obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is a CDATA section."""
        return isinstance(obj, bs4.CData)

    @staticmethod
    def is_processing_instruction(obj: bs4.element.PageElement | None) -> bool:  # pragma: no cover
        """Return whether `obj` is a processing instruction."""
        return isinstance(obj, bs4.ProcessingInstruction)

    @staticmethod
    def is_navigable_string(obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is a navigable string of any kind."""
        return isinstance(obj, bs4.element.NavigableString)

    @staticmethod
    def is_special_string(obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is a comment, declaration, CDATA, processing instruction, or doctype."""
        return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))

    @classmethod
    def is_content_string(cls, obj: bs4.element.PageElement | None) -> bool:
        """Return whether `obj` is a navigable string that is not one of the special string types."""

        if not cls.is_navigable_string(obj):
            return False
        return not cls.is_special_string(obj)

    @staticmethod
    def create_fake_parent(el: bs4.Tag) -> _FakeParent:
        """Wrap `el` in a `_FakeParent` so it can be treated as a child."""

        return _FakeParent(el)

    @staticmethod
    def is_xml_tree(el: bs4.Tag | None) -> bool:
        """Return whether the element (or document) comes from an XML tree."""

        if el is None:
            return False
        return bool(el._is_xml)

    def is_iframe(self, el: bs4.Tag | None) -> bool:
        """Return whether `el` is an `iframe` element in the HTML namespace."""

        if el is None:  # pragma: no cover
            return False

        # Names are compared as-is in XML trees and lowercased otherwise.
        name = el.name if self.is_xml_tree(el) else util.lower(el.name)
        return bool(
            (name == 'iframe') and
            self.is_html_tag(el)  # type: ignore[attr-defined]
        )

    def is_root(self, el: bs4.Tag) -> bool:
        """
        Return whether `el` is a root element.

        An element is a root if it is the pre-calculated root of the tree,
        or if its parent is an `iframe` in an HTML document.
        """

        if self.root and self.root is el:  # type: ignore[attr-defined]
            return True

        parent = self.get_parent(el)
        return parent is not None and self.is_html and self.is_iframe(parent)  # type: ignore[attr-defined]

    def get_contents(self, el: bs4.Tag | None, no_iframe: bool = False) -> Iterator[bs4.element.PageElement]:
        """
        Yield the direct contents of `el` in document order.

        Nothing is yielded when `el` is `None`, or when `no_iframe` is set and `el` is an `iframe`.
        """

        if el is None:
            return
        if no_iframe and self.is_iframe(el):
            return
        yield from el.contents

    def get_tag_children(
        self,
        el: bs4.Tag | None,
        start: int | None = None,
        reverse: bool = False,
        no_iframe: bool = False
    ) -> Iterator[bs4.Tag]:
        """Yield only the tag children of `el` (see `get_children`)."""

        return self.get_children(  # type: ignore[return-value]
            el, start=start, reverse=reverse, tags=True, no_iframe=no_iframe
        )

    def get_children(
        self,
        el: bs4.Tag | None,
        start: int | None = None,
        reverse: bool = False,
        tags: bool = False,
        no_iframe: bool = False
    ) -> Iterator[bs4.element.PageElement]:
        """
        Yield the children of `el`.

        Iteration begins at index `start` (the first child, or the last when
        `reverse` is set, if `start` is `None`) and walks forward or backward
        to the end of the contents. With `tags` set, only tags are yielded.
        Nothing is yielded when `start` is out of range, `el` is `None`, or
        `no_iframe` is set and `el` is an `iframe`.
        """

        if el is None or (no_iframe and self.is_iframe(el)):
            return

        last = len(el.contents) - 1
        if reverse:
            step, stop = -1, -1
            position = last if start is None else start
        else:
            step, stop = 1, last + 1
            position = 0 if start is None else start

        if not 0 <= position <= last:
            return

        for i in range(position, stop, step):
            node = el.contents[i]
            if not tags or self.is_tag(node):
                yield node

    def get_tag_descendants(
        self,
        el: bs4.Tag | None,
        no_iframe: bool = False
    ) -> Iterator[bs4.Tag]:
        """Yield only the tag descendants of `el` (see `get_descendants`)."""

        yield from self.get_descendants(el, tags=True, no_iframe=no_iframe)  # type: ignore[misc]

    def get_descendants(
        self,
        el: bs4.Tag | None,
        tags: bool = False,
        no_iframe: bool = False
    ) -> Iterator[bs4.element.PageElement]:
        """
        Yield the descendants of `el` in document order.

        With `tags` set, only tags are yielded. With `no_iframe` set, an
        `iframe` element is itself yielded but its content is skipped, and
        nothing at all is yielded if `el` is an `iframe`.
        """

        if el is None or (no_iframe and self.is_iframe(el)):
            return

        # When set, every node is skipped until this exact node is reached.
        resume_at = None
        for node in el.descendants:

            if resume_at is not None:
                if node is not resume_at:
                    continue
                resume_at = None

            if not isinstance(node, bs4.Tag):
                if not tags:
                    yield node
                continue

            if no_iframe and self.is_iframe(node):
                # Resume at the sibling after the `iframe`; without one, resume at
                # whatever follows the deepest last descendant of the `iframe`.
                if node.next_sibling is not None:
                    resume_at = node.next_sibling
                else:
                    tail = node  # type: bs4.element.PageElement
                    while isinstance(tail, bs4.Tag) and tail.contents:
                        tail = tail.contents[-1]
                    resume_at = tail.next_element
                yield node
                if resume_at is None:
                    # Nothing follows the `iframe`, so iteration is complete.
                    break
                # Coverage isn't seeing this even though it's executed
                continue  # pragma: no cover

            yield node

    def get_parent(self, el: bs4.Tag | None, no_iframe: bool = False) -> bs4.Tag | None:
        """Return the parent of `el`, or `None` if there is none or `no_iframe` hides an `iframe` parent."""

        if el is None:
            return None

        parent = el.parent
        if no_iframe and parent is not None and self.is_iframe(parent):
            return None
        return parent

    @staticmethod
    def get_tag_name(el: bs4.Tag | None) -> str | None:
        """Return the element's name as stored, or `None` when `el` is `None`."""

        if el is None:
            return None
        return el.name

    @staticmethod
    def get_prefix_name(el: bs4.Tag) -> str | None:
        """Return the element's prefix as stored."""

        return el.prefix

    @staticmethod
    def get_uri(el: bs4.Tag | None) -> str | None:
        """Return the element's namespace `URI`, or `None` when `el` is `None`."""

        if el is None:
            return None
        return el.namespace

    @classmethod
    def get_next_tag(cls, el: bs4.Tag) -> bs4.Tag | None:
        """Return the next sibling that is a tag, if any."""

        return cls.get_next(el, tags=True)  # type: ignore[return-value]

    @classmethod
    def get_next(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None:
        """Return the next sibling; with `tags` set, skip ahead to the next sibling that is a tag."""

        sibling = el.next_sibling
        if not tags:
            return sibling

        while sibling is not None and not isinstance(sibling, bs4.Tag):
            sibling = sibling.next_sibling
        return sibling

    @classmethod
    def get_previous_tag(cls, el: bs4.Tag, tags: bool = True) -> bs4.Tag | None:
        """
        Return the previous sibling that is a tag, if any.

        The `tags` parameter is accepted but not consulted; only tags are ever returned.
        """

        return cls.get_previous(el, True)  # type: ignore[return-value]

    @classmethod
    def get_previous(cls, el: bs4.Tag, tags: bool = False) -> bs4.element.PageElement | None:
        """Return the previous sibling; with `tags` set, skip back to the previous sibling that is a tag."""

        sibling = el.previous_sibling
        if not tags:
            return sibling

        while sibling is not None and not isinstance(sibling, bs4.Tag):
            sibling = sibling.previous_sibling
        return sibling

    @staticmethod
    def has_html_ns(el: bs4.Tag | None) -> bool:
        """
        Return whether the element's own namespace is the XHTML namespace.

        This differs from `is_html_tag`, which answers whether an element
        is *treated* as being in the HTML namespace.
        """

        if el is None:
            return False

        ns = el.namespace
        return bool(ns and ns == NS_XHTML)

    @staticmethod
    def split_namespace(el: bs4.Tag | None, attr_name: str) -> tuple[str | None, str | None]:
        """
        Return the namespace and the prefix-less name of an attribute.

        Both are read from the `namespace` and `name` attributes of the
        `attr_name` object itself, and are `None` when it does not carry them.
        """

        if el is None:  # pragma: no cover
            return None, None

        namespace = getattr(attr_name, 'namespace', None)
        local_name = getattr(attr_name, 'name', None)
        return namespace, local_name

    @classmethod
    def normalize_value(cls, value: Any) -> str | Sequence[str]:
        """
        Normalize an attribute value to a string or a list of strings.

        `None` becomes an empty string, byte strings are decoded as UTF-8,
        and sequences are normalized element by element. Anything else is
        passed through `str`.
        """

        if value is None:
            return ''

        if isinstance(value, str):
            return value

        if isinstance(value, bytes):
            return value.decode("utf8")

        if isinstance(value, Sequence):
            # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
            # A nested sequence is most certainly a user error, so it is simply converted with `str`
            # like any other object instead of being recursed into.
            return [
                str(item) if isinstance(item, Sequence) and not isinstance(item, (str, bytes))
                else cast(str, cls.normalize_value(item))
                for item in value
            ]

        return str(value)

    @classmethod
    def get_attribute_by_name(
        cls,
        el: bs4.Tag,
        name: str,
        default: str | Sequence[str] | None = None
    ) -> str | Sequence[str] | None:
        """
        Return the normalized value of attribute `name`, or `default` if it is absent.

        XML trees use an exact key lookup. Otherwise each attribute key is
        lowercased before being compared with `name`.
        """

        if el._is_xml:
            try:
                return cls.normalize_value(el.attrs[name])
            except KeyError:
                return default

        for key, raw in el.attrs.items():
            if util.lower(key) == name:
                return cls.normalize_value(raw)
        return default

    @classmethod
    def iter_attributes(cls, el: bs4.Tag | None) -> Iterator[tuple[str, str | Sequence[str] | None]]:
        """Yield `(name, normalized value)` pairs for each attribute of `el`."""

        if el is None:
            return

        for key, raw in el.attrs.items():
            yield key, cls.normalize_value(raw)

    @classmethod
    def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
        """Return the element's classes, splitting a plain string value on whitespace."""

        found = cls.get_attribute_by_name(el, 'class', [])
        return cast(Sequence[str], RE_NOT_WS.findall(found) if isinstance(found, str) else found)

    def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
        """Return the concatenated content strings of all descendants of `el`."""

        pieces = [
            node for node in self.get_descendants(el, no_iframe=no_iframe)  # type: ignore[misc]
            if self.is_content_string(node)
        ]
        return ''.join(pieces)

    def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
        """Return the content strings that are direct children of `el`."""

        return [
            text for text in self.get_contents(el, no_iframe=no_iframe)  # type: ignore[misc]
            if self.is_content_string(text)
        ]
430
431
class Inputs:
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year: int, month: int, day: int) -> bool:
        """
        Validate day.

        Return whether `day` exists in `month` of `year`, accounting for leap years in February.
        """

        max_days = LONG_MONTH
        if month == FEB:
            is_leap = ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0)
            max_days = FEB_LEAP_MONTH if is_leap else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year: int, week: int) -> bool:
        """
        Validate week.

        Return whether `week` is a valid ISO week number for `year`, which has either 52 or 53 weeks.
        """

        # December 28th always falls in the last ISO week of its own year, so its week number is
        # the number of weeks in the year. (December 31st may already belong to week 1 of the
        # following year, which says nothing about whether this year has 53 weeks.)
        # The Gregorian calendar repeats exactly every 400 years (a whole number of weeks), so the
        # year is folded into a range `datetime` can represent; this keeps years above 9999 working.
        max_week = datetime(year % 400 + 2000, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month: int) -> bool:
        """Validate month (1 through 12)."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year: int) -> bool:
        """Validate year (1 or greater)."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour: int) -> bool:
        """Validate hour (0 through 23)."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes: int) -> bool:
        """Validate minutes (0 through 59)."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
        """
        Parse the input value.

        `itype` selects the expected format: `date`, `month`, `week`, `time`,
        `datetime-local`, `number`, or `range`. The result is a tuple of the
        numeric components in most-significant-first order (so tuples of the
        same type compare chronologically or numerically). `None` is returned
        when `value` is `None`, the type is not recognized, the value does not
        match the format, or any component fails validation.
        """

        parsed = None  # type: tuple[float, ...] | None
        if value is None:
            return value
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = (float(m.group('value')),)
        return parsed
533
534
535class CSSMatch(_DocumentNav):
536 """Perform CSS matching."""
537
538 def __init__(
539 self,
540 selectors: ct.SelectorList,
541 scope: bs4.Tag | None,
542 namespaces: ct.Namespaces | None,
543 flags: int
544 ) -> None:
545 """Initialize."""
546
547 self.assert_valid_input(scope)
548 self.tag = scope
549 self.cached_meta_lang = [] # type: list[tuple[str, str]]
550 self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]]
551 self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]]
552 self.selectors = selectors
553 self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str]
554 self.flags = flags
555 self.iframe_restrict = False
556
557 # Find the root element for the whole tree
558 doc = scope
559 parent = self.get_parent(doc)
560 while parent:
561 doc = parent
562 parent = self.get_parent(doc)
563 root = None # type: bs4.Tag | None
564 if not self.is_doc(doc):
565 root = doc
566 else:
567 for child in self.get_tag_children(doc):
568 root = child
569 break
570
571 self.root = root
572 self.scope = scope if scope is not doc else root
573 self.has_html_namespace = self.has_html_ns(root)
574
575 # A document can be both XML and HTML (XHTML)
576 self.is_xml = self.is_xml_tree(doc)
577 self.is_html = not self.is_xml or self.has_html_namespace
578
579 def supports_namespaces(self) -> bool:
580 """Check if namespaces are supported in the HTML type."""
581
582 return self.is_xml or self.has_html_namespace
583
584 def get_tag_ns(self, el: bs4.Tag | None) -> str:
585 """Get tag namespace."""
586
587 namespace = ''
588 if el is None: # pragma: no cover
589 return namespace
590
591 if self.supports_namespaces():
592 ns = self.get_uri(el)
593 if ns:
594 namespace = ns
595 else:
596 namespace = NS_XHTML
597 return namespace
598
599 def is_html_tag(self, el: bs4.Tag | None) -> bool:
600 """Check if tag is in HTML namespace."""
601
602 return self.get_tag_ns(el) == NS_XHTML
603
604 def get_tag(self, el: bs4.Tag | None) -> str | None:
605 """Get tag."""
606
607 name = self.get_tag_name(el)
608 return util.lower(name) if name is not None and not self.is_xml else name
609
610 def get_prefix(self, el: bs4.Tag) -> str | None:
611 """Get prefix."""
612
613 prefix = self.get_prefix_name(el)
614 return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
615
616 def find_bidi(self, el: bs4.Tag) -> int | None:
617 """Get directionality from element text."""
618
619 for node in self.get_children(el):
620
621 # Analyze child text nodes
622 if self.is_tag(node):
623
624 # Avoid analyzing certain elements specified in the specification.
625 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) # type: ignore[arg-type]
626 name = self.get_tag(node) # type: ignore[arg-type]
627 if (
628 (name and name in ('bdi', 'script', 'style', 'textarea', 'iframe')) or
629 not self.is_html_tag(node) or # type: ignore[arg-type]
630 direction is not None
631 ):
632 continue # pragma: no cover
633
634 # Check directionality of this node's text
635 value = self.find_bidi(node) # type: ignore[arg-type]
636 if value is not None:
637 return value
638
639 # Direction could not be determined
640 continue # pragma: no cover
641
642 # Skip `doctype` comments, etc.
643 if self.is_special_string(node):
644 continue
645
646 # Analyze text nodes for directionality.
647 for c in node: # type: ignore[attr-defined]
648 bidi = unicodedata.bidirectional(c)
649 if bidi in ('AL', 'R', 'L'):
650 return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
651 return None
652
653 def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:
654 """Filter the language tags."""
655
656 match = True
657 lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
658 ranges = lang_range.split('-')
659 subtags = lang_tag.lower().split('-')
660 length = len(ranges)
661 slength = len(subtags)
662 rindex = 0
663 sindex = 0
664 r = ranges[rindex]
665 s = subtags[sindex]
666
667 # Empty specified language should match unspecified language attributes
668 if length == 1 and slength == 1 and not r and r == s:
669 return True
670
671 # Primary tag needs to match
672 if (r != '*' and r != s) or (r == '*' and slength == 1 and not s):
673 match = False
674
675 rindex += 1
676 sindex += 1
677
678 # Match until we run out of ranges
679 while match and rindex < length:
680 r = ranges[rindex]
681 try:
682 s = subtags[sindex]
683 except IndexError:
684 # Ran out of subtags,
685 # but we still have ranges
686 match = False
687 continue
688
689 # Empty range
690 if not r:
691 match = False
692 continue
693
694 # Matched range
695 elif s == r:
696 rindex += 1
697
698 # Implicit wildcard cannot match
699 # singletons
700 elif len(s) == 1:
701 match = False
702 continue
703
704 # Implicitly matched, so grab next subtag
705 sindex += 1
706
707 return match
708
709 def match_attribute_name(
710 self,
711 el: bs4.Tag,
712 attr: str,
713 prefix: str | None
714 ) -> str | Sequence[str] | None:
715 """Match attribute name and return value if it exists."""
716
717 value = None
718 if self.supports_namespaces():
719 value = None
720 # If we have not defined namespaces, we can't very well find them, so don't bother trying.
721 if prefix:
722 ns = self.namespaces.get(prefix)
723 if ns is None and prefix != '*':
724 return None
725 else:
726 ns = None
727
728 for k, v in self.iter_attributes(el):
729
730 # Get attribute parts
731 namespace, name = self.split_namespace(el, k)
732
733 # Can't match a prefix attribute as we haven't specified one to match
734 # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
735 if ns is None:
736 if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
737 value = v
738 break
739 # Coverage is not finding this even though it is executed.
740 # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
741 # Ignore the false positive message.
742 continue # pragma: no cover
743
744 # We can't match our desired prefix attribute as the attribute doesn't have a prefix
745 if namespace is None or (ns != namespace and prefix != '*'):
746 continue
747
748 # The attribute doesn't match.
749 if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
750 continue
751
752 value = v
753 break
754 else:
755 for k, v in self.iter_attributes(el):
756 if util.lower(attr) != util.lower(k):
757 continue
758 value = v
759 break
760 return value
761
762 def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
763 """Match the namespace of the element."""
764
765 match = True
766 namespace = self.get_tag_ns(el)
767 default_namespace = self.namespaces.get('')
768 tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix)
769 # We must match the default namespace if one is not provided
770 if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
771 match = False
772 # If we specified `|tag`, we must not have a namespace.
773 elif (tag.prefix is not None and tag.prefix == '' and namespace):
774 match = False
775 # Verify prefix matches
776 elif (
777 tag.prefix and
778 tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
779 ):
780 match = False
781 return match
782
783 def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
784 """Match attributes."""
785
786 match = True
787 if attributes:
788 for a in attributes:
789 temp = self.match_attribute_name(el, a.attribute, a.prefix)
790 pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
791 if temp is None:
792 match = False
793 break
794 value = temp if isinstance(temp, str) else ' '.join(temp)
795 if pattern is None:
796 continue
797 elif pattern.match(value) is None:
798 match = False
799 break
800 return match
801
802 def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
803 """Match tag name."""
804
805 name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
806 return not (
807 name is not None and
808 name not in (self.get_tag(el), '*')
809 )
810
811 def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
812 """Match the tag."""
813
814 match = True
815 if tag is not None:
816 # Verify namespace
817 if not self.match_namespace(el, tag):
818 match = False
819 if not self.match_tagname(el, tag):
820 match = False
821 return match
822
823 def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
824 """Match past relationship."""
825
826 found = False
827 # I don't think this can ever happen, but it makes `mypy` happy
828 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
829 return found
830
831 if relation[0].rel_type == REL_PARENT:
832 parent = self.get_parent(el, no_iframe=self.iframe_restrict)
833 while not found and parent:
834 found = self.match_selectors(parent, relation)
835 parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
836 elif relation[0].rel_type == REL_CLOSE_PARENT:
837 parent = self.get_parent(el, no_iframe=self.iframe_restrict)
838 if parent:
839 found = self.match_selectors(parent, relation)
840 elif relation[0].rel_type == REL_SIBLING:
841 sibling = self.get_previous_tag(el)
842 while not found and sibling:
843 found = self.match_selectors(sibling, relation)
844 sibling = self.get_previous_tag(sibling)
845 elif relation[0].rel_type == REL_CLOSE_SIBLING:
846 sibling = self.get_previous_tag(el)
847 if sibling and self.is_tag(sibling):
848 found = self.match_selectors(sibling, relation)
849 return found
850
851 def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
852 """Match future child."""
853
854 match = False
855 if recursive:
856 children = self.get_tag_descendants # type: Callable[..., Iterator[bs4.Tag]]
857 else:
858 children = self.get_tag_children
859 for child in children(parent, no_iframe=self.iframe_restrict):
860 match = self.match_selectors(child, relation)
861 if match:
862 break
863 return match
864
865 def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
866 """Match future relationship."""
867
868 found = False
869 # I don't think this can ever happen, but it makes `mypy` happy
870 if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
871 return found
872
873 if relation[0].rel_type == REL_HAS_PARENT:
874 found = self.match_future_child(el, relation, True)
875 elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
876 found = self.match_future_child(el, relation)
877 elif relation[0].rel_type == REL_HAS_SIBLING:
878 sibling = self.get_next_tag(el)
879 while not found and sibling:
880 found = self.match_selectors(sibling, relation)
881 sibling = self.get_next_tag(sibling)
882 elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
883 sibling = self.get_next_tag(el)
884 if sibling and self.is_tag(sibling):
885 found = self.match_selectors(sibling, relation)
886 return found
887
888 def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
889 """Match relationship to other elements."""
890
891 found = False
892
893 if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None:
894 return found
895
896 if relation[0].rel_type.startswith(':'):
897 found = self.match_future_relations(el, relation)
898 else:
899 found = self.match_past_relations(el, relation)
900
901 return found
902
903 def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
904 """Match element's ID."""
905
906 found = True
907 for i in ids:
908 if i != self.get_attribute_by_name(el, 'id', ''):
909 found = False
910 break
911 return found
912
913 def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
914 """Match element's classes."""
915
916 current_classes = self.get_classes(el)
917 found = True
918 for c in classes:
919 if c not in current_classes:
920 found = False
921 break
922 return found
923
924 def match_root(self, el: bs4.Tag) -> bool:
925 """Match element as root."""
926
927 is_root = self.is_root(el)
928 if is_root:
929 sibling = self.get_previous(el) # type: Any
930 while is_root and sibling is not None:
931 if (
932 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
933 self.is_cdata(sibling)
934 ):
935 is_root = False
936 else:
937 sibling = self.get_previous(sibling)
938 if is_root:
939 sibling = self.get_next(el)
940 while is_root and sibling is not None:
941 if (
942 self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
943 self.is_cdata(sibling)
944 ):
945 is_root = False
946 else:
947 sibling = self.get_next(sibling)
948 return is_root
949
950 def match_scope(self, el: bs4.Tag) -> bool:
951 """Match element as scope."""
952
953 return self.scope is el
954
955 def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
956 """Match tag type for `nth` matches."""
957
958 return (
959 (self.get_tag(child) == self.get_tag(el)) and
960 (self.get_tag_ns(child) == self.get_tag_ns(el))
961 )
962
    def match_nth(self, el: bs4.Tag, nth: tuple[ct.SelectorNth, ...]) -> bool:
        """
        Match `nth` elements.

        Every entry in `nth` must match for the element to match; an empty
        `nth` always matches. For each entry, the element's siblings are walked
        (from the end when `n.last` is set) while counting the tags that
        qualify, and the element matches when its position among them equals
        one of the indexes generated from `a * count + b`.

        NOTE(review): the meaning of the `a`, `b`, `n`, `last`, `of_type`, and
        `selectors` fields comes from `ct.SelectorNth`, which is defined
        outside this file - confirm against that definition.
        """

        matched = True

        for n in nth:
            # Each entry must prove itself; assume failure until a match is found.
            matched = False
            # With `of S`, the element itself has to match `S` to be considered at all.
            if n.selectors and not self.match_selectors(el, n.selectors):
                break
            parent = self.get_parent(el)  # type: bs4.Tag | None
            if parent is None:
                # A parentless element is wrapped in a fake parent so it can be walked as a child.
                parent = cast('bs4.Tag', self.create_fake_parent(el))
            last = n.last
            last_index = len(parent) - 1
            # Position in the parent's contents where the walk starts (the end when counting from last).
            index = last_index if last else 0
            # Number of qualifying tags seen so far.
            relative_index = 0
            a = n.a
            b = n.b
            var = n.n
            count = 0
            count_incr = 1
            # Walk direction through the parent's contents.
            factor = -1 if last else 1
            # The target index is `a * count + b` when variable, otherwise the constant `a`.
            idx = last_idx = a * count + b if var else a

            # We can only adjust bounds within a variable index
            if var:
                # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
                # Otherwise, increment to try to get in bounds.
                adjust = None
                while idx < 1 or idx > last_index:
                    if idx < 0:
                        diff_low = 0 - idx
                        # We were previously above the range and have now jumped below it.
                        if adjust is not None and adjust == 1:
                            break
                        adjust = -1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = 0 - idx
                        # Not getting any closer to the range.
                        if diff >= diff_low:
                            break
                    else:
                        diff_high = idx - last_index
                        # We were previously below the range and have now jumped above it.
                        if adjust is not None and adjust == -1:
                            break
                        adjust = 1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = idx - last_index
                        # Not getting any closer to the range.
                        if diff >= diff_high:
                            break
                        diff_high = diff

                # If a < 0, our count is working backwards, so floor the index by increasing the count.
                # Find the count that yields the lowest, in bound value and use that.
                # Lastly reverse count increment so that we'll increase our index.
                lowest = count
                if a < 0:
                    while idx >= 1:
                        lowest = count
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                    count_incr = -1
                count = lowest
                idx = last_idx = a * count + b if var else a

            # Evaluate elements while our calculated nth index is still in range
            while 1 <= idx <= last_index + 1:
                child = None  # type: bs4.element.PageElement | None
                # Evaluate while our child index is still in range.
                # The walk resumes from `index`, which is advanced manually for every child visited.
                for child in self.get_children(parent, start=index, reverse=factor < 0):
                    index += factor
                    if not isinstance(child, bs4.Tag):
                        continue
                    # Handle `of S` in `nth-child`
                    if n.selectors and not self.match_selectors(child, n.selectors):
                        continue
                    # Handle `of-type`
                    if n.of_type and not self.match_nth_tag_type(el, child):
                        continue
                    relative_index += 1
                    if relative_index == idx:
                        if child is el:
                            matched = True
                        else:
                            # Reached the target position but it is a different element; try the next index.
                            break
                    # Once the element itself has been reached, later positions cannot match it.
                    if child is el:
                        break
                if child is el:
                    break
                last_idx = idx
                count += count_incr
                if count < 0:
                    # Count is counting down and has now ventured into invalid territory.
                    break
                idx = a * count + b if var else a
                # The index did not change, so there is nothing further to try.
                if last_idx == idx:
                    break
            if not matched:
                break
        return matched
1063
1064 def match_empty(self, el: bs4.Tag) -> bool:
1065 """Check if element is empty (if requested)."""
1066
1067 is_empty = True
1068 for child in self.get_children(el):
1069 if self.is_tag(child):
1070 is_empty = False
1071 break
1072 elif self.is_content_string(child) and RE_NOT_EMPTY.search(child): # type: ignore[call-overload]
1073 is_empty = False
1074 break
1075 return is_empty
1076
1077 def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
1078 """Match selectors."""
1079
1080 match = True
1081 for sel in selectors:
1082 if not self.match_selectors(el, sel):
1083 match = False
1084 return match
1085
1086 def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
1087 """Match element if it contains text."""
1088
1089 match = True
1090 content = None # type: str | Sequence[str] | None
1091 for contain_list in contains:
1092 if content is None:
1093 if contain_list.own:
1094 content = self.get_own_text(el, no_iframe=self.is_html)
1095 else:
1096 content = self.get_text(el, no_iframe=self.is_html)
1097 found = False
1098 for text in contain_list.text:
1099 if contain_list.own:
1100 for c in content:
1101 if text in c:
1102 found = True
1103 break
1104 if found:
1105 break
1106 else:
1107 if text in content:
1108 found = True
1109 break
1110 if not found:
1111 match = False
1112 return match
1113
1114 def match_default(self, el: bs4.Tag) -> bool:
1115 """Match default."""
1116
1117 match = False
1118
1119 # Find this input's form
1120 form = None # type: bs4.Tag | None
1121 parent = self.get_parent(el, no_iframe=True)
1122 while parent and form is None:
1123 if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
1124 form = parent
1125 else:
1126 parent = self.get_parent(parent, no_iframe=True)
1127
1128 if form is not None:
1129 # Look in form cache to see if we've already located its default button
1130 found_form = False
1131 for f, t in self.cached_default_forms:
1132 if f is form:
1133 found_form = True
1134 if t is el:
1135 match = True
1136 break
1137
1138 # We didn't have the form cached, so look for its default button
1139 if not found_form:
1140 for child in self.get_tag_descendants(form, no_iframe=True):
1141 name = self.get_tag(child)
1142 # Can't do nested forms (haven't figured out why we never hit this)
1143 if name == 'form': # pragma: no cover
1144 break
1145 if name in ('input', 'button'):
1146 v = self.get_attribute_by_name(child, 'type', '')
1147 if v and util.lower(v) == 'submit':
1148 self.cached_default_forms.append((form, child))
1149 if el is child:
1150 match = True
1151 break
1152 return match
1153
1154 def match_indeterminate(self, el: bs4.Tag) -> bool:
1155 """Match default."""
1156
1157 match = False
1158 name = cast(str, self.get_attribute_by_name(el, 'name'))
1159
1160 def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
1161 """Find this input's form."""
1162 form = None
1163 parent = self.get_parent(el, no_iframe=True)
1164 while form is None:
1165 if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
1166 form = parent
1167 break
1168 last_parent = parent
1169 parent = self.get_parent(parent, no_iframe=True)
1170 if parent is None:
1171 form = last_parent
1172 break
1173 return form
1174
1175 form = get_parent_form(el)
1176
1177 # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
1178 if form is not None:
1179 found_form = False
1180 for f, n, i in self.cached_indeterminate_forms:
1181 if f is form and n == name:
1182 found_form = True
1183 if i is True:
1184 match = True
1185 break
1186
1187 # We didn't have the form cached, so validate that the radio button is indeterminate
1188 if not found_form:
1189 checked = False
1190 for child in self.get_tag_descendants(form, no_iframe=True):
1191 if child is el:
1192 continue
1193 tag_name = self.get_tag(child)
1194 if tag_name == 'input':
1195 is_radio = False
1196 check = False
1197 has_name = False
1198 for k, v in self.iter_attributes(child):
1199 if util.lower(k) == 'type' and util.lower(v) == 'radio':
1200 is_radio = True
1201 elif util.lower(k) == 'name' and v == name:
1202 has_name = True
1203 elif util.lower(k) == 'checked':
1204 check = True
1205 if is_radio and check and has_name and get_parent_form(child) is form:
1206 checked = True
1207 break
1208 if checked:
1209 break
1210 if not checked:
1211 match = True
1212 self.cached_indeterminate_forms.append((form, name, match))
1213
1214 return match
1215
    def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
        """
        Match languages.

        The element's language is looked up on the element and then on its
        ancestors, using the `lang` attribute or, for namespaced elements that
        are not in the HTML namespace, the `lang` attribute in the XML namespace.
        When no language attribute is found and the document is not XML (or its
        root is an `html` element in the HTML namespace), the `content` of a
        `meta` tag with `http-equiv="content-language"` in the document `head`
        is used instead. The outcome of that `meta` search is stored per root in
        `cached_meta_lang`.

        Each entry in `langs` needs at least one pattern accepted by
        `extended_language_filter` for the element to match.
        """

        match = False
        has_ns = self.supports_namespaces()
        root = self.root
        has_html_namespace = self.has_html_namespace

        # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
        # NOTE(review): the loop condition tests truthiness, so an empty `lang` value does not stop
        # the walk and may be replaced by an ancestor's value - confirm this is intended.
        parent = el  # type: bs4.Tag | None
        found_lang = None
        last = None
        while not found_lang:
            has_html_ns = self.has_html_ns(parent)
            for k, v in self.iter_attributes(parent):
                attr_ns, attr = self.split_namespace(parent, k)
                if (
                    ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                    (
                        has_ns and not has_html_ns and attr_ns == NS_XML and
                        (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                    )
                ):
                    found_lang = v
                    break
            last = parent
            parent = self.get_parent(parent, no_iframe=self.is_html)

            # Ran out of ancestors: the last node visited becomes the root used for the
            # cache and `meta` lookups below, and the search restarts from it.
            if parent is None:
                root = last
                has_html_namespace = self.has_html_ns(root)
                parent = last
                break

        # Use cached meta language.
        # There is no `break`, so the last cache entry recorded for this root wins.
        if found_lang is None and self.cached_meta_lang:
            for cache in self.cached_meta_lang:
                if root is cache[0]:
                    found_lang = cache[1]

        # If we couldn't find a language, and the document is HTML, look to meta to determine language.
        if found_lang is None and (not self.is_xml or (has_html_namespace and root and root.name == 'html')):
            # Find head by descending from `parent` into `html` and then into `head`
            found = False
            for tag in ('html', 'head'):
                found = False
                for child in self.get_tag_children(parent, no_iframe=self.is_html):
                    if self.get_tag(child) == tag and self.is_html_tag(child):
                        found = True
                        parent = child
                        break
                if not found:  # pragma: no cover
                    break

            # Search meta tags
            if found and parent is not None:
                for child2 in parent:
                    # NOTE(review): the HTML check is applied to `parent` (the `head`), not to `child2` - confirm.
                    if isinstance(child2, bs4.Tag) and self.get_tag(child2) == 'meta' and self.is_html_tag(parent):
                        c_lang = False
                        content = None
                        for k, v in self.iter_attributes(child2):
                            if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                                c_lang = True
                            if util.lower(k) == 'content':
                                content = v
                            # Both attributes seen with a non-empty content: record and cache the language
                            if c_lang and content:
                                found_lang = content
                                self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
                                break
                    if found_lang is not None:
                        break
                # Nothing usable in the head: cache an empty language for this root.
                # NOTE(review): this call leaves `found_lang` as `None` (no comparison below), while a
                # later call reads `''` from the cache and does compare - confirm the difference is intended.
                if found_lang is None:
                    self.cached_meta_lang.append((cast(str, root), ''))

        # If we determined a language, compare.
        if found_lang is not None:
            for patterns in langs:
                match = False
                for pattern in patterns:
                    if self.extended_language_filter(pattern, cast(str, found_lang)):
                        match = True
                # Every entry in `langs` is required, so stop at the first one that fails
                if not match:
                    break

        return match
1301
1302 def match_dir(self, el: bs4.Tag | None, directionality: int) -> bool:
1303 """Check directionality."""
1304
1305 # If we have to match both left and right, we can't match either.
1306 if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
1307 return False
1308
1309 if el is None or not self.is_html_tag(el):
1310 return False
1311
1312 # Element has defined direction of left to right or right to left
1313 direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
1314 if direction not in (None, 0):
1315 return direction == directionality
1316
1317 # Element is the document element (the root) and no direction assigned, assume left to right.
1318 is_root = self.is_root(el)
1319 if is_root and direction is None:
1320 return ct.SEL_DIR_LTR == directionality
1321
1322 # If `input[type=telephone]` and no direction is assigned, assume left to right.
1323 name = self.get_tag(el)
1324 is_input = name == 'input'
1325 is_textarea = name == 'textarea'
1326 is_bdi = name == 'bdi'
1327 itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
1328 if is_input and itype == 'tel' and direction is None:
1329 return ct.SEL_DIR_LTR == directionality
1330
1331 # Auto handling for text inputs
1332 if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
1333 if is_textarea:
1334 value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node)) # type: ignore[misc]
1335 else:
1336 value = cast(str, self.get_attribute_by_name(el, 'value', ''))
1337 if value:
1338 for c in value:
1339 bidi = unicodedata.bidirectional(c)
1340 if bidi in ('AL', 'R', 'L'):
1341 direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
1342 return direction == directionality
1343 # Assume left to right
1344 return ct.SEL_DIR_LTR == directionality
1345 elif is_root:
1346 return ct.SEL_DIR_LTR == directionality
1347 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1348
1349 # Auto handling for `bdi` and other non text inputs.
1350 if (is_bdi and direction is None) or direction == 0:
1351 direction = self.find_bidi(el)
1352 if direction is not None:
1353 return direction == directionality
1354 elif is_root:
1355 return ct.SEL_DIR_LTR == directionality
1356 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1357
1358 # Match parents direction
1359 return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1360
1361 def match_range(self, el: bs4.Tag, condition: int) -> bool:
1362 """
1363 Match range.
1364
1365 Behavior is modeled after what we see in browsers. Browsers seem to evaluate
1366 if the value is out of range, and if not, it is in range. So a missing value
1367 will not evaluate out of range; therefore, value is in range. Personally, I
1368 feel like this should evaluate as neither in or out of range.
1369 """
1370
1371 out_of_range = False
1372
1373 itype = util.lower(self.get_attribute_by_name(el, 'type'))
1374 mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
1375 mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))
1376
1377 # There is no valid min or max, so we cannot evaluate a range
1378 if mn is None and mx is None:
1379 return False
1380
1381 value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))
1382 if value is not None:
1383 if itype in ("date", "datetime-local", "month", "week", "number", "range"):
1384 if mn is not None and value < mn:
1385 out_of_range = True
1386 if not out_of_range and mx is not None and value > mx:
1387 out_of_range = True
1388 elif itype == "time":
1389 if mn is not None and mx is not None and mn > mx:
1390 # Time is periodic, so this is a reversed/discontinuous range
1391 if value < mn and value > mx:
1392 out_of_range = True
1393 else:
1394 if mn is not None and value < mn:
1395 out_of_range = True
1396 if not out_of_range and mx is not None and value > mx:
1397 out_of_range = True
1398
1399 return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
1400
1401 def match_defined(self, el: bs4.Tag) -> bool:
1402 """
1403 Match defined.
1404
1405 `:defined` is related to custom elements in a browser.
1406
1407 - If the document is XML (not XHTML), all tags will match.
1408 - Tags that are not custom (don't have a hyphen) are marked defined.
1409 - If the tag has a prefix (without or without a namespace), it will not match.
1410
1411 This is of course requires the parser to provide us with the proper prefix and namespace info,
1412 if it doesn't, there is nothing we can do.
1413 """
1414
1415 name = self.get_tag(el)
1416 return (
1417 name is not None and (
1418 name.find('-') == -1 or
1419 name.find(':') != -1 or
1420 self.get_prefix(el) is not None
1421 )
1422 )
1423
1424 def match_placeholder_shown(self, el: bs4.Tag) -> bool:
1425 """
1426 Match placeholder shown according to HTML spec.
1427
1428 - text area should be checked if they have content. A single newline does not count as content.
1429
1430 """
1431
1432 match = False
1433 content = self.get_text(el)
1434 if content in ('', '\n'):
1435 match = True
1436
1437 return match
1438
1439 def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
1440 """Check if element matches one of the selectors."""
1441
1442 match = False
1443 is_not = selectors.is_not
1444 is_html = selectors.is_html
1445
1446 # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
1447 if is_html:
1448 namespaces = self.namespaces
1449 iframe_restrict = self.iframe_restrict
1450 self.namespaces = {'html': NS_XHTML}
1451 self.iframe_restrict = True
1452
1453 if not is_html or self.is_html:
1454 for selector in selectors:
1455 match = is_not
1456 # We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
1457 if isinstance(selector, ct.SelectorNull):
1458 continue
1459 # Verify tag matches
1460 if not self.match_tag(el, selector.tag):
1461 continue
1462 # Verify tag is defined
1463 if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
1464 continue
1465 # Verify element is root
1466 if selector.flags & ct.SEL_ROOT and not self.match_root(el):
1467 continue
1468 # Verify element is scope
1469 if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
1470 continue
1471 # Verify element has placeholder shown
1472 if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
1473 continue
1474 # Verify `nth` matches
1475 if not self.match_nth(el, selector.nth):
1476 continue
1477 if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
1478 continue
1479 # Verify id matches
1480 if selector.ids and not self.match_id(el, selector.ids):
1481 continue
1482 # Verify classes match
1483 if selector.classes and not self.match_classes(el, selector.classes):
1484 continue
1485 # Verify attribute(s) match
1486 if not self.match_attributes(el, selector.attributes):
1487 continue
1488 # Verify ranges
1489 if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
1490 continue
1491 # Verify language patterns
1492 if selector.lang and not self.match_lang(el, selector.lang):
1493 continue
1494 # Verify pseudo selector patterns
1495 if selector.selectors and not self.match_subselectors(el, selector.selectors):
1496 continue
1497 # Verify relationship selectors
1498 if selector.relation and not self.match_relations(el, selector.relation):
1499 continue
1500 # Validate that the current default selector match corresponds to the first submit button in the form
1501 if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
1502 continue
1503 # Validate that the unset radio button is among radio buttons with the same name in a form that are
1504 # also not set.
1505 if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
1506 continue
1507 # Validate element directionality
1508 if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
1509 continue
1510 # Validate that the tag contains the specified text.
1511 if selector.contains and not self.match_contains(el, selector.contains):
1512 continue
1513 match = not is_not
1514 break
1515
1516 # Restore actual namespaces being used for external selector lists
1517 if is_html:
1518 self.namespaces = namespaces
1519 self.iframe_restrict = iframe_restrict
1520
1521 return match
1522
1523 def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
1524 """Match all tags under the targeted tag."""
1525
1526 lim = None if limit < 1 else limit
1527
1528 for child in self.get_tag_descendants(self.tag):
1529 if self.match(child):
1530 yield child
1531 if lim is not None:
1532 lim -= 1
1533 if lim < 1:
1534 break
1535
1536 def closest(self) -> bs4.Tag | None:
1537 """Match closest ancestor."""
1538
1539 current = self.tag # type: bs4.Tag | None
1540 closest = None
1541 while closest is None and current is not None:
1542 if self.match(current):
1543 closest = current
1544 else:
1545 current = self.get_parent(current)
1546 return closest
1547
1548 def filter(self) -> list[bs4.Tag]: # noqa A001
1549 """Filter tag's children."""
1550
1551 return [
1552 tag for tag in self.get_contents(self.tag)
1553 if isinstance(tag, bs4.Tag) and self.match(tag)
1554 ]
1555
1556 def match(self, el: bs4.Tag) -> bool:
1557 """Match."""
1558
1559 return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
1560
1561
class SoupSieve(ct.Immutable):
    """
    Compiled Soup Sieve selector matching object.

    Holds the selector pattern text, its compiled selector list, and the
    namespaces, custom selectors, and flags it was compiled with. Each
    operation builds a `CSSMatch` from these and delegates to it.
    """

    pattern: str
    selectors: ct.SelectorList
    namespaces: ct.Namespaces | None
    custom: dict[str, str]
    flags: int

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(
        self,
        pattern: str,
        selectors: ct.SelectorList,
        namespaces: ct.Namespaces | None,
        custom: ct.CustomSelectors | None,
        flags: int
    ):
        """Initialize by handing every field to the immutable base class."""

        super().__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def _matcher(self, tag: bs4.Tag) -> CSSMatch:
        """Build a `CSSMatch` for `tag` from this object's selectors, namespaces, and flags."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags)

    def match(self, tag: bs4.Tag) -> bool:
        """Report whether `tag` matches the selector."""

        return self._matcher(tag).match(tag)

    def closest(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Return the closest match among `tag` and its ancestors, or `None`."""

        return self._matcher(tag).closest()

    def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]:  # noqa A001
        """
        Filter.

        `CSSMatch` can cache certain searches for tags of the same document,
        so if we are given a tag, all tags are from the same document,
        and we can take advantage of the optimization.

        Any other kind of iterable could have tags from different documents or detached tags,
        so for those, we use a new `CSSMatch` for each item in the iterable.
        """

        # A single tag: one matcher filters its contents
        if isinstance(iterable, bs4.Tag):
            return self._matcher(iterable).filter()

        # Arbitrary iterable: skip navigable strings and match every other item on its own
        kept = []
        for node in iterable:
            if CSSMatch.is_navigable_string(node):
                continue
            if self.match(node):
                kept.append(node)
        return kept

    def select_one(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Return the first tag selected under `tag`, or `None` when nothing is selected."""

        return next(iter(self.select(tag, limit=1)), None)

    def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
        """Return the selected tags as a list; `limit` is passed on to `iselect`."""

        return [*self.iselect(tag, limit)]

    def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
        """Iterate the selected tags lazily; `limit` is passed on to `CSSMatch.select`."""

        yield from self._matcher(tag).select(limit)

    def __repr__(self) -> str:  # pragma: no cover
        """Representation."""

        return (
            f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, "
            f"custom={self.custom!r}, flags={self.flags!r})"
        )

    __str__ = __repr__
1643
1644
# Register `SoupSieve` through the `css_types` pickle helper.
# NOTE(review): presumably this is what lets compiled selector objects be pickled - confirm in `css_types`.
ct.pickle_register(SoupSieve)