1"""
2cssselect.parser
3================
4
5Tokenizer, parser and parsed objects for CSS selectors.
6
7
8:copyright: (c) 2007-2012 Ian Bicking and contributors.
9See AUTHORS for more details.
10:license: BSD, see LICENSE for more details.
11
12"""
13
14from __future__ import annotations
15
16import operator
17import re
18import sys
19from typing import TYPE_CHECKING, Literal, Optional, Protocol, Union, cast, overload
20
21if TYPE_CHECKING:
22 from collections.abc import Iterable, Iterator, Sequence
23
24 # typing.Self requires Python 3.11
25 from typing_extensions import Self
26
27
def ascii_lower(string: str) -> str:
    """Return *string* with only the ASCII letters lower-cased.

    The text is round-tripped through UTF-8 so that ``bytes.lower()`` does
    the work: it only rewrites ASCII bytes, so non-ASCII characters come
    back unchanged.
    """
    encoded = string.encode("utf8")
    lowered = encoded.lower()
    return lowered.decode("utf8")
31
32
class SelectorError(Exception):
    """Base class shared by :class:`SelectorSyntaxError` and
    :class:`ExpressionError`.

    Catching ``SelectorError`` around a call to
    :meth:`~GenericTranslator.css_to_xpath` is enough to handle both
    exception types at once.

    """
41
42
class SelectorSyntaxError(SelectorError, SyntaxError):
    """Raised when the selector being parsed does not match the grammar."""
45
46
47#### Parsed objects
48
# Every node class that may appear in a parsed selector tree.  The members
# are spelled as strings because the classes are defined further down in
# this module.
Tree = Union[
    "Element",
    "Hash",
    "Class",
    "Function",
    "Pseudo",
    "Attrib",
    "Negation",
    "Relation",
    "Matching",
    "SpecificityAdjustment",
    "CombinedSelector",
]
# A pseudo-element is either a plain identifier (``str``) or, when it takes
# arguments, a ``FunctionalPseudoElement``.
PseudoElement = Union["FunctionalPseudoElement", str]
63
64
class Selector:
    """
    One parsed selector: a tree of parsed objects plus an optional
    pseudo-element.

    :meth:`~GenericTranslator.selector_to_xpath` accepts this object,
    but ignores :attr:`pseudo_element`. It is the user’s responsibility
    to account for pseudo-elements and reject selectors with unknown
    or unsupported pseudo-elements.

    """

    def __init__(self, tree: Tree, pseudo_element: PseudoElement | None = None) -> None:
        self.parsed_tree = tree
        # Plain identifier pseudo-elements are normalised to ASCII lower
        # case; ``None`` and functional pseudo-elements are stored as given.
        if not (
            pseudo_element is None
            or isinstance(pseudo_element, FunctionalPseudoElement)
        ):
            pseudo_element = ascii_lower(pseudo_element)
        #: A :class:`FunctionalPseudoElement`,
        #: or the identifier for the pseudo-element as a string,
        #: or ``None``.
        #:
        #: +-------------------------+----------------+--------------------------------+
        #: |                         | Selector       | Pseudo-element                 |
        #: +=========================+================+================================+
        #: | CSS3 syntax             | ``a::before``  | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | Older syntax            | ``a:before``   | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'``                   |
        #: | not in Selectors3       |                |                                |
        #: +-------------------------+----------------+--------------------------------+
        #: | Invalid pseudo-class    | ``li:marker``  | ``None``                       |
        #: +-------------------------+----------------+--------------------------------+
        #: | Functional              | ``a::foo(2)``  | ``FunctionalPseudoElement(…)`` |
        #: +-------------------------+----------------+--------------------------------+
        #:
        #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement
        self.pseudo_element = pseudo_element

    def __repr__(self) -> str:
        pseudo = self.pseudo_element
        if isinstance(pseudo, FunctionalPseudoElement):
            suffix = repr(pseudo)
        else:
            suffix = f"::{pseudo}" if pseudo else ""
        return f"{self.__class__.__name__}[{self.parsed_tree!r}{suffix}]"

    def canonical(self) -> str:
        """Return a CSS representation for this selector (a string)"""
        pseudo = self.pseudo_element
        if isinstance(pseudo, FunctionalPseudoElement):
            suffix = f"::{pseudo.canonical()}"
        else:
            suffix = f"::{pseudo}" if pseudo else ""
        text = f"{self.parsed_tree.canonical()}{suffix}"
        # Drop the leading universal selector(s) unless "*" is all there is.
        return text.lstrip("*") if len(text) > 1 else text

    def specificity(self) -> tuple[int, int, int]:
        """Return the specificity_ of this selector as a tuple of 3 integers.

        A pseudo-element, when present, adds one to the last component.

        .. _specificity: http://www.w3.org/TR/selectors/#specificity

        """
        ids, classes, elements = self.parsed_tree.specificity()
        if self.pseudo_element:
            elements += 1
        return ids, classes, elements
136
137
class Class:
    """
    Parsed form of ``selector.class_name``: a class test applied to the
    wrapped *selector*.
    """

    def __init__(self, selector: Tree, class_name: str) -> None:
        self.selector = selector
        self.class_name = class_name

    def __repr__(self) -> str:
        kind = self.__class__.__name__
        return f"{kind}[{self.selector!r}.{self.class_name}]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>.<class_name>``."""
        prefix = self.selector.canonical()
        return f"{prefix}.{self.class_name}"

    def specificity(self) -> tuple[int, int, int]:
        """Return the wrapped selector's specificity with the middle
        component increased by one."""
        ids, classes, elements = self.selector.specificity()
        return ids, classes + 1, elements
157
158
class FunctionalPseudoElement:
    """
    Parsed form of ``selector::name(arguments)``.

    .. attribute:: name

        The name (identifier) of the pseudo-element, as a string
        (stored ASCII-lower-cased).

    .. attribute:: arguments

        The arguments of the pseudo-element, as a list of tokens.

        **Note:** tokens are not part of the public API,
        and may change between cssselect versions.
        Use at your own risks.

    """

    def __init__(self, name: str, arguments: Sequence[Token]):
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self) -> str:
        values = [argument.value for argument in self.arguments]
        kind = self.__class__.__name__
        return f"{kind}[::{self.name}({values!r})]"

    def argument_types(self) -> list[str]:
        """Return the ``type`` of every argument token, in order."""
        kinds = []
        for argument in self.arguments:
            kinds.append(argument.type)
        return kinds

    def canonical(self) -> str:
        """Return the CSS text ``name(args)`` with the arguments' CSS forms
        concatenated without separators."""
        rendered = [argument.css() for argument in self.arguments]
        return f"{self.name}({''.join(rendered)})"
191
192
class Function:
    """
    Parsed form of ``selector:name(expr)``: a functional pseudo-class
    applied to the wrapped *selector*.  The name is stored
    ASCII-lower-cased; the arguments are kept as tokens.
    """

    def __init__(self, selector: Tree, name: str, arguments: Sequence[Token]) -> None:
        self.selector = selector
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self) -> str:
        values = [argument.value for argument in self.arguments]
        kind = self.__class__.__name__
        return f"{kind}[{self.selector!r}:{self.name}({values!r})]"

    def argument_types(self) -> list[str]:
        """Return the ``type`` of every argument token, in order."""
        kinds = []
        for argument in self.arguments:
            kinds.append(argument.type)
        return kinds

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:name(args)``."""
        rendered = "".join([argument.css() for argument in self.arguments])
        return f"{self.selector.canonical()}:{self.name}({rendered})"

    def specificity(self) -> tuple[int, int, int]:
        """Return the wrapped selector's specificity with the middle
        component increased by one."""
        ids, classes, elements = self.selector.specificity()
        return ids, classes + 1, elements
218
219
class Pseudo:
    """
    Parsed form of ``selector:ident``: a non-functional pseudo-class
    applied to the wrapped *selector*.  The identifier is stored
    ASCII-lower-cased.
    """

    def __init__(self, selector: Tree, ident: str) -> None:
        self.selector = selector
        self.ident = ascii_lower(ident)

    def __repr__(self) -> str:
        kind = self.__class__.__name__
        return f"{kind}[{self.selector!r}:{self.ident}]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:<ident>``."""
        prefix = self.selector.canonical()
        return f"{prefix}:{self.ident}"

    def specificity(self) -> tuple[int, int, int]:
        """Return the wrapped selector's specificity with the middle
        component increased by one."""
        ids, classes, elements = self.selector.specificity()
        return ids, classes + 1, elements
239
240
class Negation:
    """
    Parsed form of ``selector:not(subselector)``.
    """

    def __init__(self, selector: Tree, subselector: Tree) -> None:
        self.selector = selector
        self.subselector = subselector

    def __repr__(self) -> str:
        kind = self.__class__.__name__
        return f"{kind}[{self.selector!r}:not({self.subselector!r})]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:not(<subselector>)``.

        A leading ``*`` is stripped from the argument unless the argument
        is a single character long.
        """
        inner = self.subselector.canonical()
        inner = inner.lstrip("*") if len(inner) > 1 else inner
        return f"{self.selector.canonical()}:not({inner})"

    def specificity(self) -> tuple[int, int, int]:
        """Return the component-wise sum of the specificities of the
        wrapped selector and of the negated sub-selector."""
        outer_a, outer_b, outer_c = self.selector.specificity()
        inner_a, inner_b, inner_c = self.subselector.specificity()
        return outer_a + inner_a, outer_b + inner_b, outer_c + inner_c
263
264
class Relation:
    """
    Parsed form of ``selector:has(subselector)``.

    The combinator token that introduced the relative selector is kept in
    :attr:`combinator`.
    """

    def __init__(self, selector: Tree, combinator: Token, subselector: Selector):
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self) -> str:
        kind = self.__class__.__name__
        return f"{kind}[{self.selector!r}:has({self.subselector!r})]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:has(<subselector>)``.

        The sub-selector is first tried as a sequence (its first item is
        used); on ``TypeError`` it is treated as a single object.
        """
        try:
            inner = self.subselector[0].canonical()  # type: ignore[index]
        except TypeError:
            inner = self.subselector.canonical()
        inner = inner.lstrip("*") if len(inner) > 1 else inner
        return f"{self.selector.canonical()}:has({inner})"

    def specificity(self) -> tuple[int, int, int]:
        """Return the component-wise sum of the wrapped selector's and the
        sub-selector's specificities.

        The sub-selector is first tried as a sequence (its last item is
        used); on ``TypeError`` it is treated as a single object.
        """
        outer_a, outer_b, outer_c = self.selector.specificity()
        try:
            inner_a, inner_b, inner_c = self.subselector[-1].specificity()  # type: ignore[index]
        except TypeError:
            inner_a, inner_b, inner_c = self.subselector.specificity()
        return outer_a + inner_a, outer_b + inner_b, outer_c + inner_c
294
295
class Matching:
    """
    Represents selector:is(selector_list)

    :attr:`selector` is the compound the pseudo-class is attached to and
    :attr:`selector_list` holds the parsed arguments of ``:is()``.
    """

    def __init__(self, selector: Tree, selector_list: Iterable[Tree]):
        self.selector = selector
        self.selector_list = selector_list

    def __repr__(self) -> str:
        args_str = ", ".join(repr(s) for s in self.selector_list)
        return f"{self.__class__.__name__}[{self.selector!r}:is({args_str})]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:is(<arg>, <arg>, ...)``.

        A leading ``*`` is stripped from each argument, except when the
        argument is a lone ``*``: stripping that would leave an empty,
        invalid argument (``:is()``).  This matches the guard used by
        ``Negation.canonical``.
        """
        selector_arguments = []
        for s in self.selector_list:
            selarg = s.canonical()
            if len(selarg) > 1:
                selarg = selarg.lstrip("*")
            selector_arguments.append(selarg)
        args_str = ", ".join(selector_arguments)
        return f"{self.selector.canonical()}:is({args_str})"

    def specificity(self) -> tuple[int, int, int]:
        """Return the specificity of ``<selector>:is(...)``.

        The pseudo-class itself counts as its most specific argument, and
        that is added to the specificity of the selector it is attached
        to (the same summing that ``Negation`` and ``Relation`` perform).
        An empty argument list contributes ``(0, 0, 0)``.
        """
        a1, b1, c1 = self.selector.specificity()
        a2, b2, c2 = max(
            (s.specificity() for s in self.selector_list), default=(0, 0, 0)
        )
        return a1 + a2, b1 + b2, c1 + c2
319
320
class SpecificityAdjustment:
    """
    Represents selector:where(selector_list)
    Same as selector:is(selector_list), but the pseudo-class itself adds
    nothing to the specificity.
    """

    def __init__(self, selector: Tree, selector_list: list[Tree]):
        self.selector = selector
        self.selector_list = selector_list

    def __repr__(self) -> str:
        args_str = ", ".join(repr(s) for s in self.selector_list)
        return f"{self.__class__.__name__}[{self.selector!r}:where({args_str})]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>:where(<arg>, <arg>, ...)``.

        A leading ``*`` is stripped from each argument, except when the
        argument is a lone ``*``: stripping that would leave an empty,
        invalid argument (``:where()``).
        """
        selector_arguments = []
        for s in self.selector_list:
            selarg = s.canonical()
            if len(selarg) > 1:
                selarg = selarg.lstrip("*")
            selector_arguments.append(selarg)
        args_str = ", ".join(selector_arguments)
        return f"{self.selector.canonical()}:where({args_str})"

    def specificity(self) -> tuple[int, int, int]:
        """Return the specificity of ``<selector>:where(...)``.

        The ``:where()`` arguments count as zero, so the result is just
        the specificity of the selector the pseudo-class is attached to
        (``(0, 0, 0)`` for a bare ``:where(...)``).
        """
        a, b, c = self.selector.specificity()
        return a, b, c
345
346
class Attrib:
    """
    Parsed form of ``selector[namespace|attrib operator value]``.

    When :attr:`operator` is ``"exists"`` there is no value and
    :attr:`value` is ``None``; otherwise :attr:`value` is a token.
    """

    @overload
    def __init__(
        self,
        selector: Tree,
        namespace: str | None,
        attrib: str,
        operator: Literal["exists"],
        value: None,
    ) -> None: ...

    @overload
    def __init__(
        self,
        selector: Tree,
        namespace: str | None,
        attrib: str,
        operator: str,
        value: Token,
    ) -> None: ...

    def __init__(
        self,
        selector: Tree,
        namespace: str | None,
        attrib: str,
        operator: str,
        value: Token | None,
    ) -> None:
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def _qualified_name(self) -> str:
        # Attribute name, prefixed with "namespace|" when a namespace is set.
        if self.namespace:
            return f"{self.namespace}|{self.attrib}"
        return self.attrib

    def __repr__(self) -> str:
        name = self._qualified_name()
        if self.operator != "exists":
            assert self.value is not None
            return f"{self.__class__.__name__}[{self.selector!r}[{name} {self.operator} {self.value.value!r}]]"
        return f"{self.__class__.__name__}[{self.selector!r}[{name}]]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>[<name>]`` or
        ``<selector>[<name><operator><value>]``."""
        name = self._qualified_name()
        if self.operator == "exists":
            inner = name
        else:
            assert self.value is not None
            inner = f"{name}{self.operator}{self.value.css()}"
        return f"{self.selector.canonical()}[{inner}]"

    def specificity(self) -> tuple[int, int, int]:
        """Return the wrapped selector's specificity with the middle
        component increased by one."""
        ids, classes, elements = self.selector.specificity()
        return ids, classes + 1, elements
408
409
class Element:
    """
    Parsed form of ``namespace|element``.

    An ``element`` of `None` stands for the universal selector '*'.

    """

    def __init__(
        self, namespace: str | None = None, element: str | None = None
    ) -> None:
        self.namespace = namespace
        self.element = element

    def __repr__(self) -> str:
        kind = self.__class__.__name__
        return f"{kind}[{self.canonical()}]"

    def canonical(self) -> str:
        """Return the CSS text: the element name (or ``*``), prefixed with
        ``namespace|`` when a namespace is set."""
        name = self.element if self.element else "*"
        if not self.namespace:
            return name
        return f"{self.namespace}|{name}"

    def specificity(self) -> tuple[int, int, int]:
        """Return ``(0, 0, 1)`` for a named element and ``(0, 0, 0)`` for
        the universal selector."""
        return (0, 0, 1) if self.element else (0, 0, 0)
437
438
class Hash:
    """
    Parsed form of ``selector#id``: an ID test applied to the wrapped
    *selector*.
    """

    def __init__(self, selector: Tree, id: str) -> None:
        self.selector = selector
        self.id = id

    def __repr__(self) -> str:
        kind = self.__class__.__name__
        return f"{kind}[{self.selector!r}#{self.id}]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector>#<id>``."""
        prefix = self.selector.canonical()
        return f"{prefix}#{self.id}"

    def specificity(self) -> tuple[int, int, int]:
        """Return the wrapped selector's specificity with the first
        component increased by one."""
        ids, classes, elements = self.selector.specificity()
        return ids + 1, classes, elements
458
459
class CombinedSelector:
    """
    Two selectors joined by a combinator: ``selector <combinator>
    subselector``.  The combinator is stored as a string; a single space
    is shown as ``<followed>`` in the ``repr``.
    """

    def __init__(self, selector: Tree, combinator: str, subselector: Tree) -> None:
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self) -> str:
        shown = self.combinator
        if shown == " ":
            shown = "<followed>"
        kind = self.__class__.__name__
        return f"{kind}[{self.selector!r} {shown} {self.subselector!r}]"

    def canonical(self) -> str:
        """Return the CSS text ``<selector> <combinator> <subselector>``.

        A leading ``*`` is stripped from the right-hand side unless it is
        a single character long.
        """
        right = self.subselector.canonical()
        right = right.lstrip("*") if len(right) > 1 else right
        return f"{self.selector.canonical()} {self.combinator} {right}"

    def specificity(self) -> tuple[int, int, int]:
        """Return the component-wise sum of both sides' specificities."""
        left_a, left_b, left_c = self.selector.specificity()
        right_a, right_b, right_c = self.subselector.specificity()
        return left_a + right_a, left_b + right_b, left_c + right_c
483
484
485#### Parser
486
# Fast-path patterns used by parse().  Each one must match the WHOLE input
# (optionally surrounded by whitespace) and only accepts plain ASCII names.

# foo  -- a bare element name made of ASCII letters
_el_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$")

# foo#bar or #bar  -- group 1 is the (possibly empty) element, group 2 the id
_id_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$")

# foo.bar or .bar  -- group 1 is the (possibly empty) element, group 2 the
# class name, which must start with an ASCII letter
_class_re = re.compile(
    r"^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$"
)
497
498
def parse(css: str) -> list[Selector]:
    """Parse a CSS *group of selectors*.

    If you don't care about pseudo-elements or selector specificity,
    you can skip this and use :meth:`~GenericTranslator.css_to_xpath`.

    Inputs that are just ``tag``, ``tag#id``/``#id`` or
    ``tag.class``/``.class`` are recognised with a regular expression and
    never reach the tokenizer.

    :param css:
        A *group of selectors* as a string.
    :raises:
        :class:`SelectorSyntaxError` on invalid selectors.
    :returns:
        A list of parsed :class:`Selector` objects, one for each
        selector in the comma-separated group.

    """
    # Fast path: a bare element name.
    found = _el_re.match(css)
    if found:
        tag = found.group(1)
        return [Selector(Element(element=tag))]

    # Fast path: an optional element name followed by "#id".
    found = _id_re.match(css)
    if found is not None:
        owner = Element(element=found.group(1) or None)
        return [Selector(Hash(owner, found.group(2)))]

    # Fast path: an optional element name followed by ".class".
    found = _class_re.match(css)
    if found is not None:
        owner = Element(element=found.group(1) or None)
        return [Selector(Class(owner, found.group(2)))]

    # General case: tokenize and run the full parser.
    stream = TokenStream(tokenize(css))
    stream.source = css
    return list(parse_selector_group(stream))
530
531
532# except SelectorSyntaxError:
533# e = sys.exc_info()[1]
534# message = "%s at %s -> %r" % (
535# e, stream.used, stream.peek())
536# e.msg = message
537# e.args = tuple([message])
538# raise
539
540
def parse_selector_group(stream: TokenStream) -> Iterator[Selector]:
    """Yield one :class:`Selector` per comma-separated selector read from
    *stream*, stopping at the first token after a selector that is not a
    comma."""
    stream.skip_whitespace()
    while True:
        yield Selector(*parse_selector(stream))
        if stream.peek() != ("DELIM", ","):
            return
        # Consume the comma and any whitespace before the next selector.
        stream.next()
        stream.skip_whitespace()
550
551
def parse_selector(stream: TokenStream) -> tuple[Tree, PseudoElement | None]:
    """Parse one complex selector: simple selectors joined by combinators.

    Parsing stops at end of input or at a comma.

    :returns: ``(tree, pseudo_element)``; the pseudo-element is that of
        the last simple selector, or ``None``.
    :raises: :class:`SelectorSyntaxError` if a pseudo-element is followed
        by more selector content.
    """
    tree, pseudo = parse_simple_selector(stream)
    while True:
        stream.skip_whitespace()
        upcoming = stream.peek()
        if upcoming == ("EOF", None) or upcoming == ("DELIM", ","):
            return tree, pseudo
        if pseudo:
            raise SelectorSyntaxError(
                f"Got pseudo-element ::{pseudo} not at the end of a selector"
            )
        if upcoming.is_delim("+", ">", "~"):
            # An explicit combinator token.
            combinator = cast("str", stream.next().value)
            stream.skip_whitespace()
        else:
            # Nothing explicit: the previous simple selector ended on
            # whitespace, which is the descendant combinator.
            combinator = " "
        right_hand, pseudo = parse_simple_selector(stream)
        tree = CombinedSelector(tree, combinator, right_hand)
574
575
def parse_simple_selector(
    stream: TokenStream, inside_negation: bool = False
) -> tuple[Tree, PseudoElement | None]:
    """Parse one compound selector (element, ids, classes, attributes,
    pseudo-classes and an optional trailing pseudo-element) from *stream*.

    :param stream: The token stream to consume from.
    :param inside_negation:
        When true, an unconsumed ``)`` also ends the selector.  The flag is
        set for the argument of ``:not()`` and for the arguments parsed by
        :func:`parse_simple_selector_arguments`.
    :returns: ``(tree, pseudo_element)``.
    :raises: :class:`SelectorSyntaxError` on malformed input, including an
        empty selector and a pseudo-element followed by more content.
    """
    stream.skip_whitespace()
    # Remember how many tokens were consumed so far, so that an empty
    # selector can be detected at the end.
    selector_start = len(stream.used)
    peek = stream.peek()
    # Optional leading type selector: ident, "*", or a "namespace|" prefix
    # followed by ident or "*".
    if peek.type == "IDENT" or peek == ("DELIM", "*"):
        if peek.type == "IDENT":
            namespace = stream.next().value
        else:
            stream.next()
            namespace = None
        if stream.peek() == ("DELIM", "|"):
            stream.next()
            element = stream.next_ident_or_star()
        else:
            # No "|" followed: what was read was the element, not a namespace.
            element = namespace
            namespace = None
    else:
        element = namespace = None
    result: Tree = Element(namespace, element)
    pseudo_element: PseudoElement | None = None
    while 1:
        peek = stream.peek()
        # Whitespace, end of input, a comma or a combinator ends the
        # compound selector (and so does ")" when inside_negation is set).
        if (
            peek.type in ("S", "EOF")
            or peek.is_delim(",", "+", ">", "~")
            or (inside_negation and peek == ("DELIM", ")"))
        ):
            break
        # Anything following a pseudo-element is an error.
        if pseudo_element:
            raise SelectorSyntaxError(
                f"Got pseudo-element ::{pseudo_element} not at the end of a selector"
            )
        if peek.type == "HASH":
            result = Hash(result, cast("str", stream.next().value))
        elif peek == ("DELIM", "."):
            stream.next()
            result = Class(result, stream.next_ident())
        elif peek == ("DELIM", "|"):
            # "|ident": replaces the current result with an element that
            # has no namespace.
            stream.next()
            result = Element(None, stream.next_ident())
        elif peek == ("DELIM", "["):
            stream.next()
            result = parse_attrib(result, stream)
        elif peek == ("DELIM", ":"):
            stream.next()
            if stream.peek() == ("DELIM", ":"):
                # "::name" or "::name(args)" -- a pseudo-element.
                stream.next()
                pseudo_element = stream.next_ident()
                if stream.peek() == ("DELIM", "("):
                    stream.next()
                    pseudo_element = FunctionalPseudoElement(
                        pseudo_element, parse_arguments(stream)
                    )
                continue
            ident = stream.next_ident()
            if ident.lower() in ("first-line", "first-letter", "before", "after"):
                # Special case: CSS 2.1 pseudo-elements can have a single ':'
                # Any new pseudo-element must have two.
                pseudo_element = str(ident)
                continue
            if stream.peek() != ("DELIM", "("):
                # Non-functional pseudo-class.
                result = Pseudo(result, ident)
                # A bare ":scope" is only accepted as the first thing in a
                # selector: either the first tokens of the stream (allowing
                # one leading whitespace token) or right after a comma
                # (allowing one whitespace token in between).
                if repr(result) == "Pseudo[Element[*]:scope]" and not (
                    len(stream.used) == 2
                    or (len(stream.used) == 3 and stream.used[0].type == "S")
                    or (len(stream.used) >= 3 and stream.used[-3].is_delim(","))
                    or (
                        len(stream.used) >= 4
                        and stream.used[-3].type == "S"
                        and stream.used[-4].is_delim(",")
                    )
                ):
                    raise SelectorSyntaxError(
                        'Got immediate child pseudo-element ":scope" '
                        "not at the start of a selector"
                    )
                continue
            # Functional pseudo-class: consume "(" and dispatch on the name.
            stream.next()
            stream.skip_whitespace()
            if ident.lower() == "not":
                if inside_negation:
                    raise SelectorSyntaxError("Got nested :not()")
                argument, argument_pseudo_element = parse_simple_selector(
                    stream, inside_negation=True
                )
                next = stream.next()
                if argument_pseudo_element:
                    raise SelectorSyntaxError(
                        f"Got pseudo-element ::{argument_pseudo_element} inside :not() at {next.pos}"
                    )
                if next != ("DELIM", ")"):
                    raise SelectorSyntaxError(f"Expected ')', got {next}")
                result = Negation(result, argument)
            elif ident.lower() == "has":
                combinator, arguments = parse_relative_selector(stream)
                result = Relation(result, combinator, arguments)

            elif ident.lower() in ("matches", "is"):
                selectors = parse_simple_selector_arguments(stream)
                result = Matching(result, selectors)
            elif ident.lower() == "where":
                selectors = parse_simple_selector_arguments(stream)
                result = SpecificityAdjustment(result, selectors)
            else:
                # Any other functional pseudo-class keeps its raw argument
                # tokens.
                result = Function(result, ident, parse_arguments(stream))
        else:
            raise SelectorSyntaxError(f"Expected selector, got {peek}")
    # No token was consumed at all: there was no selector here.
    if len(stream.used) == selector_start:
        raise SelectorSyntaxError(f"Expected selector, got {stream.peek()}")
    return result, pseudo_element
688
689
def parse_arguments(stream: TokenStream) -> list[Token]:
    """Collect the argument tokens of a functional pseudo-class or
    pseudo-element, up to and including the closing ``)``.

    Accepted tokens are IDENT, STRING, NUMBER and the ``+``/``-``
    delimiters; whitespace between them is skipped.

    :returns: The argument tokens, without the closing parenthesis.
    :raises: :class:`SelectorSyntaxError` on any other token.
    """
    collected: list[Token] = []
    while True:  # noqa: RET503
        stream.skip_whitespace()
        token = stream.next()
        if token == ("DELIM", ")"):
            return collected
        is_sign = token == ("DELIM", "+") or token == ("DELIM", "-")
        if not (token.type in ("IDENT", "STRING", "NUMBER") or is_sign):
            raise SelectorSyntaxError(f"Expected an argument, got {token}")
        collected.append(token)
704
705
def parse_relative_selector(stream: TokenStream) -> tuple[Token, Selector]:
    """Parse the argument of ``:has()`` up to and including the closing
    ``)``.

    An optional leading delimiter (``+``, ``-``, ``>`` or ``~``) is taken
    as the combinator; without one, a space token is used.  The values of
    the following IDENT/STRING/NUMBER/``.``/``*`` tokens are concatenated
    and handed to :func:`parse`.

    :returns: ``(combinator_token, first_parsed_selector)``.
    :raises: :class:`SelectorSyntaxError` on any other token.
    """
    stream.skip_whitespace()
    pieces: list[str] = []
    token = stream.next()

    if token in [("DELIM", "+"), ("DELIM", "-"), ("DELIM", ">"), ("DELIM", "~")]:
        combinator = token
        stream.skip_whitespace()
        token = stream.next()
    else:
        combinator = Token("DELIM", " ", pos=0)

    while True:  # noqa: RET503
        if token == ("DELIM", ")"):
            parsed = parse("".join(pieces))
            return combinator, parsed[0]
        accepted = token.type in ("IDENT", "STRING", "NUMBER") or token in [
            ("DELIM", "."),
            ("DELIM", "*"),
        ]
        if not accepted:
            raise SelectorSyntaxError(f"Expected an argument, got {token}")
        pieces.append(cast("str", token.value))
        token = stream.next()
730
731
def parse_simple_selector_arguments(stream: TokenStream) -> list[Tree]:
    """Parse the comma-separated arguments of ``:is()`` / ``:matches()`` /
    ``:where()``, up to and including the closing ``)``.

    :param stream: Token stream positioned just after the opening ``(``.
    :returns: One parsed tree per argument, in source order.
    :raises: :class:`SelectorSyntaxError` if an argument carries a
        pseudo-element, or if an argument is followed by anything other
        than ``,`` or ``)`` (including the end of the input).
    """
    arguments = []
    while 1:
        result, pseudo_element = parse_simple_selector(stream, True)
        if pseudo_element:
            raise SelectorSyntaxError(
                f"Got pseudo-element ::{pseudo_element} inside function"
            )
        stream.skip_whitespace()
        next = stream.next()
        if next == ("DELIM", ","):
            # Only the comma itself is consumed here.  Consuming one more
            # token unconditionally would drop the first token of the next
            # argument whenever no whitespace follows the comma (e.g. the
            # "." in ":is(.a,.b)").
            stream.skip_whitespace()
            arguments.append(result)
        elif next == ("DELIM", ")"):
            arguments.append(result)
            break
        else:
            # Also reached at end of input: an unterminated argument list
            # is reported as a syntax error rather than reading past EOF.
            raise SelectorSyntaxError(f"Expected an argument, got {next}")
    return arguments
752
753
def parse_attrib(selector: Tree, stream: TokenStream) -> Attrib:
    """Parse an attribute selector; the opening ``[`` has already been
    consumed by the caller.

    :param selector: The tree the attribute test applies to.
    :param stream: The token stream to consume from.
    :returns: An :class:`Attrib` wrapping *selector*.
    :raises: :class:`SelectorSyntaxError` on malformed input.
    """
    stream.skip_whitespace()
    attrib = stream.next_ident_or_star()
    # "*" (returned as None) is only valid as a namespace, so it must be
    # followed by "|".
    if attrib is None and stream.peek() != ("DELIM", "|"):
        raise SelectorSyntaxError(f"Expected '|', got {stream.peek()}")
    namespace: str | None
    op: str | None
    if stream.peek() == ("DELIM", "|"):
        stream.next()
        if stream.peek() == ("DELIM", "="):
            # "name|=" -- the "|" was the start of the "|=" operator, not a
            # namespace separator.
            namespace = None
            stream.next()
            op = "|="
        else:
            # "namespace|name"
            namespace = attrib
            attrib = stream.next_ident()
            op = None
    else:
        namespace = op = None
    if op is None:
        stream.skip_whitespace()
        next = stream.next()
        if next == ("DELIM", "]"):
            # No operator: a plain existence test.
            return Attrib(selector, namespace, cast("str", attrib), "exists", None)
        if next == ("DELIM", "="):
            op = "="
        elif next.is_delim("^", "$", "*", "~", "|", "!") and (
            stream.peek() == ("DELIM", "=")
        ):
            # Two-character operator: one of the delimiters above plus "=".
            op = cast("str", next.value) + "="
            stream.next()
        else:
            raise SelectorSyntaxError(f"Operator expected, got {next}")
    stream.skip_whitespace()
    value = stream.next()
    if value.type not in ("IDENT", "STRING"):
        raise SelectorSyntaxError(f"Expected string or ident, got {value}")
    stream.skip_whitespace()
    next = stream.next()
    if next != ("DELIM", "]"):
        raise SelectorSyntaxError(f"Expected ']', got {next}")
    return Attrib(selector, namespace, cast("str", attrib), op, value)
796
797
def parse_series(tokens: Iterable[Token]) -> tuple[int, int]:
    """
    Parses the arguments for :nth-child() and friends.

    :param tokens: The argument tokens; any iterable, including a
        one-shot iterator.
    :raises: ``ValueError`` if a STRING token is present or the arguments
        do not form a valid ``an+b`` series.
    :returns: ``(a, b)``

    """
    # Materialise first: the tokens are walked twice below, and a one-shot
    # iterator would already be exhausted on the second pass.
    tokens = list(tokens)
    if any(token.type == "STRING" for token in tokens):
        raise ValueError("String tokens not allowed in series.")
    s = "".join(cast("str", token.value) for token in tokens).strip()
    if s == "odd":
        return 2, 1
    if s == "even":
        return 2, 0
    if s == "n":
        return 1, 0
    if "n" not in s:
        # Just b
        return 0, int(s)
    a, b = s.split("n", 1)
    a_as_int: int
    if not a:
        # "n+b" means "1n+b"
        a_as_int = 1
    elif a in {"-", "+"}:
        # "-n" / "+n" mean "-1n" / "+1n"
        a_as_int = int(a + "1")
    else:
        a_as_int = int(a)
    b_as_int = int(b) if b else 0
    return a_as_int, b_as_int
829
830
831#### Token objects
832
833
class Token(tuple[str, Optional[str]]):  # noqa: SLOT001
    """A ``(type, value)`` pair that also records its position in the
    source string as :attr:`pos`.

    Being a tuple, a token compares equal to a plain ``(type, value)``
    tuple; :attr:`pos` takes no part in comparisons.
    """

    pos: int

    @overload
    def __new__(
        cls,
        type_: Literal["IDENT", "HASH", "STRING", "S", "DELIM", "NUMBER"],
        value: str,
        pos: int,
    ) -> Self: ...

    @overload
    def __new__(cls, type_: Literal["EOF"], value: None, pos: int) -> Self: ...

    def __new__(cls, type_: str, value: str | None, pos: int) -> Self:
        token = tuple.__new__(cls, (type_, value))
        token.pos = pos
        return token

    @property
    def type(self) -> str:
        return self[0]

    @property
    def value(self) -> str | None:
        return self[1]

    def __repr__(self) -> str:
        return f"<{self.type} '{self.value}' at {self.pos}>"

    def is_delim(self, *values: str) -> bool:
        """Tell whether this is a DELIM token whose value is one of
        *values*."""
        if self.type != "DELIM":
            return False
        return self.value in values

    def css(self) -> str:
        """Return the token as CSS text: the ``repr`` of the value for
        STRING tokens, the value itself otherwise."""
        if self.type != "STRING":
            return cast("str", self.value)
        return repr(self.value)
871
872
class EOFToken(Token):
    """The end-of-input token: type ``"EOF"``, value ``None``."""

    def __new__(cls, pos: int) -> Self:
        return super().__new__(cls, "EOF", None, pos)

    def __repr__(self) -> str:
        # There is no value to show, only the type and position.
        return f"<{self.type} at {self.pos}>"
879
880
881#### Tokenizer
882
883
class TokenMacros:
    """Regular-expression fragments that ``_compile`` substitutes into
    ``%(name)s`` placeholders of the tokenizer patterns."""

    # Backslash, 1-6 hex digits, then one optional whitespace character
    # (or a CR LF pair).
    unicode_escape = r"\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?"
    # A unicode escape, or a backslash followed by any character that is
    # neither a newline nor a hex digit.
    escape = unicode_escape + r"|\\[^\n\r\f0-9a-f]"
    # Inside strings, a backslash followed by a newline is also allowed.
    string_escape = r"\\(?:\n|\r\n|\r|\f)|" + escape
    # Any character above octal 177 (i.e. outside ASCII).
    nonascii = r"[^\0-\177]"
    # A character allowed inside a name.
    nmchar = f"[_a-z0-9-]|{escape}|{nonascii}"
    # A character allowed at the start of a name (no digit, no hyphen).
    nmstart = f"[_a-z]|{escape}|{nonascii}"
891
892
class MatchFunc(Protocol):
    """Call signature of a compiled pattern's bound ``match`` method, as
    returned by ``_compile``."""

    def __call__(
        self, string: str, pos: int = ..., endpos: int = ...
    ) -> re.Match[str] | None: ...
897
898
def _compile(pattern: str) -> MatchFunc:
    """Expand the ``%(name)s`` placeholders in *pattern* with the
    :class:`TokenMacros` fragments, compile it case-insensitively and
    return the compiled pattern's ``match`` method."""
    expanded = pattern % vars(TokenMacros)
    compiled = re.compile(expanded, re.IGNORECASE)
    return compiled.match
901
902
# Token matchers.  Each is the bound ``match`` method of a compiled,
# case-insensitive pattern and is called as ``matcher(string, pos=...)``.
_match_whitespace = _compile(r"[ \t\r\n\f]+")
_match_number = _compile(r"[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)")
_match_hash = _compile("#(?:%(nmchar)s)+")
_match_ident = _compile("-?(?:%(nmstart)s)(?:%(nmchar)s)*")
# Matchers for the BODY of a string, keyed by the quote character that
# opened it.  They can match the empty string.
_match_string_by_quote = {
    "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"),
    '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'),
}

# Substitution helpers used to resolve escapes: a backslash followed by any
# single character, a unicode escape, and a backslash-newline pair.
_sub_simple_escape = re.compile(r"\\(.)").sub
_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.IGNORECASE).sub
_sub_newline_escape = re.compile(r"\\(?:\n|\r\n|\r|\f)").sub

# Same as r'\1', but faster on CPython
_replace_simple = operator.methodcaller("group", 1)
918
919
def _replace_unicode(match: re.Match[str]) -> str:
    """Turn a matched unicode escape into the character it denotes.

    Group 1 of *match* holds the hexadecimal code point.  Values above
    ``sys.maxunicode`` are replaced with U+FFFD.
    """
    value = int(match.group(1), 16)
    return chr(0xFFFD if value > sys.maxunicode else value)
925
926
def unescape_ident(value: str) -> str:
    """Resolve the escapes in an identifier: unicode escapes first, then
    every remaining backslash-character pair (replaced by the character)."""
    without_unicode_escapes = _sub_unicode_escape(_replace_unicode, value)
    return _sub_simple_escape(_replace_simple, without_unicode_escapes)
930
931
def tokenize(s: str) -> Iterator[Token]:
    """Split the selector string *s* into tokens.

    At each position the matchers are tried in a fixed order: whitespace,
    identifier, hash, quoted string, number, comment; any other character
    becomes a one-character DELIM token.  An :class:`EOFToken` is always
    yielded last.

    :raises: :class:`SelectorSyntaxError` for an unclosed or invalid
        string.
    """
    pos = 0
    len_s = len(s)
    while pos < len_s:
        # A run of whitespace becomes a single "S" token whose value is
        # one space.
        match = _match_whitespace(s, pos=pos)
        if match:
            yield Token("S", " ", pos)
            pos = match.end()
            continue

        match = _match_ident(s, pos=pos)
        if match:
            # Resolve unicode escapes first, then simple backslash escapes.
            value = _sub_simple_escape(
                _replace_simple, _sub_unicode_escape(_replace_unicode, match.group())
            )
            yield Token("IDENT", value, pos)
            pos = match.end()
            continue

        match = _match_hash(s, pos=pos)
        if match:
            # [1:] drops the leading "#" from the token value.
            value = _sub_simple_escape(
                _replace_simple,
                _sub_unicode_escape(_replace_unicode, match.group()[1:]),
            )
            yield Token("HASH", value, pos)
            pos = match.end()
            continue

        quote = s[pos]
        if quote in _match_string_by_quote:
            # Match the string body, starting just after the opening quote.
            match = _match_string_by_quote[quote](s, pos=pos + 1)
            assert match, "Should have found at least an empty match"
            end_pos = match.end()
            if end_pos == len_s:
                raise SelectorSyntaxError(f"Unclosed string at {pos}")
            # The body stopped on something other than the closing quote.
            if s[end_pos] != quote:
                raise SelectorSyntaxError(f"Invalid string at {pos}")
            # Escaped newlines are removed before the other escapes are
            # resolved.
            value = _sub_simple_escape(
                _replace_simple,
                _sub_unicode_escape(
                    _replace_unicode, _sub_newline_escape("", match.group())
                ),
            )
            yield Token("STRING", value, pos)
            pos = end_pos + 1
            continue

        match = _match_number(s, pos=pos)
        if match:
            value = match.group()
            yield Token("NUMBER", value, pos)
            pos = match.end()
            continue

        # Comments produce no token; an unterminated comment runs to the
        # end of the input.
        pos2 = pos + 2
        if s[pos:pos2] == "/*":
            pos = s.find("*/", pos2)
            if pos == -1:
                pos = len_s
            else:
                pos += 2
            continue

        # Anything else is a single-character delimiter.
        yield Token("DELIM", s[pos], pos)
        pos += 1

    assert pos == len_s
    yield EOFToken(pos)
1001
1002
class TokenStream:
    """Token iterator with one token of look-ahead.

    Every token handed out by :meth:`next` is also appended to
    :attr:`used`, so callers can tell how much has been consumed.
    """

    def __init__(self, tokens: Iterable[Token], source: str | None = None) -> None:
        self.used: list[Token] = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked: Token | None = None
        self._peeking = False
        self.next_token = self.tokens.__next__

    def next(self) -> Token:
        """Consume and return the next token, using the peeked token if
        there is one."""
        if not self._peeking:
            token = self.next_token()
        else:
            self._peeking = False
            assert self.peeked is not None
            token = self.peeked
        self.used.append(token)
        return token

    def peek(self) -> Token:
        """Return the next token without consuming it."""
        if not self._peeking:
            self.peeked = self.next_token()
            self._peeking = True
        upcoming = self.peeked
        assert upcoming is not None
        return upcoming

    def next_ident(self) -> str:
        """Consume the next token and return its value; it must be an
        IDENT, otherwise :class:`SelectorSyntaxError` is raised."""
        token = self.next()
        if token.type == "IDENT":
            return cast("str", token.value)
        raise SelectorSyntaxError(f"Expected ident, got {token}")

    def next_ident_or_star(self) -> str | None:
        """Consume the next token: return its value for an IDENT, ``None``
        for ``*``, and raise :class:`SelectorSyntaxError` for anything
        else."""
        token = self.next()
        if token.type == "IDENT":
            return token.value
        if token != ("DELIM", "*"):
            raise SelectorSyntaxError(f"Expected ident or '*', got {token}")
        return None

    def skip_whitespace(self) -> None:
        """Consume the next token if it is a whitespace ("S") token."""
        if self.peek().type == "S":
            self.next()