Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/cssselect/parser.py: 80%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3 cssselect.parser
4 ================
6 Tokenizer, parser and parsed objects for CSS selectors.
9 :copyright: (c) 2007-2012 Ian Bicking and contributors.
10 See AUTHORS for more details.
11 :license: BSD, see LICENSE for more details.
13"""
15import operator
16import re
17import sys
18import typing
19from typing import Iterable, Iterator, List, Optional, Sequence, Tuple, Union
def ascii_lower(string: str) -> str:
    """Return *string* with only its ASCII letters folded to lower case.

    The text is round-tripped through its UTF-8 encoding so that the
    lower-casing is done by ``bytes.lower``, which leaves every non-ASCII
    byte untouched.
    """
    encoded = string.encode("utf8")
    lowered = encoded.lower()
    return lowered.decode("utf8")
class SelectorError(Exception):
    """Base class shared by :class:`SelectorSyntaxError` and
    :class:`ExpressionError`.

    Wrapping a call to :meth:`~GenericTranslator.css_to_xpath` in
    ``except SelectorError:`` is therefore enough to handle both kinds
    of failure.

    """
class SelectorSyntaxError(SelectorError, SyntaxError):
    """Raised when the text being parsed does not follow the selector grammar."""
41#### Parsed objects
# Any node that may appear in a parsed selector tree.  The members are spelled
# as strings (forward references) because the classes are defined further down.
Tree = Union[
    "Element", "Hash", "Class", "Function", "Pseudo", "Attrib",
    "Negation", "Relation", "Matching", "SpecificityAdjustment",
    "CombinedSelector",
]

# A pseudo-element is either a functional pseudo-element object or a plain
# identifier string.
PseudoElement = Union["FunctionalPseudoElement", str]
class Selector:
    """
    One parsed selector: a tree of parsed objects plus an optional
    pseudo-element.

    :meth:`~GenericTranslator.selector_to_xpath` accepts this object,
    but ignores :attr:`pseudo_element`. Callers are responsible for
    handling pseudo-elements themselves and for rejecting selectors whose
    pseudo-element is unknown or unsupported.

    """

    def __init__(
        self, tree: Tree, pseudo_element: Optional[PseudoElement] = None
    ) -> None:
        self.parsed_tree = tree
        # Plain identifiers are normalised to ASCII lower case; ``None`` and
        # functional pseudo-elements are stored exactly as given.
        if not (
            pseudo_element is None
            or isinstance(pseudo_element, FunctionalPseudoElement)
        ):
            pseudo_element = ascii_lower(pseudo_element)
        #: A :class:`FunctionalPseudoElement`,
        #: or the identifier for the pseudo-element as a string,
        #: or ``None``.
        #:
        #: +-------------------------+----------------+--------------------------------+
        #: |                         | Selector       | Pseudo-element                 |
        #: +=========================+================+================================+
        #: | CSS3 syntax             | ``a::before``  | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | Older syntax            | ``a:before``   | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'``                   |
        #: | not in Selectors3       |                |                                |
        #: +-------------------------+----------------+--------------------------------+
        #: | Invalid pseudo-class    | ``li:marker``  | ``None``                       |
        #: +-------------------------+----------------+--------------------------------+
        #: | Functional              | ``a::foo(2)``  | ``FunctionalPseudoElement(…)`` |
        #: +-------------------------+----------------+--------------------------------+
        #:
        #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement
        self.pseudo_element = pseudo_element

    def __repr__(self) -> str:
        pseudo = self.pseudo_element
        if isinstance(pseudo, FunctionalPseudoElement):
            suffix = repr(pseudo)
        elif pseudo:
            suffix = f"::{pseudo}"
        else:
            suffix = ""
        return f"{self.__class__.__name__}[{self.parsed_tree!r}{suffix}]"

    def canonical(self) -> str:
        """Return a CSS representation for this selector (a string)"""
        pseudo = self.pseudo_element
        if isinstance(pseudo, FunctionalPseudoElement):
            suffix = "::" + pseudo.canonical()
        elif pseudo:
            suffix = f"::{pseudo}"
        else:
            suffix = ""
        css = self.parsed_tree.canonical() + suffix
        # Drop the leading universal selector unless it is the whole text.
        return css.lstrip("*") if len(css) > 1 else css

    def specificity(self) -> Tuple[int, int, int]:
        """Return the specificity_ of this selector as a tuple of 3 integers.

        .. _specificity: http://www.w3.org/TR/selectors/#specificity

        """
        a, b, c = self.parsed_tree.specificity()
        # A pseudo-element adds one to the last component.
        extra = 1 if self.pseudo_element else 0
        return a, b, c + extra
class Class:
    """
    Represents selector.class_name
    """

    def __init__(self, selector: Tree, class_name: str) -> None:
        self.selector = selector
        self.class_name = class_name

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}[{self.selector!r}.{self.class_name}]"

    def canonical(self) -> str:
        """Return the CSS text: the inner selector followed by ``.class_name``."""
        return f"{self.selector.canonical()}.{self.class_name}"

    def specificity(self) -> Tuple[int, int, int]:
        """Return the inner selector's specificity with the middle component plus one."""
        a, b, c = self.selector.specificity()
        return a, b + 1, c
class FunctionalPseudoElement:
    """
    Represents selector::name(arguments)

    .. attribute:: name

        The name (identifier) of the pseudo-element, as a string.

    .. attribute:: arguments

        The arguments of the pseudo-element, as a list of tokens.

        **Note:** tokens are not part of the public API,
        and may change between cssselect versions.
        Use at your own risks.

    """

    def __init__(self, name: str, arguments: Sequence["Token"]):
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self) -> str:
        values = [token.value for token in self.arguments]
        return f"{self.__class__.__name__}[::{self.name}({values!r})]"

    def argument_types(self) -> List[str]:
        """Return the ``type`` of every argument token, in order."""
        return [token.type for token in self.arguments]

    def canonical(self) -> str:
        """Return ``name(args)`` where args is the concatenated CSS of the tokens."""
        rendered = [token.css() for token in self.arguments]
        return f"{self.name}({''.join(rendered)})"
class Function:
    """
    Represents selector:name(expr)
    """

    def __init__(self, selector: Tree, name: str, arguments: Sequence["Token"]) -> None:
        self.selector = selector
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self) -> str:
        values = [token.value for token in self.arguments]
        return (
            f"{self.__class__.__name__}"
            f"[{self.selector!r}:{self.name}({values!r})]"
        )

    def argument_types(self) -> List[str]:
        """Return the ``type`` of every argument token, in order."""
        return [token.type for token in self.arguments]

    def canonical(self) -> str:
        """Return the CSS text: inner selector, ``:name`` and the rendered arguments."""
        rendered = "".join(token.css() for token in self.arguments)
        return f"{self.selector.canonical()}:{self.name}({rendered})"

    def specificity(self) -> Tuple[int, int, int]:
        """Return the inner selector's specificity with the middle component plus one."""
        a, b, c = self.selector.specificity()
        return a, b + 1, c
class Pseudo:
    """
    Represents selector:ident
    """

    def __init__(self, selector: Tree, ident: str) -> None:
        self.selector = selector
        self.ident = ascii_lower(ident)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}[{self.selector!r}:{self.ident}]"

    def canonical(self) -> str:
        """Return the CSS text: the inner selector followed by ``:ident``."""
        return f"{self.selector.canonical()}:{self.ident}"

    def specificity(self) -> Tuple[int, int, int]:
        """Return the inner selector's specificity with the middle component plus one."""
        a, b, c = self.selector.specificity()
        return a, b + 1, c
class Negation:
    """
    Represents selector:not(subselector)
    """

    def __init__(self, selector: Tree, subselector: Tree) -> None:
        self.selector = selector
        self.subselector = subselector

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}"
            f"[{self.selector!r}:not({self.subselector!r})]"
        )

    def canonical(self) -> str:
        """Return the CSS text ``selector:not(subselector)``."""
        inner = self.subselector.canonical()
        # Drop the leading universal selector unless it is the whole argument.
        if len(inner) > 1:
            inner = inner.lstrip("*")
        return f"{self.selector.canonical()}:not({inner})"

    def specificity(self) -> Tuple[int, int, int]:
        """Return the component-wise sum of both selectors' specificities."""
        outer = self.selector.specificity()
        inner = self.subselector.specificity()
        a1, b1, c1 = outer
        a2, b2, c2 = inner
        return a1 + a2, b1 + b2, c1 + c2
class Relation:
    """
    Represents selector:has(subselector)
    """

    def __init__(self, selector: Tree, combinator: "Token", subselector: Selector):
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}"
            f"[{self.selector!r}:has({self.subselector!r})]"
        )

    def canonical(self) -> str:
        """Return the CSS text ``selector:has(subselector)``.

        The subselector is first treated as a sequence (its first item is
        rendered); if that raises ``TypeError`` it is rendered directly.
        """
        try:
            first = self.subselector[0]  # type: ignore
            inner = first.canonical()
        except TypeError:
            inner = self.subselector.canonical()
        # Drop the leading universal selector unless it is the whole argument.
        if len(inner) > 1:
            inner = inner.lstrip("*")
        return f"{self.selector.canonical()}:has({inner})"

    def specificity(self) -> Tuple[int, int, int]:
        """Return the component-wise sum of both selectors' specificities.

        The subselector is first treated as a sequence (its last item is
        used); if that raises ``TypeError`` it is used directly.
        """
        a1, b1, c1 = self.selector.specificity()
        try:
            last = self.subselector[-1]  # type: ignore
            a2, b2, c2 = last.specificity()
        except TypeError:
            a2, b2, c2 = self.subselector.specificity()
        return a1 + a2, b1 + b2, c1 + c2
class Matching:
    """
    Represents selector:is(selector_list)
    """

    def __init__(self, selector: Tree, selector_list: Iterable[Tree]):
        self.selector = selector
        self.selector_list = selector_list

    def __repr__(self) -> str:
        return "%s[%r:is(%s)]" % (
            self.__class__.__name__,
            self.selector,
            ", ".join(map(repr, self.selector_list)),
        )

    def canonical(self) -> str:
        """Return the CSS text ``selector:is(arg, arg, ...)``.

        A leading universal selector is dropped from each argument, except
        when the argument is a bare ``*`` (stripping it would leave an empty,
        invalid argument).
        """
        selector_arguments = []
        for s in self.selector_list:
            selarg = s.canonical()
            # Same guard as the other parsed objects: keep a lone "*".
            if len(selarg) > 1:
                selarg = selarg.lstrip("*")
            selector_arguments.append(selarg)
        return "%s:is(%s)" % (
            self.selector.canonical(),
            ", ".join(map(str, selector_arguments)),
        )

    def specificity(self) -> Tuple[int, int, int]:
        """Return the specificity of ``selector:is(selector_list)``.

        The pseudo-class itself counts as its most specific argument; that
        value is added to the specificity of the selector it is attached to,
        in the same way ``:not()`` and ``:has()`` are handled in this module.
        """
        a1, b1, c1 = self.selector.specificity()
        a2, b2, c2 = max(x.specificity() for x in self.selector_list)
        return a1 + a2, b1 + b2, c1 + c2
class SpecificityAdjustment:
    """
    Represents selector:where(selector_list)
    Same as selector:is(selector_list), but the pseudo-class itself
    contributes a specificity of 0
    """

    def __init__(self, selector: Tree, selector_list: List[Tree]):
        self.selector = selector
        self.selector_list = selector_list

    def __repr__(self) -> str:
        return "%s[%r:where(%s)]" % (
            self.__class__.__name__,
            self.selector,
            ", ".join(map(repr, self.selector_list)),
        )

    def canonical(self) -> str:
        """Return the CSS text ``selector:where(arg, arg, ...)``.

        A leading universal selector is dropped from each argument, except
        when the argument is a bare ``*`` (stripping it would leave an empty,
        invalid argument).
        """
        selector_arguments = []
        for s in self.selector_list:
            selarg = s.canonical()
            # Same guard as the other parsed objects: keep a lone "*".
            if len(selarg) > 1:
                selarg = selarg.lstrip("*")
            selector_arguments.append(selarg)
        return "%s:where(%s)" % (
            self.selector.canonical(),
            ", ".join(map(str, selector_arguments)),
        )

    def specificity(self) -> Tuple[int, int, int]:
        """Return the specificity of ``selector:where(selector_list)``.

        ``:where()`` and its arguments add nothing, so the result is exactly
        the specificity of the selector the pseudo-class is attached to.
        """
        a, b, c = self.selector.specificity()
        return a, b, c
class Attrib:
    """
    Represents selector[namespace|attrib operator value]
    """

    @typing.overload
    def __init__(
        self,
        selector: Tree,
        namespace: Optional[str],
        attrib: str,
        operator: 'typing.Literal["exists"]',
        value: None,
    ) -> None: ...

    @typing.overload
    def __init__(
        self,
        selector: Tree,
        namespace: Optional[str],
        attrib: str,
        operator: str,
        value: "Token",
    ) -> None: ...

    def __init__(
        self,
        selector: Tree,
        namespace: Optional[str],
        attrib: str,
        operator: str,
        value: Optional["Token"],
    ) -> None:
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def _qualified_attrib(self) -> str:
        """Return ``namespace|attrib`` when a namespace is set, else ``attrib``."""
        if not self.namespace:
            return self.attrib
        return f"{self.namespace}|{self.attrib}"

    def __repr__(self) -> str:
        prefix = f"{self.__class__.__name__}[{self.selector!r}"
        attrib = self._qualified_attrib()
        if self.operator == "exists":
            return f"{prefix}[{attrib}]]"
        value = typing.cast("Token", self.value).value
        return f"{prefix}[{attrib} {self.operator} {value!r}]]"

    def canonical(self) -> str:
        """Return the CSS text ``selector[attrib]`` or ``selector[attrib<op>value]``."""
        inside = self._qualified_attrib()
        if self.operator != "exists":
            value_css = typing.cast("Token", self.value).css()
            inside = f"{inside}{self.operator}{value_css}"
        return f"{self.selector.canonical()}[{inside}]"

    def specificity(self) -> Tuple[int, int, int]:
        """Return the inner selector's specificity with the middle component plus one."""
        a, b, c = self.selector.specificity()
        return a, b + 1, c
class Element:
    """
    Represents namespace|element

    `None` is for the universal selector '*'

    """

    def __init__(
        self, namespace: Optional[str] = None, element: Optional[str] = None
    ) -> None:
        self.namespace = namespace
        self.element = element

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}[{self.canonical()}]"

    def canonical(self) -> str:
        """Return ``element`` (``*`` when unset), prefixed by ``namespace|`` if set."""
        name = self.element or "*"
        if not self.namespace:
            return name
        return f"{self.namespace}|{name}"

    def specificity(self) -> Tuple[int, int, int]:
        """Return ``(0, 0, 1)`` for a named element and ``(0, 0, 0)`` otherwise."""
        return (0, 0, 1) if self.element else (0, 0, 0)
class Hash:
    """
    Represents selector#id
    """

    def __init__(self, selector: Tree, id: str) -> None:
        self.selector = selector
        self.id = id

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}[{self.selector!r}#{self.id}]"

    def canonical(self) -> str:
        """Return the CSS text: the inner selector followed by ``#id``."""
        return f"{self.selector.canonical()}#{self.id}"

    def specificity(self) -> Tuple[int, int, int]:
        """Return the inner selector's specificity with the first component plus one."""
        a, b, c = self.selector.specificity()
        return a + 1, b, c
class CombinedSelector:
    """Represents two selectors joined by a combinator (``selector <comb> subselector``)."""

    def __init__(self, selector: Tree, combinator: str, subselector: Tree) -> None:
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self) -> str:
        # The descendant combinator (a single space) is shown as a readable tag.
        shown = "<followed>" if self.combinator == " " else self.combinator
        return (
            f"{self.__class__.__name__}"
            f"[{self.selector!r} {shown} {self.subselector!r}]"
        )

    def canonical(self) -> str:
        """Return the CSS text ``selector combinator subselector``."""
        right = self.subselector.canonical()
        # Drop the leading universal selector unless it is the whole text.
        if len(right) > 1:
            right = right.lstrip("*")
        return f"{self.selector.canonical()} {self.combinator} {right}"

    def specificity(self) -> Tuple[int, int, int]:
        """Return the component-wise sum of both selectors' specificities."""
        left = self.selector.specificity()
        right = self.subselector.specificity()
        a1, b1, c1 = left
        a2, b2, c2 = right
        return a1 + a2, b1 + b2, c1 + c2
528#### Parser
# Patterns recognising three very simple selector shapes.  Each tolerates
# surrounding whitespace and must match the whole input.

# Element name only: ``foo``
_el_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$")

# Optional element name followed by an id: ``foo#bar`` or ``#bar``
_id_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$")

# Optional element name followed by a class: ``foo.bar`` or ``.bar``
_class_re = re.compile(
    r"^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$"
)
def parse(css: str) -> List[Selector]:
    """Parse a CSS *group of selectors*.

    If you don't care about pseudo-elements or selector specificity,
    you can skip this and use :meth:`~GenericTranslator.css_to_xpath`.

    :param css:
        A *group of selectors* as a string.
    :raises:
        :class:`SelectorSyntaxError` on invalid selectors.
    :returns:
        A list of parsed :class:`Selector` objects, one for each
        selector in the comma-separated group.

    """
    # Shortcuts: a bare element, ``[element]#id`` and ``[element].class`` are
    # recognised with a regular expression and built directly.
    simple = _el_re.match(css)
    if simple is not None:
        return [Selector(Element(element=simple.group(1)))]

    simple = _id_re.match(css)
    if simple is not None:
        tag, ident = simple.groups()
        return [Selector(Hash(Element(element=tag or None), ident))]

    simple = _class_re.match(css)
    if simple is not None:
        tag, class_name = simple.groups()
        return [Selector(Class(Element(element=tag or None), class_name))]

    # Everything else goes through the tokenizer and the full parser.
    stream = TokenStream(tokenize(css), source=css)
    return list(parse_selector_group(stream))
575# except SelectorSyntaxError:
576# e = sys.exc_info()[1]
577# message = "%s at %s -> %r" % (
578# e, stream.used, stream.peek())
579# e.msg = message
580# e.args = tuple([message])
581# raise
def parse_selector_group(stream: "TokenStream") -> Iterator[Selector]:
    """Yield one :class:`Selector` per comma-separated selector read from *stream*."""
    stream.skip_whitespace()
    while True:
        tree, pseudo_element = parse_selector(stream)
        yield Selector(tree, pseudo_element)
        # Stop unless a comma announces another selector.
        if stream.peek() != ("DELIM", ","):
            return
        stream.next()
        stream.skip_whitespace()
def parse_selector(stream: "TokenStream") -> Tuple[Tree, Optional[PseudoElement]]:
    """Parse simple selectors joined by combinators.

    Reading stops at end of input or at a comma.

    :returns: ``(tree, pseudo_element)``
    :raises:
        :class:`SelectorSyntaxError` if a pseudo-element is followed by
        anything other than end of input or a comma.
    """
    result, pseudo_element = parse_simple_selector(stream)
    while True:
        stream.skip_whitespace()
        upcoming = stream.peek()
        if upcoming in (("EOF", None), ("DELIM", ",")):
            return result, pseudo_element
        if pseudo_element:
            raise SelectorSyntaxError(
                "Got pseudo-element ::%s not at the end of a selector" % pseudo_element
            )
        # Without an explicit combinator, the previous simple selector
        # stopped at whitespace: that is the descendant combinator.
        combinator = " "
        if upcoming.is_delim("+", ">", "~"):
            combinator = typing.cast(str, stream.next().value)
            stream.skip_whitespace()
        next_selector, pseudo_element = parse_simple_selector(stream)
        result = CombinedSelector(result, combinator, next_selector)
def parse_simple_selector(
    stream: "TokenStream", inside_negation: bool = False
) -> Tuple[Tree, Optional[PseudoElement]]:
    """Parse one simple selector (no combinators) from *stream*.

    :param inside_negation:
        When true, a closing ``)`` also ends the selector and a nested
        ``:not()`` is rejected.
    :returns: ``(tree, pseudo_element)``
    :raises: :class:`SelectorSyntaxError` on invalid input.
    """
    stream.skip_whitespace()
    selector_start = len(stream.used)

    # Optional leading type selector: ``ident``, ``*``, ``ns|ident`` or ``ns|*``.
    namespace = element = None
    first = stream.peek()
    if first.type == "IDENT" or first == ("DELIM", "*"):
        stream.next()
        leading = first.value if first.type == "IDENT" else None
        if stream.peek() == ("DELIM", "|"):
            stream.next()
            namespace = leading
            element = stream.next_ident_or_star()
        else:
            element = leading
    result: Tree = Element(namespace, element)
    pseudo_element: Optional[PseudoElement] = None

    while True:
        token = stream.peek()
        at_boundary = (
            token.type in ("S", "EOF")
            or token.is_delim(",", "+", ">", "~")
            or (inside_negation and token == ("DELIM", ")"))
        )
        if at_boundary:
            break
        if pseudo_element:
            raise SelectorSyntaxError(
                "Got pseudo-element ::%s not at the end of a selector" % pseudo_element
            )

        if token.type == "HASH":
            result = Hash(result, typing.cast(str, stream.next().value))
            continue
        if token == ("DELIM", "."):
            stream.next()
            result = Class(result, stream.next_ident())
            continue
        if token == ("DELIM", "|"):
            stream.next()
            result = Element(None, stream.next_ident())
            continue
        if token == ("DELIM", "["):
            stream.next()
            result = parse_attrib(result, stream)
            continue
        if token != ("DELIM", ":"):
            raise SelectorSyntaxError("Expected selector, got %s" % (token,))

        # From here on: pseudo-class or pseudo-element.
        stream.next()
        if stream.peek() == ("DELIM", ":"):
            # ``::name`` or ``::name(arguments)``
            stream.next()
            pseudo_element = stream.next_ident()
            if stream.peek() == ("DELIM", "("):
                stream.next()
                pseudo_element = FunctionalPseudoElement(
                    pseudo_element, parse_arguments(stream)
                )
            continue

        ident = stream.next_ident()
        lowered = ident.lower()
        if lowered in ("first-line", "first-letter", "before", "after"):
            # Special case: CSS 2.1 pseudo-elements can have a single ':'
            # Any new pseudo-element must have two.
            pseudo_element = str(ident)
            continue

        if stream.peek() != ("DELIM", "("):
            result = Pseudo(result, ident)
            if repr(result) == "Pseudo[Element[*]:scope]":
                # A bare ``:scope`` is only accepted at the very start of a
                # selector, judged from the tokens consumed so far.
                used = stream.used
                if not (
                    len(used) == 2
                    or (len(used) == 3 and used[0].type == "S")
                    or (len(used) >= 3 and used[-3].is_delim(","))
                    or (
                        len(used) >= 4
                        and used[-3].type == "S"
                        and used[-4].is_delim(",")
                    )
                ):
                    raise SelectorSyntaxError(
                        'Got immediate child pseudo-element ":scope" '
                        "not at the start of a selector"
                    )
            continue

        # Functional pseudo-class: consume "(" and dispatch on the name.
        stream.next()
        stream.skip_whitespace()
        if lowered == "not":
            if inside_negation:
                raise SelectorSyntaxError("Got nested :not()")
            argument, argument_pseudo_element = parse_simple_selector(
                stream, inside_negation=True
            )
            closing = stream.next()
            if argument_pseudo_element:
                raise SelectorSyntaxError(
                    "Got pseudo-element ::%s inside :not() at %s"
                    % (argument_pseudo_element, closing.pos)
                )
            if closing != ("DELIM", ")"):
                raise SelectorSyntaxError("Expected ')', got %s" % (closing,))
            result = Negation(result, argument)
        elif lowered == "has":
            combinator, arguments = parse_relative_selector(stream)
            result = Relation(result, combinator, arguments)
        elif lowered in ("matches", "is"):
            selectors = parse_simple_selector_arguments(stream)
            result = Matching(result, selectors)
        elif lowered == "where":
            selectors = parse_simple_selector_arguments(stream)
            result = SpecificityAdjustment(result, selectors)
        else:
            result = Function(result, ident, parse_arguments(stream))

    # Nothing consumed means there was no selector at this position.
    if len(stream.used) == selector_start:
        raise SelectorSyntaxError("Expected selector, got %s" % (stream.peek(),))
    return result, pseudo_element
def parse_arguments(stream: "TokenStream") -> List["Token"]:
    """Collect argument tokens from *stream* up to the closing ``)``.

    Accepted tokens are IDENT, STRING and NUMBER tokens and the ``+`` and
    ``-`` delimiters.  The closing parenthesis is consumed but not returned.

    :raises: :class:`SelectorSyntaxError` on any other token.
    """
    signs = (("DELIM", "+"), ("DELIM", "-"))
    collected: List["Token"] = []
    while True:
        stream.skip_whitespace()
        token = stream.next()
        if token.type in ("IDENT", "STRING", "NUMBER") or token in signs:
            collected.append(token)
            continue
        if token == ("DELIM", ")"):
            return collected
        raise SelectorSyntaxError("Expected an argument, got %s" % (token,))
def parse_relative_selector(stream: "TokenStream") -> Tuple["Token", Selector]:
    """Parse the argument of ``:has()`` up to and including the closing ``)``.

    An optional leading delimiter (``+``, ``-``, ``>`` or ``~``) is taken as
    the combinator; otherwise a synthetic ``" "`` delimiter token is used.
    The values of the remaining tokens are concatenated and handed to
    :func:`parse`; the first resulting selector is returned.

    :returns: ``(combinator, selector)``
    :raises: :class:`SelectorSyntaxError` on an unexpected token.
    """
    stream.skip_whitespace()
    token = stream.next()

    if token in [("DELIM", "+"), ("DELIM", "-"), ("DELIM", ">"), ("DELIM", "~")]:
        combinator = token
        stream.skip_whitespace()
        token = stream.next()
    else:
        combinator = Token("DELIM", " ", pos=0)

    pieces = []
    while token != ("DELIM", ")"):
        accepted = token.type in ("IDENT", "STRING", "NUMBER") or token in [
            ("DELIM", "."),
            ("DELIM", "*"),
        ]
        if not accepted:
            raise SelectorSyntaxError("Expected an argument, got %s" % (token,))
        pieces.append(typing.cast(str, token.value))
        token = stream.next()

    return combinator, parse("".join(pieces))[0]
def parse_simple_selector_arguments(stream: "TokenStream") -> List[Tree]:
    """Parse the comma-separated selector list of ``:is()`` / ``:where()``.

    Reads simple selectors from *stream* until the closing ``)``, which is
    consumed.

    :returns: The parsed selector trees, in source order.
    :raises:
        :class:`SelectorSyntaxError` if an argument carries a
        pseudo-element, or if anything other than ``,`` or ``)`` follows an
        argument (including end of input, i.e. an unclosed parenthesis).
    """
    arguments = []
    while True:
        result, pseudo_element = parse_simple_selector(stream, True)
        if pseudo_element:
            raise SelectorSyntaxError(
                "Got pseudo-element ::%s inside function" % (pseudo_element,)
            )
        stream.skip_whitespace()
        next = stream.next()
        if next == ("DELIM", ","):
            arguments.append(result)
            # Only whitespace may be skipped here.  Consuming a token
            # unconditionally would swallow the start of the next argument
            # when no space follows the comma, as in ``:is(a,b)``.
            stream.skip_whitespace()
        elif next == ("DELIM", ")"):
            arguments.append(result)
            return arguments
        else:
            # Covers EOF as well: reading on after the EOF token would run
            # the token iterator past its end.
            raise SelectorSyntaxError("Expected an argument, got %s" % (next,))
def parse_attrib(selector: Tree, stream: "TokenStream") -> Attrib:
    """Parse an attribute selector; the opening ``[`` is already consumed.

    Reads ``[namespace|]name``, then either the closing ``]`` (an "exists"
    test) or an operator, a STRING/IDENT value and the closing ``]``.

    :returns: An :class:`Attrib` wrapping *selector*.
    :raises: :class:`SelectorSyntaxError` on an unexpected token.
    """
    stream.skip_whitespace()
    attrib = stream.next_ident_or_star()
    # ``*`` is only meaningful as a namespace, so ``|`` must follow it.
    if attrib is None and stream.peek() != ("DELIM", "|"):
        raise SelectorSyntaxError("Expected '|', got %s" % (stream.peek(),))

    namespace: Optional[str] = None
    op: Optional[str] = None
    if stream.peek() == ("DELIM", "|"):
        stream.next()
        if stream.peek() == ("DELIM", "="):
            # ``name|=value``: the bar belonged to the ``|=`` operator.
            stream.next()
            op = "|="
        else:
            namespace = attrib
            attrib = stream.next_ident()

    if op is None:
        stream.skip_whitespace()
        token = stream.next()
        if token == ("DELIM", "]"):
            return Attrib(selector, namespace, typing.cast(str, attrib), "exists", None)
        if token == ("DELIM", "="):
            op = "="
        elif token.is_delim("^", "$", "*", "~", "|", "!") and (
            stream.peek() == ("DELIM", "=")
        ):
            op = typing.cast(str, token.value) + "="
            stream.next()
        else:
            raise SelectorSyntaxError("Operator expected, got %s" % (token,))

    stream.skip_whitespace()
    value = stream.next()
    if value.type not in ("IDENT", "STRING"):
        raise SelectorSyntaxError("Expected string or ident, got %s" % (value,))

    stream.skip_whitespace()
    closing = stream.next()
    if closing != ("DELIM", "]"):
        raise SelectorSyntaxError("Expected ']', got %s" % (closing,))
    return Attrib(selector, namespace, typing.cast(str, attrib), op, value)
def parse_series(tokens: Iterable["Token"]) -> Tuple[int, int]:
    """
    Parses the arguments for :nth-child() and friends.

    :param tokens: An iterable of tokens (any iterable, including a
        one-shot iterator).
    :raises: ``ValueError`` if a STRING token is present or the
        expression is not a valid ``an+b`` series.
    :returns: ``(a, b)``

    """
    # Materialise once: the tokens are scanned twice below, which would
    # silently exhaust a one-shot iterator on the first pass.
    tokens = list(tokens)
    for token in tokens:
        if token.type == "STRING":
            raise ValueError("String tokens not allowed in series.")
    s = "".join(typing.cast(str, token.value) for token in tokens).strip()
    if s == "odd":
        return 2, 1
    elif s == "even":
        return 2, 0
    elif s == "n":
        return 1, 0
    if "n" not in s:
        # Just b
        return 0, int(s)
    a, b = s.split("n", 1)
    a_as_int: int
    if not a:
        a_as_int = 1
    elif a == "-" or a == "+":
        # A bare sign stands for -1 / +1.
        a_as_int = int(a + "1")
    else:
        a_as_int = int(a)
    b_as_int: int
    if not b:
        b_as_int = 0
    else:
        b_as_int = int(b)
    return a_as_int, b_as_int
880#### Token objects
class Token(Tuple[str, Optional[str]]):
    """A ``(type, value)`` tuple that also carries its source position ``pos``."""

    # Set per instance in ``__new__``.
    pos: int

    @typing.overload
    def __new__(
        cls,
        type_: 'typing.Literal["IDENT", "HASH", "STRING", "S", "DELIM", "NUMBER"]',
        value: str,
        pos: int,
    ) -> "Token": ...

    @typing.overload
    def __new__(
        cls, type_: 'typing.Literal["EOF"]', value: None, pos: int
    ) -> "Token": ...

    def __new__(cls, type_: str, value: Optional[str], pos: int) -> "Token":
        token = tuple.__new__(cls, (type_, value))
        token.pos = pos
        return token

    def __repr__(self) -> str:
        return "<%s '%s' at %i>" % (self.type, self.value, self.pos)

    def is_delim(self, *values: str) -> bool:
        """Return whether this is a DELIM token whose value is one of *values*."""
        if self.type != "DELIM":
            return False
        return self.value in values

    @property
    def type(self) -> str:
        """The token type (first tuple item)."""
        return self[0]

    @property
    def value(self) -> Optional[str]:
        """The token value (second tuple item)."""
        return self[1]

    def css(self) -> str:
        """Return the value as text; STRING tokens are rendered with ``repr``."""
        if self.type != "STRING":
            return typing.cast(str, self.value)
        return repr(self.value)
class EOFToken(Token):
    """The end-of-input token: type ``"EOF"`` with value ``None``."""

    def __new__(cls, pos: int) -> "EOFToken":
        token = Token.__new__(cls, "EOF", None, pos)
        return typing.cast("EOFToken", token)

    def __repr__(self) -> str:
        return "<%s at %i>" % (self.type, self.pos)
933#### Tokenizer
class TokenMacros:
    """Named regular-expression fragments used to build the tokenizer patterns."""

    # A backslash, one to six hex digits, then an optional single whitespace.
    unicode_escape = r"\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?"
    # Either of the above, or a backslash followed by one other character.
    escape = unicode_escape + r"|\\[^\n\r\f0-9a-f]"
    # Inside strings, a backslash followed by a newline is also an escape.
    string_escape = r"\\(?:\n|\r\n|\r|\f)|" + escape
    nonascii = r"[^\0-\177]"
    nmchar = f"[_a-z0-9-]|{escape}|{nonascii}"
    nmstart = f"[_a-z]|{escape}|{nonascii}"
if typing.TYPE_CHECKING:

    class MatchFunc(typing.Protocol):
        """Static-typing-only description of a compiled pattern's ``match`` method."""

        def __call__(
            self, string: str, pos: int = ..., endpos: int = ...
        ) -> Optional["re.Match[str]"]: ...
def _compile(pattern: str) -> "MatchFunc":
    """Expand ``%(name)s`` macros from :class:`TokenMacros` in *pattern* and
    return the ``match`` method of the case-insensitive compiled regex."""
    expanded = pattern % vars(TokenMacros)
    compiled = re.compile(expanded, re.IGNORECASE)
    return compiled.match
# Token matchers: each is the bound ``match`` method of a compiled pattern.
_match_whitespace = _compile(r"[ \t\r\n\f]+")
_match_number = _compile(r"[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)")
_match_hash = _compile("#(?:%(nmchar)s)+")
_match_ident = _compile("-?(?:%(nmstart)s)(?:%(nmchar)s)*")

# String bodies, keyed by the quote character that opened the string.
_match_string_by_quote = {
    "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"),
    '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'),
}

# Bound ``sub`` methods used to undo the three kinds of escape.
_sub_simple_escape = re.compile(r"\\(.)").sub
_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.I).sub
_sub_newline_escape = re.compile(r"\\(?:\n|\r\n|\r|\f)").sub

# Same as r'\1', but faster on CPython
_replace_simple = operator.methodcaller("group", 1)
def _replace_unicode(match: "re.Match[str]") -> str:
    """Return the character for the hex code point captured in group 1.

    Code points above ``sys.maxunicode`` are replaced by U+FFFD.
    """
    codepoint = int(match.group(1), 16)
    return chr(codepoint if codepoint <= sys.maxunicode else 0xFFFD)
def unescape_ident(value: str) -> str:
    """Return *value* with unicode escapes, then simple backslash escapes, resolved."""
    without_unicode_escapes = _sub_unicode_escape(_replace_unicode, value)
    return _sub_simple_escape(_replace_simple, without_unicode_escapes)
def tokenize(s: str) -> Iterator[Token]:
    """Yield the tokens of *s*, followed by a final :class:`EOFToken`.

    At each position the matchers are tried in a fixed order: whitespace,
    identifier, hash, quoted string, number, ``/* ... */`` comment.  Anything
    else is emitted as a one-character DELIM token.  Comments produce no
    token.

    :raises: :class:`SelectorSyntaxError` on an unclosed or invalid string.
    """
    cursor = 0
    end = len(s)
    while cursor < end:
        found = _match_whitespace(s, pos=cursor)
        if found:
            # A whole run of whitespace becomes a single token.
            yield Token("S", " ", cursor)
            cursor = found.end()
            continue

        found = _match_ident(s, pos=cursor)
        if found:
            yield Token("IDENT", unescape_ident(found.group()), cursor)
            cursor = found.end()
            continue

        found = _match_hash(s, pos=cursor)
        if found:
            # The leading "#" is not part of the value.
            yield Token("HASH", unescape_ident(found.group()[1:]), cursor)
            cursor = found.end()
            continue

        quote = s[cursor]
        if quote in _match_string_by_quote:
            found = _match_string_by_quote[quote](s, pos=cursor + 1)
            assert found, "Should have found at least an empty match"
            closing = found.end()
            if closing == end:
                raise SelectorSyntaxError("Unclosed string at %s" % cursor)
            if s[closing] != quote:
                raise SelectorSyntaxError("Invalid string at %s" % cursor)
            # Escaped newlines are removed before the other escapes are resolved.
            text = unescape_ident(_sub_newline_escape("", found.group()))
            yield Token("STRING", text, cursor)
            cursor = closing + 1
            continue

        found = _match_number(s, pos=cursor)
        if found:
            yield Token("NUMBER", found.group(), cursor)
            cursor = found.end()
            continue

        if s[cursor : cursor + 2] == "/*":
            # Skip the comment; an unterminated one runs to the end of input.
            comment_end = s.find("*/", cursor + 2)
            cursor = end if comment_end == -1 else comment_end + 2
            continue

        yield Token("DELIM", s[cursor], cursor)
        cursor += 1

    assert cursor == end
    yield EOFToken(cursor)
class TokenStream:
    """Token iterator wrapper with one-token look-ahead.

    Every token handed out by :meth:`next` is also appended to ``used``.
    """

    def __init__(self, tokens: Iterable[Token], source: Optional[str] = None) -> None:
        self.used: List[Token] = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked: Optional[Token] = None
        # True while ``peeked`` holds a token that has not been consumed yet.
        self._peeking = False
        self.next_token = self.tokens.__next__

    def next(self) -> Token:
        """Consume and return the next token, recording it in ``used``."""
        if self._peeking:
            self._peeking = False
            token = typing.cast(Token, self.peeked)
        else:
            token = self.next_token()
        self.used.append(token)
        return token

    def peek(self) -> Token:
        """Return the next token without consuming it."""
        if not self._peeking:
            self.peeked = self.next_token()
            self._peeking = True
        return typing.cast(Token, self.peeked)

    def next_ident(self) -> str:
        """Consume the next token, which must be an IDENT, and return its value."""
        token = self.next()
        if token.type == "IDENT":
            return typing.cast(str, token.value)
        raise SelectorSyntaxError("Expected ident, got %s" % (token,))

    def next_ident_or_star(self) -> Optional[str]:
        """Consume an IDENT (returning its value) or a ``*`` (returning ``None``)."""
        token = self.next()
        if token.type == "IDENT":
            return token.value
        if token == ("DELIM", "*"):
            return None
        raise SelectorSyntaxError("Expected ident or '*', got %s" % (token,))

    def skip_whitespace(self) -> None:
        """Consume the next token if it is a whitespace (``S``) token."""
        if self.peek().type == "S":
            self.next()