Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/soupsieve/css_parser.py: 87%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""CSS selector parser."""
2from __future__ import annotations
3import re
4from functools import lru_cache
5from . import util
6from . import css_match as cm
7from . import css_types as ct
8from .util import SelectorSyntaxError
9import warnings
10from typing import Match, Any, Iterator, cast
11from dataclasses import dataclass
12from collections import UserDict
13import threading
# Lock taken while a `SelectorPattern` lazily compiles its regular expression.
RE_LOCK = threading.Lock()
# Re-entrant lock taken while `PseudoSelectorMap` lazily compiles a selector.
SEL_LOCK = threading.RLock()

# Code point substituted for a NUL escape.
UNICODE_REPLACEMENT_CHAR = 0xFFFD

# Upper bound enforced by `CSSParser.check_count`.
SELECTOR_LIMIT = 8192

# Simple pseudo classes that take no parameters
PSEUDO_SIMPLE = {
    ':any-link',
    ':empty',
    ':first-child',
    ':first-of-type',
    ':in-range',
    ':open',
    ':out-of-range',
    ':last-child',
    ':last-of-type',
    ':link',
    ':only-child',
    ':only-of-type',
    ':root',
    ':checked',
    ':default',
    ':disabled',
    ':enabled',
    ':indeterminate',
    ':optional',
    ':placeholder-shown',
    ':read-only',
    ':read-write',
    ':required',
    ':scope',
    ':defined',
    ':muted',
}

# Supported, simple pseudo classes that match nothing in the Soup Sieve environment
PSEUDO_SIMPLE_NO_MATCH = {
    ':active',
    ':autofill',
    ':buffering',
    ':current',
    ':focus',
    ':focus-visible',
    ':focus-within',
    ':fullscreen',
    ':future',
    ':host',
    ':hover',
    ':local-link',
    ':past',
    ':paused',
    ':picture-in-picture',
    ':playing',
    ':popover-open',
    ':seeking',
    ':stalled',
    ':target',
    ':target-within',
    ':user-invalid',
    ':volume-locked',
    ':visited',
}

# Complex pseudo classes that take selector lists
PSEUDO_COMPLEX = {
    ':contains',
    ':-soup-contains',
    ':-soup-contains-own',
    ':has',
    ':is',
    ':matches',
    ':not',
    ':where',
}

# Complex (parenthesized) pseudo classes that are accepted but match nothing
PSEUDO_COMPLEX_NO_MATCH = {
    ':current',
    ':host',
    ':host-context',
}

# Complex pseudo classes that take very specific parameters and are handled special
PSEUDO_SPECIAL = {
    ':dir',
    ':lang',
    ':nth-child',
    ':nth-last-child',
    ':nth-last-of-type',
    ':nth-of-type',
}

# Every pseudo-class name known to the parser, in any of its forms.
PSEUDO_SUPPORTED = PSEUDO_SIMPLE.union(
    PSEUDO_SIMPLE_NO_MATCH,
    PSEUDO_COMPLEX,
    PSEUDO_COMPLEX_NO_MATCH,
    PSEUDO_SPECIAL,
)
# Sub-patterns parts
#
# These strings are fragments that get embedded in one another; the ones that
# embed `IDENTIFIER` contain literal newlines and so must be compiled with
# `re.X` (as `SelectorPattern.re_pattern` does).

# Whitespace
NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])'
WS = fr'(?:[ \t]|{NEWLINE})'
# Comments
COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)'
# Whitespace with comments included
WSC = fr'(?:{WS}|{COMMENTS})'
# CSS escapes
CSS_ESCAPES = fr'(?:\\(?:[a-f0-9]{{1,6}}{WS}?|[^\r\n\f]|$))'
CSS_STRING_ESCAPES = fr'(?:\\(?:[a-f0-9]{{1,6}}{WS}?|[^\r\n\f]|$|{NEWLINE}))'
# CSS Identifier
IDENTIFIER = fr'''
(?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{CSS_ESCAPES})+|--)
(?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{CSS_ESCAPES})*)
'''
# `nth` content
NTH = fr'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){WSC}*(?:[-+]){WSC}*(?:[0-9]+))?'
# Value: quoted string or identifier
VALUE = fr'''(?:"(?:\\(?:.|{NEWLINE})|[^\\"\r\n\f])*?"|'(?:\\(?:.|{NEWLINE})|[^\\'\r\n\f])*?'|{IDENTIFIER})'''
# Attribute value comparison. `!=` is handled special as it is non-standard.
ATTR = fr'(?:{WSC}*(?P<cmp>[!~^|*$]?=){WSC}*(?P<value>{VALUE})(?:{WSC}*(?P<case>[is]))?)?{WSC}*'

# Selector patterns
# IDs (`#id`)
PAT_ID = fr'\#{IDENTIFIER}'
# Classes (`.class`)
PAT_CLASS = fr'\.{IDENTIFIER}'
# Prefix:Tag (`prefix|tag`)
PAT_TAG = fr'(?P<tag_ns>(?:{IDENTIFIER}|\*)?\|)?(?P<tag_name>{IDENTIFIER}|\*)'
# Attributes (`[attr]`, `[attr=value]`, etc.)
PAT_ATTR = fr'\[{WSC}*(?P<attr_ns>(?:{IDENTIFIER}|\*)?\|)?(?P<attr_name>{IDENTIFIER}){ATTR}\]'
# Pseudo class (`:pseudo-class`, `:pseudo-class(`)
PAT_PSEUDO_CLASS = fr'(?P<name>:{IDENTIFIER})(?P<open>\({WSC}*)?'
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
PAT_PSEUDO_CLASS_SPECIAL = fr'(?P<name>:{IDENTIFIER})(?P<open>\({WSC}*)'
# Custom pseudo class (`:--custom-pseudo`)
PAT_PSEUDO_CLASS_CUSTOM = fr'(?P<name>:(?=--){IDENTIFIER})'
# Nesting ampersand selector. Matches `&`
PAT_AMP = r'&'
# Closing pseudo group (`)`)
PAT_PSEUDO_CLOSE = fr'{WSC}*\)'
# Pseudo element (`::pseudo-element`)
PAT_PSEUDO_ELEMENT = fr':{PAT_PSEUDO_CLASS}'
# At rule (`@page`, `@media`, etc.) (not supported).
# The pattern is `@` followed by an identifier; a stray literal `P` after the
# `@` previously limited detection to at-rules whose name starts with "p".
PAT_AT_RULE = fr'@{IDENTIFIER}'
# Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.)
PAT_PSEUDO_NTH_CHILD = fr'''
(?P<pseudo_nth_child>{PAT_PSEUDO_CLASS_SPECIAL}
(?P<nth_child>{NTH}|even|odd))(?:{WSC}*\)|(?P<of>{COMMENTS}*{WS}{WSC}*of{COMMENTS}*{WS}{WSC}*))
'''
# Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.)
PAT_PSEUDO_NTH_TYPE = fr'''
(?P<pseudo_nth_type>{PAT_PSEUDO_CLASS_SPECIAL}
(?P<nth_type>{NTH}|even|odd)){WSC}*\)
'''
# Pseudo class language (`:lang("*-de", en)`)
PAT_PSEUDO_LANG = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<values>{VALUE}(?:{WSC}*,{WSC}*{VALUE})*){WSC}*\)'
# Pseudo class direction (`:dir(ltr)`)
PAT_PSEUDO_DIR = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<dir>ltr|rtl){WSC}*\)'
# Combining characters (`>`, `~`, ` `, `+`, `,`)
PAT_COMBINE = fr'{WSC}*?(?P<relation>[,+>~]|{WS}(?![,+>~])){WSC}*'
# Extra: Contains (`:contains(text)`)
PAT_PSEUDO_CONTAINS = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<values>{VALUE}(?:{WSC}*,{WSC}*{VALUE})*){WSC}*\)'
# Regular expressions
# CSS escape pattern.
# Group 1: hex escape (plus optional trailing whitespace/comment), group 2: escaped
# character, group 3: backslash at end of input. The string variant adds
# group 4: an escaped newline.
RE_CSS_ESC = re.compile(fr'(?:(\\[a-f0-9]{{1,6}}{WSC}?)|(\\[^\r\n\f])|(\\$))', re.I)
RE_CSS_STR_ESC = re.compile(fr'(?:(\\[a-f0-9]{{1,6}}{WS}?)|(\\[^\r\n\f])|(\\$)|(\\{NEWLINE}))', re.I)
# Pattern to break up `nth` specifiers
RE_NTH = re.compile(fr'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){WSC}*(?P<s2>[-+]){WSC}*(?P<b>[0-9]+))?', re.I)
# Pattern to iterate multiple values.
RE_VALUES = re.compile(fr'(?:(?P<value>{VALUE})|(?P<split>{WSC}*,{WSC}*))', re.X)
# Whitespace checks
RE_WS = re.compile(WS)
RE_WS_BEGIN = re.compile(fr'^{WSC}*')
RE_WS_END = re.compile(fr'{WSC}*$')
# Validates a complete custom pseudo-class name (`:--name`).
RE_CUSTOM = re.compile(fr'^{PAT_PSEUDO_CLASS_CUSTOM}$', re.X)
# Finds the `:name(` prefix used by `SpecialPseudoPattern` to pick a sub-pattern.
RE_PSEUDO_CLASS_SPECIAL = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
# Constants
# List split token
COMMA_COMBINATOR = ','
# Relation token for descendant
WS_COMBINATOR = ' '

# Parse flags (one bit each, combined with `|`)
FLG_PSEUDO = 1 << 0
FLG_NOT = 1 << 1
FLG_RELATIVE = 1 << 2
FLG_DEFAULT = 1 << 3
FLG_HTML = 1 << 4
FLG_INDETERMINATE = 1 << 5
FLG_OPEN = 1 << 6
FLG_IN_RANGE = 1 << 7
FLG_OUT_OF_RANGE = 1 << 8
FLG_PLACEHOLDER_SHOWN = 1 << 9
FLG_FORGIVE = 1 << 10

# Maximum cached patterns to store
_MAXCACHE = 500
@lru_cache(maxsize=_MAXCACHE)
def _cached_css_compile(
    pattern: str,
    namespaces: ct.Namespaces | None,
    custom: ct.CustomSelectors | None,
    flags: int
) -> cm.SoupSieve:
    """
    Compile `pattern` into a `SoupSieve` object, memoized on all four arguments.

    The custom selectors are normalized with `process_custom` and handed to a
    `CSSParser`; the original `custom` mapping is what gets stored on the result.
    """

    parser = CSSParser(pattern, custom=process_custom(custom), flags=flags)
    selectors = parser.process_selectors()
    return cm.SoupSieve(pattern, selectors, namespaces, custom, flags)
def _purge_cache() -> None:
    """Drop every entry memoized by `_cached_css_compile`."""

    _cached_css_compile.cache_clear()
def process_custom(custom: ct.CustomSelectors | None) -> dict[str, str | ct.SelectorList]:
    """
    Validate and normalize user supplied custom selectors.

    Each key is lowered and must be a well formed custom pseudo-class name
    (`:--name`). The returned dictionary is keyed the same way names are looked
    up while parsing: unescaped first, then lowered.

    Raises `SelectorSyntaxError` for an invalid name and `KeyError` when two
    keys normalize to the same name.
    """

    custom_selectors: dict[str, str | ct.SelectorList] = {}
    if custom is None:
        return custom_selectors

    for key, value in custom.items():
        name = util.lower(key)
        if RE_CUSTOM.match(name) is None:
            raise SelectorSyntaxError(f"The name '{name}' is not a valid custom pseudo-class name")
        # Normalize exactly like `parse_pseudo_class_custom` does for lookups, and
        # test for duplicates on the normalized key (the one actually stored) so
        # differently escaped spellings of one name cannot silently overwrite.
        normalized = util.lower(css_unescape(name))
        if normalized in custom_selectors:
            raise KeyError(f"The custom selector '{name}' has already been registered")
        custom_selectors[normalized] = value
    return custom_selectors
def css_unescape(content: str, string: bool = False) -> str:
    """
    Unescape CSS value.

    Strings allow for spanning the value on multiple strings by escaping a new line.

    `content` is the escaped text; `string` selects the string flavored escape
    pattern, which additionally removes an escaped newline. Hex escapes that
    denote NUL, a surrogate, or a value above U+10FFFF become U+FFFD.
    """

    def replace(m: Match[str]) -> str:
        """Replace with the appropriate substitute."""

        hex_escape = m.group(1)
        if hex_escape:
            codepoint = int(hex_escape[1:], 16)
            # `chr` raises above U+10FFFF, and lone surrogates are not valid
            # scalar values; CSS maps all of these (and NUL) to U+FFFD.
            if codepoint == 0 or codepoint > 0x10FFFF or 0xD800 <= codepoint <= 0xDFFF:
                codepoint = UNICODE_REPLACEMENT_CHAR
            return chr(codepoint)
        if m.group(2):
            # Escaped literal character: drop the backslash.
            return m.group(2)[1:]
        if m.group(3):
            # Backslash at the very end of the input.
            return '\ufffd'
        # Escaped newline inside a string: line continuation, contributes nothing.
        return ''

    return (RE_CSS_STR_ESC if string else RE_CSS_ESC).sub(replace, content)
def escape(ident: str) -> str:
    """
    Escape `ident` so it can be used as a CSS identifier.

    NUL becomes U+FFFD, control characters and a leading digit (optionally
    after one leading dash) become hex escapes, ASCII letters, digits, `-`,
    `_` and anything at or above U+0080 pass through, and every other
    character is prefixed with a backslash.
    """

    # A lone dash is not a valid identifier and must itself be escaped.
    if ident == '-':
        return '\\-'

    leading_dash = ident.startswith('-')
    pieces = []
    for position, char in enumerate(ident):
        cp = ord(char)
        if cp == 0x00:
            pieces.append('\ufffd')
            continue

        is_control = cp <= 0x1F or cp == 0x7F
        is_digit = 0x30 <= cp <= 0x39
        at_ident_start = position == 0 or (leading_dash and position == 1)

        if is_control or (is_digit and at_ident_start):
            pieces.append(f'\\{cp:x} ')
        elif (
            char in '-_' or cp >= 0x80 or is_digit or
            (0x41 <= cp <= 0x5A) or (0x61 <= cp <= 0x7A)
        ):
            pieces.append(char)
        else:
            pieces.append(f'\\{char}')
    return ''.join(pieces)
class SelectorPattern:
    """A named token pattern whose regular expression is compiled on first use."""

    def __init__(self, name: str, pattern: str) -> None:
        """Store the token name and the pattern source; nothing is compiled yet."""

        self.name = name
        self.pattern = pattern
        self._re_pattern: re.Pattern[str] | None = None

    @property
    def re_pattern(self) -> re.Pattern[str]:
        """Return the compiled pattern, compiling it under `RE_LOCK` if needed."""

        with RE_LOCK:
            compiled = self._re_pattern
            if compiled is None:
                compiled = self._re_pattern = re.compile(self.pattern, re.I | re.X | re.U)
        return compiled

    def get_name(self) -> str:
        """Return the token name."""

        return self.name

    def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
        """Match the pattern against `selector` starting at `index` (`flags` is not used here)."""

        return self.re_pattern.match(selector, index)
class SpecialPseudoPattern(SelectorPattern):
    """Dispatches a `:name(` token to the sub-pattern registered for that pseudo-class name."""

    def __init__(self, patterns: tuple[tuple[str, tuple[str, ...], str, type[SelectorPattern]], ...]) -> None:
        """
        Build the name to pattern table.

        Each entry is `(token name, pseudo-class names, regex source, pattern class)`;
        one pattern object is created per entry and shared by all of its names.
        """

        self.patterns = {}
        for token_name, pseudo_names, regex, pattern_cls in patterns:
            shared = pattern_cls(token_name, regex)
            self.patterns.update(dict.fromkeys(pseudo_names, shared))

        self.matched_name: SelectorPattern | None = None

    def get_name(self) -> str:
        """Return the token name of the last sub-pattern that matched, or `''` if none has."""

        matched = self.matched_name
        if matched is None:
            return ''
        return matched.get_name()

    def match(self, selector: str, index: int, flags: int) -> Match[str] | None:
        """Match the sub-pattern selected by the pseudo-class name found at `index`."""

        prefix = RE_PSEUDO_CLASS_SPECIAL.match(selector, index)
        if not prefix:
            return None

        handler = self.patterns.get(util.lower(css_unescape(prefix.group('name'))))
        if not handler:
            return None

        found = handler.match(selector, index, flags)
        if found:
            # Remembered so `get_name` can report which token was recognized.
            self.matched_name = handler
        return found
class _Selector:
    """
    Intermediate selector class.

    Mutable scratch object that accumulates the parts of one compound selector
    while it is being parsed. `freeze` turns the collected data into the
    immutable `ct.Selector` (or `ct.SelectorNull`) form.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize every field from `kwargs`, falling back to an empty default."""

        take = kwargs.get
        self.tag: ct.SelectorTag | None = take('tag', None)
        self.ids: list[str] = take('ids', [])
        self.classes: list[str] = take('classes', [])
        self.attributes: list[ct.SelectorAttribute] = take('attributes', [])
        self.nth: list[ct.SelectorNth] = take('nth', [])
        self.selectors: list[ct.SelectorList] = take('selectors', [])
        self.relations: list[_Selector] = take('relations', [])
        self.rel_type: str | None = take('rel_type', None)
        self.contains: list[ct.SelectorContains] = take('contains', [])
        self.lang: list[ct.SelectorLang] = take('lang', [])
        self.flags: int = take('flags', 0)
        self.no_match: bool = take('no_match', False)

    def _freeze_relations(self, relations: list[_Selector]) -> ct.SelectorList:
        """Chain `relations` under its first element and freeze that chain."""

        if not relations:
            return ct.SelectorList()

        head = relations[0]
        # The remaining relations hang off the first one (this mutates `head`).
        head.relations.extend(relations[1:])
        return ct.SelectorList([head.freeze()])

    def freeze(self) -> ct.Selector | ct.SelectorNull:
        """Return the immutable form of this selector."""

        if self.no_match:
            return ct.SelectorNull()

        return ct.Selector(
            self.tag,
            tuple(self.ids),
            tuple(self.classes),
            tuple(self.attributes),
            tuple(self.nth),
            tuple(self.selectors),
            self._freeze_relations(self.relations),
            self.rel_type,
            tuple(self.contains),
            tuple(self.lang),
            self.flags
        )

    def __str__(self) -> str:  # pragma: no cover
        """String representation."""

        names = (
            'tag', 'ids', 'classes', 'attributes', 'nth', 'selectors', 'relations',
            'rel_type', 'contains', 'lang', 'flags', 'no_match'
        )
        body = ', '.join(f'{name}={getattr(self, name)!r}' for name in names)
        return f'_Selector({body})'

    __repr__ = __str__
@dataclass
class CSSPattern:
    """Selector source text plus parse flags, kept until `CSSParser` compiles it."""

    # Raw CSS selector text.
    selector: str
    # `FLG_*` bits passed to `process_selectors` when the pattern is compiled.
    flags: int
class PseudoSelectorMap(UserDict[str, CSSPattern | ct.SelectorList]):
    """Mapping of pseudo-class names to selectors that are compiled on first access."""

    def __setitem__(self, key: str, value: CSSPattern | ct.SelectorList) -> None:
        """Store `value` as is; a `CSSPattern` stays uncompiled until it is read."""

        self.data[key] = value

    def __getitem__(self, key: str) -> ct.SelectorList:
        """
        Return the selector list for `key`.

        A stored `CSSPattern` is compiled with `CSSParser` and the result
        replaces it in the map; this happens while holding `SEL_LOCK`.
        """

        with SEL_LOCK:
            entry = self.data[key]
            if not isinstance(entry, CSSPattern):
                return entry

            compiled = CSSParser(entry.selector).process_selectors(flags=entry.flags)
            self.data[key] = compiled
            return compiled
# Selector definitions for the pseudo-classes that expand to ordinary selectors.
# Each one is kept as an uncompiled `CSSPattern` (selector text + parse flags).

# CSS pattern for `:link` and `:any-link`
CSS_LINK = CSSPattern('html|*:is(a, area)[href]', FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:checked`
CSS_CHECKED = CSSPattern(
    '''
    html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected]
    ''',
    FLG_PSEUDO | FLG_HTML
)
# CSS pattern for `:default` (must compile CSS_CHECKED first)
CSS_DEFAULT = CSSPattern(
    '''
    :checked,

    /*
    This pattern must be at the end.
    Special logic is applied to the last selector.
    */
    html|form html|*:is(button, input)[type="submit"]
    ''',
    FLG_PSEUDO | FLG_HTML | FLG_DEFAULT
)
# CSS pattern for `:indeterminate`
CSS_INDETERMINATE = CSSPattern(
    '''
    html|input[type="checkbox"][indeterminate],
    html|input[type="radio"]:is(:not([name]), [name=""]):not([checked]),
    html|progress:not([value]),

    /*
    This pattern must be at the end.
    Special logic is applied to the last selector.
    */
    html|input[type="radio"][name]:not([name='']):not([checked])
    ''',
    FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE
)
# CSS pattern for `:disabled`
CSS_DISABLED = CSSPattern(
    '''
    html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
    html|optgroup[disabled] > html|option,
    html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset),
    html|fieldset[disabled] >
        html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset)
    ''',
    FLG_PSEUDO | FLG_HTML
)
# CSS pattern for `:enabled`
CSS_ENABLED = CSSPattern(
    '''
    html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
    ''',
    FLG_PSEUDO | FLG_HTML
)
# CSS pattern for `:required`
CSS_REQUIRED = CSSPattern('html|*:is(input, textarea, select)[required]', FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:optional`
CSS_OPTIONAL = CSSPattern('html|*:is(input, textarea, select):not([required])', FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:placeholder-shown`
CSS_PLACEHOLDER_SHOWN = CSSPattern(
    '''
    html|input:is(
        :not([type]),
        [type=""],
        [type=text],
        [type=search],
        [type=url],
        [type=tel],
        [type=email],
        [type=password],
        [type=number]
    )[placeholder]:not([placeholder='']):is(:not([value]), [value=""]),
    html|textarea[placeholder]:not([placeholder=''])
    ''',
    FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN
)
# CSS pattern for `:read-write` (CSS_DISABLED must be compiled first)
CSS_READ_WRITE = CSSPattern(
    '''
    html|*:is(
        textarea,
        input:is(
            :not([type]),
            [type=""],
            [type=text],
            [type=search],
            [type=url],
            [type=tel],
            [type=email],
            [type=number],
            [type=password],
            [type=date],
            [type=datetime-local],
            [type=month],
            [type=time],
            [type=week]
        )
    ):not([readonly], :disabled),
    html|*:is([contenteditable=""], [contenteditable="true" i])
    ''',
    FLG_PSEUDO | FLG_HTML
)
# CSS pattern for `:read-only`
CSS_READ_ONLY = CSSPattern('html|*:not(:read-write)', FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:in-range`
# (same selector text as `:out-of-range`; only the flag differs)
CSS_IN_RANGE = CSSPattern(
    '''
    html|input:is(
        [type="date"],
        [type="month"],
        [type="week"],
        [type="time"],
        [type="datetime-local"],
        [type="number"],
        [type="range"]
    ):is(
        [min],
        [max]
    )
    ''',
    FLG_PSEUDO | FLG_HTML | FLG_IN_RANGE
)
# CSS pattern for `:out-of-range`
CSS_OUT_OF_RANGE = CSSPattern(
    '''
    html|input:is(
        [type="date"],
        [type="month"],
        [type="week"],
        [type="time"],
        [type="datetime-local"],
        [type="number"],
        [type="range"]
    ):is(
        [min],
        [max]
    )
    ''',
    FLG_PSEUDO | FLG_HTML | FLG_OUT_OF_RANGE
)
# CSS pattern for :open
CSS_OPEN = CSSPattern('html|*:is(details, dialog)[open]', FLG_PSEUDO | FLG_HTML)
# CSS pattern for :muted
CSS_MUTED = CSSPattern('html|*:is(video, audio)[muted]', FLG_PSEUDO | FLG_HTML)
# CSS pattern default for `:nth-child` "of S" feature
CSS_NTH_OF_S_DEFAULT = CSSPattern("*|*", FLG_PSEUDO)
623class CSSParser:
624 """Parse CSS selectors."""
# Token patterns recognized by the parser. The `SpecialPseudoPattern` entry
# bundles the pseudo-classes with dedicated argument syntax and selects the
# right sub-pattern by pseudo-class name.
# NOTE(review): presumably the tokenizer tries these in order — confirm against
# the tokenizing loop, which is outside this view.
CSS_TOKENS = (
    SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
    SpecialPseudoPattern(
        (
            (
                "pseudo_contains",
                (':contains', ':-soup-contains', ':-soup-contains-own'),
                PAT_PSEUDO_CONTAINS,
                SelectorPattern
            ),
            ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
            ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
            ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
            ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
        )
    ),
    SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
    SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS),
    SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT),
    SelectorPattern("amp", PAT_AMP),
    SelectorPattern("at_rule", PAT_AT_RULE),
    SelectorPattern("id", PAT_ID),
    SelectorPattern("class", PAT_CLASS),
    SelectorPattern("tag", PAT_TAG),
    SelectorPattern("attribute", PAT_ATTR),
    SelectorPattern("combine", PAT_COMBINE)
)

# Pseudos that expand to selectors
# (values are `CSSPattern` objects that the map compiles on first lookup;
# `<nth-of-s>` is an internal key used for the default "of S" selector)
PSEUDO_SELECTORS = PseudoSelectorMap(
    {
        ':link': CSS_LINK,
        ':any-link': CSS_LINK,
        ':checked': CSS_CHECKED,
        ':default': CSS_DEFAULT,
        ':indeterminate': CSS_INDETERMINATE,
        ':disabled': CSS_DISABLED,
        ':enabled': CSS_ENABLED,
        ':required': CSS_REQUIRED,
        ':muted': CSS_MUTED,
        ':open': CSS_OPEN,
        ':optional': CSS_OPTIONAL,
        ':read-only': CSS_READ_ONLY,
        ':read-write': CSS_READ_WRITE,
        ':in-range': CSS_IN_RANGE,
        ':out-of-range': CSS_OUT_OF_RANGE,
        ':placeholder-shown': CSS_PLACEHOLDER_SHOWN,
        '<nth-of-s>': CSS_NTH_OF_S_DEFAULT
    }
)
def __init__(
    self,
    selector: str,
    custom: dict[str, str | ct.SelectorList] | None = None,
    flags: int = 0
) -> None:
    """
    Prepare a parser for `selector`.

    NUL characters in the selector are replaced with U+FFFD, `custom` (when
    given) is used as the live table of custom selectors, and the selector
    counter starts at zero.
    """

    if custom is None:
        custom = {}

    self.count = 0
    self.custom = custom
    self.flags = flags
    self.debug = flags & util.DEBUG
    self.pattern = selector.replace('\x00', '\ufffd')
def check_count(self) -> None:
    """Raise `ValueError` once the running selector count is above `SELECTOR_LIMIT`."""

    over_limit = self.count > SELECTOR_LIMIT
    if over_limit:
        raise ValueError(f'Selector exceeds pseudo-class nesting limit of {SELECTOR_LIMIT}')
def parse_attribute_selector(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Create attribute selector from the returned regex match.

    Builds a `ct.SelectorAttribute` for the attribute name, namespace and
    comparison found in `m` and adds it to `sel`. The non-standard `!=`
    comparison is stored as a negated sub-selector instead. Always returns
    `True` (a selector was seen); the incoming `has_selector` is not consulted.
    """

    inverse = False
    op = m.group('cmp')
    case = util.lower(m.group('case')) if m.group('case') else None
    ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else ''
    attr = css_unescape(m.group('attr_name'))
    is_type = False
    pattern2 = None
    value = ''

    # An explicit `i`/`s` modifier decides case sensitivity; without one, the
    # `type` attribute is compared case-insensitively and everything else exactly.
    if case:
        flags = (re.I if case == 'i' else 0) | re.DOTALL
    elif util.lower(attr) == 'type':
        flags = re.I | re.DOTALL
        is_type = True
    else:
        flags = re.DOTALL

    if op:
        raw_value = m.group('value')
        if raw_value.startswith(('"', "'")):
            value = css_unescape(raw_value[1:-1], True)
        else:
            value = css_unescape(raw_value)

    if not op:
        # Attribute name
        pattern = None
    elif op.startswith('^'):
        # Value start with
        pattern = re.compile(r'^%s.*' % re.escape(value), flags)
    elif op.startswith('$'):
        # Value ends with
        pattern = re.compile(r'.*?%s$' % re.escape(value), flags)
    elif op.startswith('*'):
        # Value contains
        pattern = re.compile(r'.*?%s.*' % re.escape(value), flags)
    elif op.startswith('~'):
        # Value contains word within space separated list
        # `~=` should match nothing if it is empty or contains whitespace,
        # so if either of these cases is present, use `[^\s\S]` which cannot be matched.
        value = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value)
        pattern = re.compile(r'.*?(?:(?<=^)|(?<=[ \t\r\n\f]))%s(?=(?:[ \t\r\n\f]|$)).*' % value, flags)
    elif op.startswith('|'):
        # Value starts with word in dash separated list
        pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
    else:
        # Value matches
        pattern = re.compile(r'^%s$' % re.escape(value), flags)
        if op.startswith('!'):
            # Equivalent to `:not([attr=value])`
            inverse = True

    if is_type and pattern:
        # Case-sensitive twin of the `type` pattern. It must differ from the
        # primary pattern only in case sensitivity, so `re.DOTALL` is kept:
        # without it `.` stops at newlines and the two patterns disagree on
        # values that contain line breaks.
        pattern2 = re.compile(pattern.pattern, re.DOTALL)

    # Append the attribute selector
    sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2)
    if inverse:
        # If we are using `!=`, we need to nest the pattern under a `:not()`.
        sub_sel = _Selector()
        sub_sel.attributes.append(sel_attr)
        not_list = ct.SelectorList([sub_sel.freeze()], True, False)
        sel.selectors.append(not_list)
    else:
        sel.attributes.append(sel_attr)

    has_selector = True
    return has_selector
def parse_tag_pattern(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """Set `sel.tag` from the namespace prefix and tag name captured in `m`; returns `True`."""

    ns_part = m.group('tag_ns')
    # The captured namespace ends with the `|` separator, which is dropped.
    prefix = css_unescape(ns_part[:-1]) if ns_part else None
    tag_name = css_unescape(m.group('tag_name'))
    sel.tag = ct.SelectorTag(tag_name, prefix)
    return True
def parse_pseudo_class_custom(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Parse custom pseudo class alias.

    Custom selectors are compiled the first time they are referenced. While
    one is being compiled its entry is removed from the table, so a selector
    that refers back to itself is reported as undefined instead of recursing
    forever. Raises `SelectorSyntaxError` for an unknown name.
    """

    name = util.lower(css_unescape(m.group('name')))
    entry = self.custom.get(name)
    if entry is None:
        raise SelectorSyntaxError(
            f"Undefined custom selector '{name}' found at position {m.end(0)}",
            self.pattern,
            m.end(0)
        )

    if not isinstance(entry, ct.SelectorList):
        # Still source text: compile it with the entry temporarily removed.
        del self.custom[name]
        sub_parser = CSSParser(entry, custom=self.custom, flags=self.flags)
        entry = sub_parser.process_selectors(flags=FLG_PSEUDO)
        self.custom[name] = entry

    self.count += entry.count
    self.check_count()

    sel.selectors.append(entry)
    return True
def parse_pseudo_class(
    self,
    sel: _Selector,
    m: Match[str],
    has_selector: bool,
    iselector: Iterator[tuple[str, Match[str]]],
    is_html: bool
) -> tuple[bool, bool]:
    """
    Parse a generic pseudo-class token.

    Returns the updated `(has_selector, is_html)` pair. Raises
    `SelectorSyntaxError` when the name is known but used in the wrong form
    (with or without parentheses), or when it is not supported at all.
    """

    name = util.lower(css_unescape(m.group('name')))
    has_args = bool(m.group('open'))

    if has_args and name in PSEUDO_COMPLEX:
        has_selector = self.parse_pseudo_open(sel, name, has_selector, iselector, m.end(0))
    elif not has_args and name in PSEUDO_SIMPLE:
        # Pseudo-classes that only set a flag on the selector.
        flag_bits = {
            ':root': ct.SEL_ROOT,
            ':defined': ct.SEL_DEFINED,
            ':scope': ct.SEL_SCOPE,
            ':empty': ct.SEL_EMPTY
        }
        # Structural pseudo-classes as `(of_type, last)` pairs for `SelectorNth`.
        nth_shapes = {
            ':first-child': ((False, False),),
            ':last-child': ((False, True),),
            ':first-of-type': ((True, False),),
            ':last-of-type': ((True, True),),
            ':only-child': ((False, False), (False, True)),
            ':only-of-type': ((True, False), (True, True))
        }

        if name in flag_bits:
            sel.flags |= flag_bits[name]
            if name == ':defined':
                is_html = True
        elif name in self.PSEUDO_SELECTORS:
            expanded = self.PSEUDO_SELECTORS[name]
            self.count += expanded.count
            self.check_count()
            sel.selectors.append(expanded)
        elif name in nth_shapes:
            for of_type, last in nth_shapes[name]:
                sel.nth.append(ct.SelectorNth(1, False, 0, of_type, last, ct.SelectorList()))
        has_selector = True
    elif has_args and name in PSEUDO_COMPLEX_NO_MATCH:
        # The argument is still parsed (and validated) but the result is discarded.
        self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
        sel.no_match = True
        has_selector = True
    elif not has_args and name in PSEUDO_SIMPLE_NO_MATCH:
        sel.no_match = True
        has_selector = True
    elif name in PSEUDO_SUPPORTED:
        raise SelectorSyntaxError(
            f"Invalid syntax for pseudo class '{name}'",
            self.pattern,
            m.start(0)
        )
    else:
        raise SelectorSyntaxError(
            f"'{name}' was detected as a pseudo-class and is either unsupported or invalid. "
            "If the syntax was not intended to be recognized as a pseudo-class, please escape the colon.",
            self.pattern,
            m.start(0)
        )

    return has_selector, is_html
def parse_pseudo_nth(
    self,
    sel: _Selector,
    m: Match[str],
    has_selector: bool,
    iselector: Iterator[tuple[str, Match[str]]]
) -> bool:
    """
    Parse an `:nth-*()` pseudo-class and append the resulting `ct.SelectorNth` to `sel.nth`.

    The `an+b` expression (or `even` / `odd`) is reduced to a step, an offset
    and a flag telling whether `n` was present. For the `*-child` forms an
    optional `of S` selector list is parsed as well. Returns `True`.
    """

    groups = m.groupdict()
    is_child = bool(groups.get('pseudo_nth_child'))
    name = util.lower(css_unescape(groups['name']))
    content = util.lower(groups.get('nth_child' if is_child else 'nth_type'))

    if content == 'even':
        # 2n
        step, offset, has_n = 2, 0, True
    elif content == 'odd':
        # 2n+1
        step, offset, has_n = 2, 1, True
    else:
        parts = cast(Match[str], RE_NTH.match(content))
        a = parts.group('a')
        has_n = a.endswith('n')
        if a.startswith('n'):
            # A bare `n` means a coefficient of one.
            magnitude = '1'
        elif has_n:
            magnitude = a[:-1]
        else:
            magnitude = a
        step_sign = '-' if parts.group('s1') == '-' else ''
        step = int(step_sign + magnitude, 10)

        b = parts.group('b')
        if b:
            offset_sign = '-' if parts.group('s2') == '-' else ''
            offset = int(offset_sign + b, 10)
        else:
            offset = 0

    if is_child:
        if m.group('of'):
            # Parse the rest of `of S`.
            of_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
        else:
            # Use default `*|*` for `of S`.
            of_sel = self.PSEUDO_SELECTORS['<nth-of-s>']
            self.count += of_sel.count
            self.check_count()
        if name == ':nth-child':
            sel.nth.append(ct.SelectorNth(step, has_n, offset, False, False, of_sel))
        elif name == ':nth-last-child':
            sel.nth.append(ct.SelectorNth(step, has_n, offset, False, True, of_sel))
    elif name == ':nth-of-type':
        sel.nth.append(ct.SelectorNth(step, has_n, offset, True, False, ct.SelectorList()))
    elif name == ':nth-last-of-type':
        sel.nth.append(ct.SelectorNth(step, has_n, offset, True, True, ct.SelectorList()))
    return True
def parse_pseudo_open(
    self,
    sel: _Selector,
    name: str,
    has_selector: bool,
    iselector: Iterator[tuple[str, Match[str]]],
    index: int
) -> bool:
    """
    Parse the selector list inside a pseudo-class such as `:is(`, `:not(` or `:has(`.

    The nested list is parsed with flags chosen by `name` and appended to
    `sel.selectors`. Returns `True`.
    """

    extra_flag = {
        ':not': FLG_NOT,
        ':has': FLG_RELATIVE,
        ':where': FLG_FORGIVE,
        ':is': FLG_FORGIVE
    }.get(name, 0)

    nested = self.parse_selectors(iselector, index, FLG_PSEUDO | FLG_OPEN | extra_flag)
    sel.selectors.append(nested)
    return True
def parse_has_combinator(
    self,
    sel: _Selector,
    m: Match[str],
    has_selector: bool,
    selectors: list[_Selector],
    rel_type: str,
    index: int
) -> tuple[bool, _Selector, str]:
    """
    Handle a combinator token while parsing a relative selector.

    Returns `(has_selector, sel, rel_type)`: `has_selector` is reset to
    `False`, `sel` is a fresh selector, and `rel_type` is the leading
    combinator (prefixed with `:`) for that next selector. Raises
    `SelectorSyntaxError` on consecutive non-whitespace combinators.
    """

    combinator = m.group('relation').strip() or WS_COMBINATOR

    if combinator == COMMA_COMBINATOR:
        # A comma closes the current relative selector and opens a new one.
        sel.rel_type = rel_type
        selectors[-1].relations.append(sel)
        selectors.append(_Selector())
        next_rel_type = ':' + WS_COMBINATOR
    else:
        if has_selector:
            # End the current selector and associate the leading combinator with this selector.
            sel.rel_type = rel_type
            selectors[-1].relations.append(sel)
        elif rel_type[1:] != WS_COMBINATOR:
            # It's impossible to have two whitespace combinators after each other as the patterns
            # will gobble up trailing whitespace. It is also impossible to have a whitespace
            # combinator after any other kind for the same reason. But we could have
            # multiple non-whitespace combinators. So if the current combinator is not a whitespace,
            # then we've hit the multiple combinator case, so we should fail.
            raise SelectorSyntaxError(
                f'The multiple combinators at position {index}',
                self.pattern,
                index
            )

        # Set the leading combinator for the next selector.
        next_rel_type = ':' + combinator

    return False, _Selector(), next_rel_type
def parse_combinator(
    self,
    sel: _Selector,
    m: Match[str],
    has_selector: bool,
    selectors: list[_Selector],
    relations: list[_Selector],
    is_pseudo: bool,
    is_forgive: bool,
    index: int
) -> tuple[bool, _Selector]:
    """
    Handle a combinator token in a regular (non relative) selector.

    Returns `(False, fresh selector)`. Raises `SelectorSyntaxError` when a
    combinator has no selector before it, except for an empty entry followed
    by a comma inside a forgiving pseudo-class.
    """

    combinator = m.group('relation').strip() or WS_COMBINATOR
    is_comma = combinator == COMMA_COMBINATOR

    if not has_selector:
        if not (is_forgive and is_comma):
            raise SelectorSyntaxError(
                f"The combinator '{combinator}' at position {index}, must have a selector before it",
                self.pattern,
                index
            )

        # If we are in a forgiving pseudo class, just make the selector a "no match"
        sel.no_match = True
        del relations[:]
        selectors.append(sel)
    elif is_comma:
        if not sel.tag and not is_pseudo:
            # Implied `*`
            sel.tag = ct.SelectorTag('*', None)
        sel.relations.extend(relations)
        selectors.append(sel)
        del relations[:]
    else:
        # The finished selector becomes the single pending relation of the next one.
        sel.relations.extend(relations)
        sel.rel_type = combinator
        del relations[:]
        relations.append(sel)

    return False, _Selector()
def parse_class_id(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Store a class or id token on `sel`.

    The full match text is inspected: a leading `.` means the unescaped remainder is added
    to `sel.classes`; anything else is added to `sel.ids`. The incoming `has_selector` value
    is not consulted; the method always returns `True`.
    """

    token = m.group(0)
    bucket = sel.classes if token.startswith('.') else sel.ids
    # Drop the one-character prefix before unescaping the name.
    bucket.append(css_unescape(token[1:]))
    return True
def parse_pseudo_contains(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Parse a contains-style pseudo-class and record it on `sel`.

    The lowercased, unescaped `name` group decides two things: the legacy `:contains`
    spelling triggers a `FutureWarning`, and `:-soup-contains-own` sets the "own" flag on
    the resulting `ct.SelectorContains`. The `values` group is unescaped, split with
    `RE_VALUES`, and each value is unescaped again (quoted values lose their quotes and are
    unescaped in string mode). Always returns `True`.
    """

    pseudo = util.lower(css_unescape(m.group('name')))
    if pseudo == ":contains":
        warnings.warn(  # noqa: B028
            "The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
            FutureWarning
        )

    values = css_unescape(m.group('values'))
    # Separator tokens (the `split` group) carry no value and are skipped.
    raw_values = (
        token.group('value')
        for token in RE_VALUES.finditer(values)
        if not token.group('split')
    )
    patterns = [
        css_unescape(raw[1:-1], True) if raw.startswith(("'", '"')) else css_unescape(raw)
        for raw in raw_values
    ]

    sel.contains.append(ct.SelectorContains(patterns, pseudo == ":-soup-contains-own"))
    return True
def parse_pseudo_lang(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Parse the value list of a language pseudo-class and record it on `sel`.

    The `values` group of `m` is split with `RE_VALUES`; each value is unescaped (quoted
    values have their surrounding quotes removed and are unescaped in string mode) and the
    collected list is appended to `sel.lang` as a `ct.SelectorLang`. Always returns `True`.
    """

    def decode(raw: str) -> str:
        # Quoted values: strip the quote characters and unescape as a string.
        if raw.startswith(('"', "'")):
            return css_unescape(raw[1:-1], True)
        return css_unescape(raw)

    patterns = [
        decode(token.group('value'))
        for token in RE_VALUES.finditer(m.group('values'))
        # Separator tokens (the `split` group) are not values.
        if not token.group('split')
    ]

    sel.lang.append(ct.SelectorLang(patterns))
    return True
def parse_pseudo_dir(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
    """
    Record a direction pseudo-class on `sel`.

    The `dir` group of `m` is lowercased; `ltr` adds `ct.SEL_DIR_LTR` to `sel.flags`, and
    any other value adds `ct.SEL_DIR_RTL`. Always returns `True`.
    """

    if util.lower(m.group('dir')) == 'ltr':
        sel.flags |= ct.SEL_DIR_LTR
    else:
        sel.flags |= ct.SEL_DIR_RTL
    return True
def parse_selectors(
    self,
    iselector: Iterator[tuple[str, Match[str]]],
    index: int = 0,
    flags: int = 0
) -> ct.SelectorList:
    """
    Consume tokens from `iselector` and assemble them into a `ct.SelectorList`.

    `index` is the pattern position used in error messages and is advanced to the end of
    each processed token. `flags` is a bit mask of `FLG_*` values that selects the parsing
    mode (open pseudo-class, relative selectors, forgiving lists, and so on).

    Raises `SelectorSyntaxError` for a missing selector, a misplaced tag name, an unmatched
    or unclosed pseudo-class, and `NotImplementedError` for at-rules and pseudo-elements.
    """

    # State for the selector piece currently being built and the list collected so far.
    sel = _Selector()
    selectors = []
    has_selector = False
    saw_close = False
    relations = []  # type: list[_Selector]
    rel_type = ":" + WS_COMBINATOR
    start_count = self.count

    # Decode the mode flags once.
    is_open = bool(flags & FLG_OPEN)
    is_pseudo = bool(flags & FLG_PSEUDO)
    is_relative = bool(flags & FLG_RELATIVE)
    is_not = bool(flags & FLG_NOT)
    is_html = bool(flags & FLG_HTML)
    is_default = bool(flags & FLG_DEFAULT)
    is_indeterminate = bool(flags & FLG_INDETERMINATE)
    is_in_range = bool(flags & FLG_IN_RANGE)
    is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
    is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
    is_forgive = bool(flags & FLG_FORGIVE)

    # Report which modes are active when debugging.
    if self.debug:  # pragma: no cover
        debug_lines = (
            (is_pseudo, ' is_pseudo: True'),
            (is_open, ' is_open: True'),
            (is_relative, ' is_relative: True'),
            (is_not, ' is_not: True'),
            (is_html, ' is_html: True'),
            (is_default, ' is_default: True'),
            (is_indeterminate, ' is_indeterminate: True'),
            (is_in_range, ' is_in_range: True'),
            (is_out_of_range, ' is_out_of_range: True'),
            (is_placeholder_shown, ' is_placeholder_shown: True'),
            (is_forgive, ' is_forgive: True'),
        )
        for enabled, text in debug_lines:
            if enabled:
                print(text)

    # The algorithm for relative selectors require an initial selector in the selector list
    if is_relative:
        selectors.append(_Selector())

    # Token kinds whose handlers all share the `(sel, m, has_selector) -> bool` shape.
    simple_handlers = {
        'pseudo_class_custom': self.parse_pseudo_class_custom,
        'pseudo_contains': self.parse_pseudo_contains,
        'pseudo_lang': self.parse_pseudo_lang,
        'attribute': self.parse_attribute_selector,
        'class': self.parse_class_id,
        'id': self.parse_class_id,
    }

    try:
        while True:
            key, m = next(iselector)

            # Every token except combinators and closing parentheses bumps the running count,
            # after which `check_count` is invoked.
            if key not in ('combine', 'pseudo_close'):
                self.count += 1
                self.check_count()

            handler = simple_handlers.get(key)
            if handler is not None:
                has_selector = handler(sel, m, has_selector)
            elif key == 'pseudo_close':
                if not has_selector:
                    if not is_forgive:
                        raise SelectorSyntaxError(
                            f"Expected a selector at position {m.start(0)}",
                            self.pattern,
                            m.start(0)
                        )
                    sel.no_match = True
                if not is_open:
                    raise SelectorSyntaxError(
                        f"Unmatched pseudo-class close at position {m.start(0)}",
                        self.pattern,
                        m.start(0)
                    )
                # Leave the loop without advancing `index`; this nested list is complete.
                saw_close = True
                break
            elif key == 'combine':
                if is_relative:
                    has_selector, sel, rel_type = self.parse_has_combinator(
                        sel, m, has_selector, selectors, rel_type, index
                    )
                else:
                    has_selector, sel = self.parse_combinator(
                        sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index
                    )
            elif key == 'tag':
                if has_selector:
                    raise SelectorSyntaxError(
                        f"Tag name found at position {m.start(0)} instead of at the start",
                        self.pattern,
                        m.start(0)
                    )
                has_selector = self.parse_tag_pattern(sel, m, has_selector)
            elif key == 'pseudo_class':
                has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
            elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
                has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector)
            elif key == 'pseudo_dir':
                has_selector = self.parse_pseudo_dir(sel, m, has_selector)
                # Currently only supports HTML
                is_html = True
            elif key == "amp":
                sel.flags |= ct.SEL_SCOPE
                has_selector = True
            elif key == "at_rule":
                raise NotImplementedError(f"At-rules found at position {m.start(0)}")
            elif key == 'pseudo_element':
                raise NotImplementedError(f"Pseudo-element found at position {m.start(0)}")

            index = m.end(0)
    except StopIteration:
        # Token stream exhausted.
        pass

    # An open pseudo-class must have seen its closing token before the tokens ran out.
    if is_open and not saw_close:
        raise SelectorSyntaxError(
            f"Unclosed pseudo-class at position {index}",
            self.pattern,
            index
        )

    if has_selector:
        # Finish the selector piece that was still in progress.
        if not (sel.tag or is_pseudo):
            # Implied `*`
            sel.tag = ct.SelectorTag('*', None)
        if is_relative:
            sel.rel_type = rel_type
            selectors[-1].relations.append(sel)
        else:
            sel.relations.extend(relations)
            relations.clear()
            selectors.append(sel)
    elif is_forgive and (not selectors or not relations):
        # Forgiving lists tolerate empty slots such as `:is()`; store a selector that matches nothing.
        sel.no_match = True
        relations.clear()
        selectors.append(sel)
        has_selector = True

    if not has_selector:
        # We will always need to finish a selector when `:has()` is used as it leads with combining.
        # May apply to others as well.
        raise SelectorSyntaxError(
            f'Expected a selector at position {index}',
            self.pattern,
            index
        )

    # Some patterns require additional logic, such as default. The flag is written onto the
    # last selector (replacing its flags) to tell the matcher what extra logic is required.
    trailing_flags = (
        (is_default, ct.SEL_DEFAULT),
        (is_indeterminate, ct.SEL_INDETERMINATE),
        (is_in_range, ct.SEL_IN_RANGE),
        (is_out_of_range, ct.SEL_OUT_OF_RANGE),
        (is_placeholder_shown, ct.SEL_PLACEHOLDER_SHOWN),
    )
    for enabled, flag in trailing_flags:
        if enabled:
            selectors[-1].flags = flag

    frozen = [s.freeze() for s in selectors]
    consumed = self.count - start_count
    return ct.SelectorList(frozen, is_not, is_html, consumed)
def selector_iter(self, pattern: str) -> Iterator[tuple[str, Match[str]]]:
    """
    Yield `(token name, match)` pairs for the tokens found in `pattern`.

    Scanning starts after whatever `RE_WS_BEGIN` matches and stops before whatever
    `RE_WS_END` matches. At each position the entries of `self.CSS_TOKENS` are tried in
    order and the first one that matches is yielded. If none matches, a
    `SelectorSyntaxError` is raised describing the offending character.
    """

    # Ignore whitespace and comments at start and end of pattern
    leading = RE_WS_BEGIN.search(pattern)
    trailing = RE_WS_END.search(pattern)
    index = leading.end(0) if leading else 0
    end = (trailing.start(0) if trailing else len(pattern)) - 1

    if self.debug:  # pragma: no cover
        print(f'## PARSING: {pattern!r}')

    while index <= end:
        for token_pattern in self.CSS_TOKENS:
            m = token_pattern.match(pattern, index, self.flags)
            if not m:
                continue
            name = token_pattern.get_name()
            if self.debug:  # pragma: no cover
                print(f"TOKEN: '{name}' --> {m.group(0)!r} at position {m.start(0)}")
            index = m.end(0)
            yield name, m
            break
        else:
            # Nothing matched here. If the character starts one of the known selector types,
            # report that selector type as malformed; otherwise report the invalid character.
            c = pattern[index]
            malformed = {
                '[': f"Malformed attribute selector at position {index}",
                '.': f"Malformed class selector at position {index}",
                '#': f"Malformed id selector at position {index}",
                ':': f"Malformed pseudo-class selector at position {index}",
            }
            msg = malformed.get(c, f"Invalid character {c!r} position {index}")
            raise SelectorSyntaxError(msg, self.pattern, index)

    if self.debug:  # pragma: no cover
        print('## END PARSING')
def process_selectors(self, index: int = 0, flags: int = 0) -> ct.SelectorList:
    """
    Tokenize `self.pattern` and parse the resulting tokens.

    The token iterator from `selector_iter` is handed to `parse_selectors` together with
    `index` and `flags`, and that call's result is returned unchanged.
    """

    tokens = self.selector_iter(self.pattern)
    return self.parse_selectors(tokens, index, flags)