# Lexer Implementation

from abc import abstractmethod, ABC
import re
from contextlib import suppress
from typing import (
    TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
    ClassVar, TYPE_CHECKING, overload
)
from types import ModuleType
import warnings
try:
    import interegular
except ImportError:
    pass
if TYPE_CHECKING:
    from .common import LexerConf
    from .parsers.lalr_parser_state import ParserState

from .utils import classify, get_regexp_width, Serialize, logger
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
from .grammar import TOKEN_DEFAULT_PRIORITY


###{standalone
from copy import copy

try:  # For the standalone parser, we need to make sure that has_interegular is False to avoid NameErrors later on
    has_interegular = bool(interegular)
except NameError:
    has_interegular = False

class Pattern(Serialize, ABC):
    "An abstraction over regular expressions."

    value: str
    flags: Collection[str]
    raw: Optional[str]
    type: ClassVar[str]

    def __init__(self, value: str, flags: Collection[str] = (), raw: Optional[str] = None) -> None:
        self.value = value
        self.flags = frozenset(flags)
        self.raw = raw

    def __repr__(self):
        return repr(self.to_regexp())

    # Pattern Hashing assumes all subclasses have a different priority!
    def __hash__(self):
        return hash((type(self), self.value, self.flags))

    def __eq__(self, other):
        return type(self) == type(other) and self.value == other.value and self.flags == other.flags

    @abstractmethod
    def to_regexp(self) -> str:
        raise NotImplementedError()

    @property
    @abstractmethod
    def min_width(self) -> int:
        raise NotImplementedError()

    @property
    @abstractmethod
    def max_width(self) -> int:
        raise NotImplementedError()

    def _get_flags(self, value):
        for f in self.flags:
            value = ('(?%s:%s)' % (f, value))
        return value
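# Illustrative sketch (not part of the module): how `_get_flags` embeds inline
# regexp flags. A PatternRE with flags {'i'} wraps its value in an inline group:
#
#     >>> PatternRE('a.b', flags=['i']).to_regexp()
#     '(?i:a.b)'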


class PatternStr(Pattern):
    __serialize_fields__ = 'value', 'flags', 'raw'

    type: ClassVar[str] = "str"

    def to_regexp(self) -> str:
        return self._get_flags(re.escape(self.value))

    @property
    def min_width(self) -> int:
        return len(self.value)

    @property
    def max_width(self) -> int:
        return len(self.value)


class PatternRE(Pattern):
    __serialize_fields__ = 'value', 'flags', 'raw', '_width'

    type: ClassVar[str] = "re"

    def to_regexp(self) -> str:
        return self._get_flags(self.value)

    _width = None
    def _get_width(self):
        if self._width is None:
            self._width = get_regexp_width(self.to_regexp())
        return self._width

    @property
    def min_width(self) -> int:
        return self._get_width()[0]

    @property
    def max_width(self) -> int:
        return self._get_width()[1]
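# Illustrative sketch: the two Pattern subclasses report their match widths.
# A PatternStr's width is simply the literal's length, while a PatternRE asks
# get_regexp_width() to analyze the expression:
#
#     >>> PatternStr('if').min_width, PatternStr('if').max_width
#     (2, 2)
#     >>> PatternStr('a+b').to_regexp()    # literals are escaped
#     'a\\+b'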


class TerminalDef(Serialize):
    "A definition of a terminal"
    __serialize_fields__ = 'name', 'pattern', 'priority'
    __serialize_namespace__ = PatternStr, PatternRE

    name: str
    pattern: Pattern
    priority: int

    def __init__(self, name: str, pattern: Pattern, priority: int = TOKEN_DEFAULT_PRIORITY) -> None:
        assert isinstance(pattern, Pattern), pattern
        self.name = name
        self.pattern = pattern
        self.priority = priority

    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)

    def user_repr(self) -> str:
        if self.name.startswith('__'):  # We represent a generated terminal
            return self.pattern.raw or self.name
        else:
            return self.name
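# Illustrative sketch: a terminal as the grammar loader would define it
# (the names and priority below are made up for the example):
#
#     >>> plus = TerminalDef('PLUS', PatternStr('+'))
#     >>> name = TerminalDef('NAME', PatternRE('[a-z]+'), priority=1)
#     >>> plus
#     TerminalDef('PLUS', '\\+')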

_T = TypeVar('_T', bound="Token")

class Token(str):
    """A string with meta-information, that is produced by the lexer.

    When parsing text, the resulting chunks of the input that haven't been discarded
    will end up in the tree as Token instances. The Token class inherits from Python's ``str``,
    so normal string comparisons and operations will work as expected.

    Attributes:
        type: Name of the token (as specified in grammar)
        value: Value of the token (redundant, as ``token.value == token`` will always be true)
        start_pos: The index of the token in the text
        line: The line of the token in the text (starting with 1)
        column: The column of the token in the text (starting with 1)
        end_line: The line where the token ends
        end_column: The next column after the end of the token. For example,
            if the token is a single character with a column value of 4,
            end_column will be 5.
        end_pos: the index where the token ends (basically ``start_pos + len(token)``)
    """
    __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

    __match_args__ = ('type', 'value')

    type: str
    start_pos: Optional[int]
    value: Any
    line: Optional[int]
    column: Optional[int]
    end_line: Optional[int]
    end_column: Optional[int]
    end_pos: Optional[int]


    @overload
    def __new__(
            cls,
            type: str,
            value: Any,
            start_pos: Optional[int] = None,
            line: Optional[int] = None,
            column: Optional[int] = None,
            end_line: Optional[int] = None,
            end_column: Optional[int] = None,
            end_pos: Optional[int] = None
    ) -> 'Token':
        ...

    @overload
    def __new__(
            cls,
            type_: str,
            value: Any,
            start_pos: Optional[int] = None,
            line: Optional[int] = None,
            column: Optional[int] = None,
            end_line: Optional[int] = None,
            end_column: Optional[int] = None,
            end_pos: Optional[int] = None
    ) -> 'Token': ...

    def __new__(cls, *args, **kwargs):
        if "type_" in kwargs:
            warnings.warn("`type_` is deprecated; use `type` instead", DeprecationWarning)

            if "type" in kwargs:
                raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.")
            kwargs["type"] = kwargs.pop("type_")

        return cls._future_new(*args, **kwargs)


    @classmethod
    def _future_new(cls, type, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
        inst = super(Token, cls).__new__(cls, value)

        inst.type = type
        inst.start_pos = start_pos
        inst.value = value
        inst.line = line
        inst.column = column
        inst.end_line = end_line
        inst.end_column = end_column
        inst.end_pos = end_pos
        return inst

    @overload
    def update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
        ...

    @overload
    def update(self, type_: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
        ...

    def update(self, *args, **kwargs):
        if "type_" in kwargs:
            warnings.warn("`type_` is deprecated; use `type` instead", DeprecationWarning)

            if "type" in kwargs:
                raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.")
            kwargs["type"] = kwargs.pop("type_")

        return self._future_update(*args, **kwargs)

    def _future_update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
        return Token.new_borrow_pos(
            type if type is not None else self.type,
            value if value is not None else self.value,
            self
        )

    @classmethod
    def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') -> _T:
        return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)

    def __reduce__(self):
        return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column))

    def __repr__(self):
        return 'Token(%r, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.start_pos, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False

        return str.__eq__(self, other)

    __hash__ = str.__hash__
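# Illustrative sketch: Token behaves like a str, but equality is also
# type-sensitive when both sides are Tokens. The names below are made up:
#
#     >>> t = Token('NUMBER', '42', start_pos=0, line=1, column=1)
#     >>> t == '42'                       # plain-string comparison still works
#     True
#     >>> t == Token('STRING', '42')      # same text, different terminal type
#     False
#     >>> t.update(value='43')            # copy with new value, same position info
#     Token('NUMBER', '43')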


class LineCounter:
    "A utility class for keeping track of line & column information"

    __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'

    def __init__(self, newline_char):
        self.newline_char = newline_char
        self.char_pos = 0
        self.line = 1
        self.column = 1
        self.line_start_pos = 0

    def __eq__(self, other):
        if not isinstance(other, LineCounter):
            return NotImplemented

        return self.char_pos == other.char_pos and self.newline_char == other.newline_char

    def feed(self, token: Token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos + 1
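# Illustrative sketch of feed(): positions advance per character, and the
# column is recomputed from the start of the last line seen:
#
#     >>> lc = LineCounter('\n')
#     >>> lc.feed('ab\ncd')
#     >>> lc.line, lc.column, lc.char_pos
#     (2, 3, 5)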


class UnlessCallback:
    def __init__(self, scanner):
        self.scanner = scanner

    def __call__(self, t):
        res = self.scanner.match(t.value, 0)
        if res:
            _value, t.type = res
        return t


class CallChain:
    def __init__(self, callback1, callback2, cond):
        self.callback1 = callback1
        self.callback2 = callback2
        self.cond = cond

    def __call__(self, t):
        t2 = self.callback1(t)
        return self.callback2(t) if self.cond(t2) else t2


def _get_match(re_, regexp, s, flags):
    m = re_.match(regexp, s, flags)
    if m:
        return m.group(0)

def _create_unless(terminals, g_regex_flags, re_, use_bytes):
    tokens_by_type = classify(terminals, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = []
        for strtok in tokens_by_type.get(PatternStr, []):
            if strtok.priority != retok.priority:
                continue
            s = strtok.pattern.value
            if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags):
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

    new_terminals = [t for t in terminals if t not in embedded_strs]
    return new_terminals, callback
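# Illustrative sketch of what _create_unless accomplishes (terminal names are
# made up): given NAME = /[a-z]+/ and the keyword IF = "if", the literal "if"
# fully matches NAME's regexp, so IF is folded into NAME, and an UnlessCallback
# re-checks every NAME match, retyping it as IF when the whole value is "if":
#
#     >>> name = TerminalDef('NAME', PatternRE('[a-z]+'))
#     >>> if_ = TerminalDef('IF', PatternStr('if'))
#     >>> new_terminals, callbacks = _create_unless([name, if_], 0, re, use_bytes=False)
#     >>> [t.name for t in new_terminals], sorted(callbacks)
#     (['NAME'], ['NAME'])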


class Scanner:
    def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
        self.terminals = terminals
        self.g_regex_flags = g_regex_flags
        self.re_ = re_
        self.use_bytes = use_bytes
        self.match_whole = match_whole

        self.allowed_types = {t.name for t in self.terminals}

        self._mres = self._build_mres(terminals, len(terminals))

    def _build_mres(self, terminals, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module.
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries fewer and fewer groups until it succeeds.
        postfix = '$' if self.match_whole else ''
        mres = []
        while terminals:
            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
            if self.use_bytes:
                pattern = pattern.encode('latin-1')
            try:
                mre = self.re_.compile(pattern, self.g_regex_flags)
            except AssertionError:  # Yes, this is what Python provides us.. :/
                return self._build_mres(terminals, max_size // 2)

            mres.append(mre)
            terminals = terminals[max_size:]
        return mres

    def match(self, text, pos):
        for mre in self._mres:
            m = mre.match(text, pos)
            if m:
                return m.group(0), m.lastgroup
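# Illustrative sketch: the Scanner compiles all terminals into one alternation
# of named groups, so a single re.match identifies both the matched value and
# the terminal name (via lastgroup). Terminal names are made up:
#
#     >>> sc = Scanner([TerminalDef('NUMBER', PatternRE('[0-9]+')),
#     ...               TerminalDef('NAME', PatternRE('[a-z]+'))],
#     ...              g_regex_flags=0, re_=re, use_bytes=False)
#     >>> sc.match('foo123', 0)
#     ('foo', 'NAME')
#     >>> sc.match('123foo', 0)
#     ('123', 'NUMBER')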


def _regexp_has_newline(r: str):
    r"""Expressions that may indicate newlines in a regexp:
    - newlines (\n)
    - escaped newline (\\n)
    - anything but ([^...])
    - any-char (.) when the flag (?s) exists
    - spaces (\s)
    """
    return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
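# Illustrative sketch of the heuristic. It may over-approximate, which is safe
# here: a false positive only costs an extra newline check while lexing:
#
#     >>> _regexp_has_newline(r'\n')     # escaped newline in the pattern source
#     True
#     >>> _regexp_has_newline('[^"]+')   # negated class can span lines
#     True
#     >>> _regexp_has_newline('[a-z]+')
#     False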


class LexerState:
    """Represents the current state of the lexer as it scans the text
    (Lexer objects are only instantiated per grammar, not per text)
    """

    __slots__ = 'text', 'line_ctr', 'last_token'

    text: str
    line_ctr: LineCounter
    last_token: Optional[Token]

    def __init__(self, text: str, line_ctr: Optional[LineCounter] = None, last_token: Optional[Token] = None):
        self.text = text
        self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n')
        self.last_token = last_token

    def __eq__(self, other):
        if not isinstance(other, LexerState):
            return NotImplemented

        return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token

    def __copy__(self):
        return type(self)(self.text, copy(self.line_ctr), self.last_token)


class LexerThread:
    """A thread that ties a lexer instance and a lexer state, to be used by the parser
    """

    def __init__(self, lexer: 'Lexer', lexer_state: LexerState):
        self.lexer = lexer
        self.state = lexer_state

    @classmethod
    def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread':
        return cls(lexer, LexerState(text))

    def lex(self, parser_state):
        return self.lexer.lex(self.state, parser_state)

    def __copy__(self):
        return type(self)(self.lexer, copy(self.state))

    _Token = Token
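# Illustrative sketch: the parser drives lexing through a LexerThread, which
# pairs an immutable lexer with the mutable scan position. Here `lexer` stands
# for any concrete Lexer instance (e.g. a BasicLexer):
#
#     >>> thread = LexerThread.from_text(lexer, 'x = 1')
#     >>> tokens = list(thread.lex(parser_state=None))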


_Callback = Callable[[Token], Token]

class Lexer(ABC):
    """Lexer interface

    Method Signatures:
        lex(self, lexer_state, parser_state) -> Iterator[Token]
    """
    @abstractmethod
    def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
        return NotImplemented

    def make_lexer_state(self, text):
        "Deprecated"
        return LexerState(text)


def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8):
    if not comparator:
        comparator = interegular.Comparator.from_regexes(terminal_to_regexp)

    # When in strict mode, we only ever try to provide one example, so taking
    # a long time for that should be fine
    max_time = 2 if strict_mode else 0.2

    # We don't want to show too many collisions.
    if comparator.count_marked_pairs() >= max_collisions_to_show:
        return
    for group in classify(terminal_to_regexp, lambda t: t.priority).values():
        for a, b in comparator.check(group, skip_marked=True):
            assert a.priority == b.priority
            # Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision
            comparator.mark(a, b)

            # Notify the user
            message = f"Collision between Terminals {a.name} and {b.name}. "
            try:
                example = comparator.get_example_overlap(a, b, max_time).format_multiline()
            except ValueError:
                # Couldn't find an example within max_time steps.
                example = "No example could be found fast enough. However, the collision does still exist"
            if strict_mode:
                raise LexError(f"{message}\n{example}")
            logger.warning("%s The lexer will choose between them arbitrarily.\n%s", message, example)
            if comparator.count_marked_pairs() >= max_collisions_to_show:
                logger.warning("Found 8 regex collisions, will not check for more.")
                return
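# Illustrative sketch of the kind of collision this detects (terminal names
# are made up, and interegular must be installed): HEX = /[0-9a-f]+/ and
# INT = /[0-9]+/ both accept "123", so at equal priority the lexer's choice
# between them is arbitrary and a warning is logged (or a LexError raised in
# strict mode):
#
#     >>> hexd = TerminalDef('HEX', PatternRE('[0-9a-f]+'))
#     >>> intd = TerminalDef('INT', PatternRE('[0-9]+'))
#     >>> _check_regex_collisions({t: t.pattern.to_regexp() for t in (hexd, intd)},
#     ...                         comparator=None, strict_mode=False)
#     # logs something like: "Collision between Terminals HEX and INT. ..."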


class AbstractBasicLexer(Lexer):
    terminals_by_name: Dict[str, TerminalDef]

    @abstractmethod
    def __init__(self, conf: 'LexerConf', comparator=None) -> None:
        ...

    @abstractmethod
    def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        ...

    def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]:
        with suppress(EOFError):
            while True:
                yield self.next_token(state, parser_state)


class BasicLexer(AbstractBasicLexer):
    terminals: Collection[TerminalDef]
    ignore_types: FrozenSet[str]
    newline_types: FrozenSet[str]
    user_callbacks: Dict[str, _Callback]
    callback: Dict[str, _Callback]
    re: ModuleType

    def __init__(self, conf: 'LexerConf', comparator=None) -> None:
        terminals = list(conf.terminals)
        assert all(isinstance(t, TerminalDef) for t in terminals), terminals

        self.re = conf.re_module

        if not conf.skip_validation:
            # Sanitization
            terminal_to_regexp = {}
            for t in terminals:
                regexp = t.pattern.to_regexp()
                try:
                    self.re.compile(regexp, conf.g_regex_flags)
                except self.re.error:
                    raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

                if t.pattern.min_width == 0:
                    raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
                if t.pattern.type == "re":
                    terminal_to_regexp[t] = regexp

            if not (set(conf.ignore) <= {t.name for t in terminals}):
                raise LexError("Ignore terminals are not defined: %s" % (set(conf.ignore) - {t.name for t in terminals}))

            if has_interegular:
                _check_regex_collisions(terminal_to_regexp, comparator, conf.strict)
            elif conf.strict:
                raise LexError("interegular must be installed for strict mode. Use `pip install 'lark[interegular]'`.")

        # Init
        self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
        self.ignore_types = frozenset(conf.ignore)

        terminals.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
        self.terminals = terminals
        self.user_callbacks = conf.callbacks
        self.g_regex_flags = conf.g_regex_flags
        self.use_bytes = conf.use_bytes
        self.terminals_by_name = conf.terminals_by_name

        self._scanner = None

    def _build_scanner(self):
        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
        assert all(self.callback.values())

        for type_, f in self.user_callbacks.items():
            if type_ in self.callback:
                # Already a callback there, probably UnlessCallback
                self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
            else:
                self.callback[type_] = f

        self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)

    @property
    def scanner(self):
        if self._scanner is None:
            self._build_scanner()
        return self._scanner

    def match(self, text, pos):
        return self.scanner.match(text, pos)

    def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        line_ctr = lex_state.line_ctr
        while line_ctr.char_pos < len(lex_state.text):
            res = self.match(lex_state.text, line_ctr.char_pos)
            if not res:
                allowed = self.scanner.allowed_types - self.ignore_types
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
                                           state=parser_state, terminals_by_name=self.terminals_by_name)

            value, type_ = res

            ignored = type_ in self.ignore_types
            t = None
            if not ignored or type_ in self.callback:
                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
            line_ctr.feed(value, type_ in self.newline_types)
            if t is not None:
                t.end_line = line_ctr.line
                t.end_column = line_ctr.column
                t.end_pos = line_ctr.char_pos
                if t.type in self.callback:
                    t = self.callback[t.type](t)
                if not ignored:
                    if not isinstance(t, Token):
                        raise LexError("Callbacks must return a token (returned %r)" % t)
                    lex_state.last_token = t
                    return t

        # EOF
        raise EOFError(self)
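# Illustrative sketch of BasicLexer in action through the public API (the
# grammar below is made up; Lark builds the LexerConf from it, and the EQUAL
# name is auto-generated by Lark for the anonymous "=" literal). BasicLexer
# scans greedily, discards ignored terminals, and tracks positions:
#
#     >>> from lark import Lark
#     >>> l = Lark(r'''
#     ... start: NAME "=" NUMBER
#     ... NAME: /[a-z]+/
#     ... NUMBER: /[0-9]+/
#     ... %ignore " "
#     ... ''', parser='lalr', lexer='basic')
#     >>> [(t.type, str(t)) for t in l.lex('x = 1')]
#     [('NAME', 'x'), ('EQUAL', '='), ('NUMBER', '1')]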


class ContextualLexer(Lexer):
    lexers: Dict[int, AbstractBasicLexer]
    root_lexer: AbstractBasicLexer

    BasicLexer: Type[AbstractBasicLexer] = BasicLexer

    def __init__(self, conf: 'LexerConf', states: Dict[int, Collection[str]], always_accept: Collection[str] = ()) -> None:
        terminals = list(conf.terminals)
        terminals_by_name = conf.terminals_by_name

        trad_conf = copy(conf)
        trad_conf.terminals = terminals

        if has_interegular and not conf.skip_validation:
            comparator = interegular.Comparator.from_regexes({t: t.pattern.to_regexp() for t in terminals})
        else:
            comparator = None
        lexer_by_tokens: Dict[FrozenSet[str], AbstractBasicLexer] = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(conf.ignore) | set(always_accept)
                lexer_conf = copy(trad_conf)
                lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
                lexer = self.BasicLexer(lexer_conf, comparator)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        assert trad_conf.terminals is terminals
        trad_conf.skip_validation = True  # We don't need to verify all terminals again
        self.root_lexer = self.BasicLexer(trad_conf, comparator)

    def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[Token]:
        try:
            while True:
                lexer = self.lexers[parser_state.position]
                yield lexer.next_token(lexer_state, parser_state)
        except EOFError:
            pass
        except UnexpectedCharacters as e:
            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
            # This tests the input against the global context, to provide a nicer error.
            try:
                last_token = lexer_state.last_token  # Save last_token. Calling root_lexer.next_token will change this to the wrong token
                token = self.root_lexer.next_token(lexer_state, parser_state)
                raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name)
            except UnexpectedCharacters:
                raise e  # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.
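# Illustrative sketch of the idea behind the contextual lexer (state ids and
# terminal names are made up): each LALR parser state only accepts a subset of
# terminals, so each state gets a BasicLexer restricted to that subset, and
# ambiguous text is resolved by parsing context:
#
#     states = {0: ['NAME', 'IF'],   # at state 0, "if" could be a keyword
#               1: ['NAME']}         # at state 1, only identifiers are legal
#     lexer = ContextualLexer(conf, states)     # conf: a LexerConf
#     # While lexing, lex() picks lexer.lexers[parser_state.position] for each
#     # token, so the same input text can yield different terminals depending
#     # on where the parser is.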

###}