Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/lark/lexer.py: 70%

456 statements

# Lexer Implementation

from abc import abstractmethod, ABC
import re
from typing import (
    TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
    ClassVar, TYPE_CHECKING, overload
)
from types import ModuleType
import warnings
try:
    import interegular
except ImportError:
    pass
if TYPE_CHECKING:
    from .common import LexerConf
    from .parsers.lalr_parser_state import ParserState

from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
from .grammar import TOKEN_DEFAULT_PRIORITY


###{standalone
from contextlib import suppress
from copy import copy

try: # For the standalone parser, we need to make sure that has_interegular is False to avoid NameErrors later on
    has_interegular = bool(interegular)
except NameError:
    has_interegular = False

class Pattern(Serialize, ABC):
    "An abstraction over regular expressions."

    value: str
    flags: Collection[str]
    raw: Optional[str]
    type: ClassVar[str]

    def __init__(self, value: str, flags: Collection[str] = (), raw: Optional[str] = None) -> None:
        self.value = value
        self.flags = frozenset(flags)
        self.raw = raw

    def __repr__(self):
        return repr(self.to_regexp())

    # Pattern Hashing assumes all subclasses have a different priority!
    def __hash__(self):
        return hash((type(self), self.value, self.flags))

    def __eq__(self, other):
        return type(self) == type(other) and self.value == other.value and self.flags == other.flags

    @abstractmethod
    def to_regexp(self) -> str:
        raise NotImplementedError()

    @property
    @abstractmethod
    def min_width(self) -> int:
        raise NotImplementedError()

    @property
    @abstractmethod
    def max_width(self) -> int:
        raise NotImplementedError()

    def _get_flags(self, value):
        for f in self.flags:
            value = ('(?%s:%s)' % (f, value))
        return value


class PatternStr(Pattern):
    __serialize_fields__ = 'value', 'flags', 'raw'

    type: ClassVar[str] = "str"

    def to_regexp(self) -> str:
        return self._get_flags(re.escape(self.value))

    @property
    def min_width(self) -> int:
        return len(self.value)

    @property
    def max_width(self) -> int:
        return len(self.value)


class PatternRE(Pattern):
    __serialize_fields__ = 'value', 'flags', 'raw', '_width'

    type: ClassVar[str] = "re"

    def to_regexp(self) -> str:
        return self._get_flags(self.value)

    _width = None
    def _get_width(self):
        if self._width is None:
            self._width = get_regexp_width(self.to_regexp())
        return self._width

    @property
    def min_width(self) -> int:
        return self._get_width()[0]

    @property
    def max_width(self) -> int:
        return self._get_width()[1]
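
# Illustrative sketch (assumed example values; not part of lexer.py): PatternStr escapes its
# literal before use, PatternRE passes its value through, and flags become inline (?f:...) groups.
#
#     PatternStr('a+b').to_regexp()               # -> 'a\\+b'
#     PatternRE(r'\d+', flags=['i']).to_regexp()  # -> '(?i:\\d+)'
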

class TerminalDef(Serialize):
    "A definition of a terminal"
    __serialize_fields__ = 'name', 'pattern', 'priority'
    __serialize_namespace__ = PatternStr, PatternRE

    name: str
    pattern: Pattern
    priority: int

    def __init__(self, name: str, pattern: Pattern, priority: int = TOKEN_DEFAULT_PRIORITY) -> None:
        assert isinstance(pattern, Pattern), pattern
        self.name = name
        self.pattern = pattern
        self.priority = priority

    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)

    def user_repr(self) -> str:
        if self.name.startswith('__'): # We represent a generated terminal
            return self.pattern.raw or self.name
        else:
            return self.name

_T = TypeVar('_T', bound="Token")

class Token(str):
    """A string with meta-information, that is produced by the lexer.

    When parsing text, the resulting chunks of the input that haven't been discarded,
    will end up in the tree as Token instances. The Token class inherits from Python's ``str``,
    so normal string comparisons and operations will work as expected.

    Attributes:
        type: Name of the token (as specified in grammar)
        value: Value of the token (redundant, as ``token.value == token`` will always be true)
        start_pos: The index of the token in the text
        line: The line of the token in the text (starting with 1)
        column: The column of the token in the text (starting with 1)
        end_line: The line where the token ends
        end_column: The next column after the end of the token. For example,
            if the token is a single character with a column value of 4,
            end_column will be 5.
        end_pos: the index where the token ends (basically ``start_pos + len(token)``)
    """
    __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

    __match_args__ = ('type', 'value')

    type: str
    start_pos: Optional[int]
    value: Any
    line: Optional[int]
    column: Optional[int]
    end_line: Optional[int]
    end_column: Optional[int]
    end_pos: Optional[int]


    @overload
    def __new__(
        cls,
        type: str,
        value: Any,
        start_pos: Optional[int] = None,
        line: Optional[int] = None,
        column: Optional[int] = None,
        end_line: Optional[int] = None,
        end_column: Optional[int] = None,
        end_pos: Optional[int] = None
    ) -> 'Token':
        ...

    @overload
    def __new__(
        cls,
        type_: str,
        value: Any,
        start_pos: Optional[int] = None,
        line: Optional[int] = None,
        column: Optional[int] = None,
        end_line: Optional[int] = None,
        end_column: Optional[int] = None,
        end_pos: Optional[int] = None
    ) -> 'Token': ...

    def __new__(cls, *args, **kwargs):
        if "type_" in kwargs:
            warnings.warn("`type_` is deprecated use `type` instead", DeprecationWarning)

            if "type" in kwargs:
                raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.")
            kwargs["type"] = kwargs.pop("type_")

        return cls._future_new(*args, **kwargs)


    @classmethod
    def _future_new(cls, type, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
        inst = super(Token, cls).__new__(cls, value)

        inst.type = type
        inst.start_pos = start_pos
        inst.value = value
        inst.line = line
        inst.column = column
        inst.end_line = end_line
        inst.end_column = end_column
        inst.end_pos = end_pos
        return inst

    @overload
    def update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
        ...

    @overload
    def update(self, type_: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
        ...

    def update(self, *args, **kwargs):
        if "type_" in kwargs:
            warnings.warn("`type_` is deprecated use `type` instead", DeprecationWarning)

            if "type" in kwargs:
                raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.")
            kwargs["type"] = kwargs.pop("type_")

        return self._future_update(*args, **kwargs)

    def _future_update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
        return Token.new_borrow_pos(
            type if type is not None else self.type,
            value if value is not None else self.value,
            self
        )

    @classmethod
    def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') -> _T:
        return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)

    def __reduce__(self):
        return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column))

    def __repr__(self):
        return 'Token(%r, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.start_pos, self.line, self.column)

    def __eq__(self, other):
        if isinstance(other, Token) and self.type != other.type:
            return False

        return str.__eq__(self, other)

    __hash__ = str.__hash__
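
# Illustrative sketch (assumed example values; not part of lexer.py): a Token compares like
# its string value but also carries the terminal name and the position info set by the lexer.
#
#     tok = Token('NUMBER', '42', start_pos=0, line=1, column=1)
#     tok == '42'             # True - plain str comparison
#     tok.type                # 'NUMBER'
#     tok.update(value='43')  # a new Token that borrows tok's position info via new_borrow_pos
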

class LineCounter:
    "A utility class for keeping track of line & column information"

    __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'

    def __init__(self, newline_char):
        self.newline_char = newline_char
        self.char_pos = 0
        self.line = 1
        self.column = 1
        self.line_start_pos = 0

    def __eq__(self, other):
        if not isinstance(other, LineCounter):
            return NotImplemented

        return self.char_pos == other.char_pos and self.newline_char == other.newline_char

    def feed(self, token: TextOrSlice, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos + 1
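
# Illustrative sketch (assumed input; not part of lexer.py): feed() advances char_pos and
# recomputes line/column relative to the last newline it has seen.
#
#     lc = LineCounter('\n')
#     lc.feed('ab\ncd')
#     (lc.line, lc.column, lc.char_pos)   # -> (2, 3, 5)
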

class UnlessCallback:
    def __init__(self, scanner: 'Scanner'):
        self.scanner = scanner

    def __call__(self, t: Token):
        res = self.scanner.fullmatch(t.value)
        if res is not None:
            t.type = res
        return t


class CallChain:
    def __init__(self, callback1, callback2, cond):
        self.callback1 = callback1
        self.callback2 = callback2
        self.cond = cond

    def __call__(self, t):
        t2 = self.callback1(t)
        return self.callback2(t) if self.cond(t2) else t2


def _get_match(re_, regexp, s, flags):
    m = re_.match(regexp, s, flags)
    if m:
        return m.group(0)

def _create_unless(terminals, g_regex_flags, re_, use_bytes):
    tokens_by_type = classify(terminals, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(PatternRE, []):
        unless = []
        for strtok in tokens_by_type.get(PatternStr, []):
            if strtok.priority != retok.priority:
                continue
            s = strtok.pattern.value
            if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags):
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, use_bytes=use_bytes))

    new_terminals = [t for t in terminals if t not in embedded_strs]
    return new_terminals, callback
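
# Illustrative sketch (assumed terminal definitions; not part of lexer.py): when a literal
# terminal like IF ("if") is also matched by a broader regexp terminal like NAME (\w+) at the
# same priority, _create_unless folds IF into an UnlessCallback on NAME, which retypes an
# exact "if" match back to IF.
#
#     import re as _re
#     _terms = [TerminalDef('NAME', PatternRE(r'\w+')), TerminalDef('IF', PatternStr('if'))]
#     _new_terms, _callbacks = _create_unless(_terms, g_regex_flags=0, re_=_re, use_bytes=False)
#     # _new_terms keeps only NAME; _callbacks == {'NAME': <UnlessCallback>}
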

class Scanner:
    def __init__(self, terminals, g_regex_flags, re_, use_bytes):
        self.terminals = terminals
        self.g_regex_flags = g_regex_flags
        self.re_ = re_
        self.use_bytes = use_bytes

        self.allowed_types = {t.name for t in self.terminals}

        self._mres = self._build_mres(terminals, len(terminals))

    def _build_mres(self, terminals, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries less and less groups until it's successful.
        mres = []
        while terminals:
            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()) for t in terminals[:max_size])
            if self.use_bytes:
                pattern = pattern.encode('latin-1')
            try:
                mre = self.re_.compile(pattern, self.g_regex_flags)
            except AssertionError: # Yes, this is what Python provides us.. :/
                return self._build_mres(terminals, max_size // 2)

            mres.append(mre)
            terminals = terminals[max_size:]
        return mres

    def match(self, text: TextSlice, pos):
        for mre in self._mres:
            m = mre.match(text.text, pos, text.end)
            if m:
                return m.group(0), m.lastgroup


    def fullmatch(self, text: str) -> Optional[str]:
        for mre in self._mres:
            m = mre.fullmatch(text)
            if m:
                return m.lastgroup
        return None
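
# Illustrative sketch (assumed values, and assuming TextSlice(text, start, end); not part of
# lexer.py): the Scanner joins the terminal patterns into one alternation of named groups
# (split across several compiled regexes if the group limit is hit) and reports which
# terminal matched.
#
#     import re as _re
#     _terms = [TerminalDef('NUMBER', PatternRE(r'\d+')), TerminalDef('PLUS', PatternStr('+'))]
#     _sc = Scanner(_terms, g_regex_flags=0, re_=_re, use_bytes=False)
#     _sc.match(TextSlice('12+3', 0, 4), 0)   # -> ('12', 'NUMBER')
#     _sc.fullmatch('+')                      # -> 'PLUS'
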

def _regexp_has_newline(r: str):
    r"""Expressions that may indicate newlines in a regexp:
        - newlines (\n)
        - escaped newline (\\n)
        - anything but ([^...])
        - any-char (.) when the flag (?s) exists
        - spaces (\s)
    """
    return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
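
# Illustrative sketch (not part of lexer.py): this is a conservative, purely textual check.
#
#     _regexp_has_newline(r'\s+')      # -> True  ('\s' can match '\n')
#     _regexp_has_newline(r'[0-9]+')   # -> False
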

class LexerState:
    """Represents the current state of the lexer as it scans the text
    (Lexer objects are only instantiated per grammar, not per text)
    """

    __slots__ = 'text', 'line_ctr', 'last_token'

    text: TextSlice
    line_ctr: LineCounter
    last_token: Optional[Token]

    def __init__(self, text: TextSlice, line_ctr: Optional[LineCounter] = None, last_token: Optional[Token]=None):
        if isinstance(text, TextSlice):
            if line_ctr is None:
                line_ctr = LineCounter(b'\n' if isinstance(text.text, bytes) else '\n')

            if text.start > 0:
                # Advance the line-count until line_ctr.char_pos == text.start
                line_ctr.feed(TextSlice(text.text, 0, text.start))

            if not (text.start <= line_ctr.char_pos <= text.end):
                raise ValueError("LineCounter.char_pos is out of bounds")

        self.text = text
        self.line_ctr = line_ctr
        self.last_token = last_token


    def __eq__(self, other):
        if not isinstance(other, LexerState):
            return NotImplemented

        return self.text == other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token

    def __copy__(self):
        return type(self)(self.text, copy(self.line_ctr), self.last_token)


class LexerThread:
    """A thread that ties a lexer instance and a lexer state, to be used by the parser
    """

    def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState]):
        self.lexer = lexer
        self.state = lexer_state

    @classmethod
    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice) -> 'LexerThread':
        text = TextSlice.cast_from(text_or_slice)
        return cls(lexer, LexerState(text))

    @classmethod
    def from_custom_input(cls, lexer: 'Lexer', text: Any) -> 'LexerThread':
        return cls(lexer, LexerState(text))

    def lex(self, parser_state):
        if self.state is None:
            raise TypeError("Cannot lex: No text assigned to lexer state")
        return self.lexer.lex(self.state, parser_state)

    def __copy__(self):
        return type(self)(self.lexer, copy(self.state))

    _Token = Token
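
# Illustrative sketch (assumes a lexer instance built elsewhere from a LexerConf; not part of
# lexer.py): the parser owns a LexerThread, which pairs a per-grammar Lexer with the
# per-text LexerState.
#
#     # thread = LexerThread.from_text(some_basic_lexer, 'text to lex')
#     # tokens = list(thread.lex(parser_state=None))  # BasicLexer only uses parser_state for error reporting
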

_Callback = Callable[[Token], Token]

class Lexer(ABC):
    """Lexer interface

    Method Signatures:
        lex(self, lexer_state, parser_state) -> Iterator[Token]
    """
    @abstractmethod
    def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
        return NotImplemented

    def make_lexer_state(self, text: str):
        "Deprecated"
        return LexerState(TextSlice.cast_from(text))


def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8):
    if not comparator:
        comparator = interegular.Comparator.from_regexes(terminal_to_regexp)

    # When in strict mode, we only ever try to provide one example, so taking
    # a long time for that should be fine
    max_time = 2 if strict_mode else 0.2

    # We don't want to show too many collisions.
    if comparator.count_marked_pairs() >= max_collisions_to_show:
        return
    for group in classify(terminal_to_regexp, lambda t: t.priority).values():
        for a, b in comparator.check(group, skip_marked=True):
            assert a.priority == b.priority
            # Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision
            comparator.mark(a, b)

            # Notify the user
            message = f"Collision between Terminals {a.name} and {b.name}. "
            try:
                example = comparator.get_example_overlap(a, b, max_time).format_multiline()
            except ValueError:
                # Couldn't find an example within max_time steps.
                example = "No example could be found fast enough. However, the collision does still exist"
            if strict_mode:
                raise LexError(f"{message}\n{example}")
            logger.warning("%s The lexer will choose between them arbitrarily.\n%s", message, example)
            if comparator.count_marked_pairs() >= max_collisions_to_show:
                logger.warning("Found 8 regex collisions, will not check for more.")
                return


class AbstractBasicLexer(Lexer):
    terminals_by_name: Dict[str, TerminalDef]

    @abstractmethod
    def __init__(self, conf: 'LexerConf', comparator=None) -> None:
        ...

    @abstractmethod
    def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        ...

    def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]:
        with suppress(EOFError):
            while True:
                yield self.next_token(state, parser_state)


class BasicLexer(AbstractBasicLexer):
    terminals: Collection[TerminalDef]
    ignore_types: FrozenSet[str]
    newline_types: FrozenSet[str]
    user_callbacks: Dict[str, _Callback]
    callback: Dict[str, _Callback]
    re: ModuleType

    def __init__(self, conf: 'LexerConf', comparator=None) -> None:
        terminals = list(conf.terminals)
        assert all(isinstance(t, TerminalDef) for t in terminals), terminals

        self.re = conf.re_module

        if not conf.skip_validation:
            # Sanitization
            terminal_to_regexp = {}
            for t in terminals:
                regexp = t.pattern.to_regexp()
                try:
                    self.re.compile(regexp, conf.g_regex_flags)
                except self.re.error:
                    raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

                if t.pattern.min_width == 0:
                    raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
                if t.pattern.type == "re":
                    terminal_to_regexp[t] = regexp

            if not (set(conf.ignore) <= {t.name for t in terminals}):
                raise LexError("Ignore terminals are not defined: %s" % (set(conf.ignore) - {t.name for t in terminals}))

            if has_interegular:
                _check_regex_collisions(terminal_to_regexp, comparator, conf.strict)
            elif conf.strict:
                raise LexError("interegular must be installed for strict mode. Use `pip install 'lark[interegular]'`.")

        # Init
        self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
        self.ignore_types = frozenset(conf.ignore)

        terminals.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
        self.terminals = terminals
        self.user_callbacks = conf.callbacks
        self.g_regex_flags = conf.g_regex_flags
        self.use_bytes = conf.use_bytes
        self.terminals_by_name = conf.terminals_by_name

        self._scanner: Optional[Scanner] = None

    def _build_scanner(self) -> Scanner:
        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
        assert all(self.callback.values())

        for type_, f in self.user_callbacks.items():
            if type_ in self.callback:
                # Already a callback there, probably UnlessCallback
                self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
            else:
                self.callback[type_] = f

        return Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)

    @property
    def scanner(self) -> Scanner:
        if self._scanner is None:
            self._scanner = self._build_scanner()
        return self._scanner

    def match(self, text, pos):
        return self.scanner.match(text, pos)

    def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        line_ctr = lex_state.line_ctr
        while line_ctr.char_pos < lex_state.text.end:
            res = self.match(lex_state.text, line_ctr.char_pos)
            if not res:
                allowed = self.scanner.allowed_types - self.ignore_types
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
                                           state=parser_state, terminals_by_name=self.terminals_by_name)

            value, type_ = res

            ignored = type_ in self.ignore_types
            t = None
            if not ignored or type_ in self.callback:
                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
            line_ctr.feed(value, type_ in self.newline_types)
            if t is not None:
                t.end_line = line_ctr.line
                t.end_column = line_ctr.column
                t.end_pos = line_ctr.char_pos
                if t.type in self.callback:
                    t = self.callback[t.type](t)
                if not ignored:
                    if not isinstance(t, Token):
                        raise LexError("Callbacks must return a token (returned %r)" % t)
                    lex_state.last_token = t
                    return t

        # EOF
        raise EOFError(self)
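
# Illustrative sketch (assumed terminals; not part of lexer.py): BasicLexer sorts terminals so
# that higher priority, then wider patterns, then longer literals come first - this is the
# order in which the Scanner's alternation tries them.
#
#     _defs = [TerminalDef('NAME', PatternRE(r'\w+')), TerminalDef('IF', PatternStr('if'), priority=1)]
#     _defs.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
#     [d.name for d in _defs]   # -> ['IF', 'NAME']  (IF wins on priority)
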

class ContextualLexer(Lexer):
    lexers: Dict[int, AbstractBasicLexer]
    root_lexer: AbstractBasicLexer

    BasicLexer: Type[AbstractBasicLexer] = BasicLexer

    def __init__(self, conf: 'LexerConf', states: Dict[int, Collection[str]], always_accept: Collection[str]=()) -> None:
        terminals = list(conf.terminals)
        terminals_by_name = conf.terminals_by_name

        trad_conf = copy(conf)
        trad_conf.terminals = terminals

        if has_interegular and not conf.skip_validation:
            comparator = interegular.Comparator.from_regexes({t: t.pattern.to_regexp() for t in terminals})
        else:
            comparator = None
        lexer_by_tokens: Dict[FrozenSet[str], AbstractBasicLexer] = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(conf.ignore) | set(always_accept)
                lexer_conf = copy(trad_conf)
                lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
                lexer = self.BasicLexer(lexer_conf, comparator)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        assert trad_conf.terminals is terminals
        trad_conf.skip_validation = True # We don't need to verify all terminals again
        self.root_lexer = self.BasicLexer(trad_conf, comparator)

    def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[Token]:
        try:
            while True:
                lexer = self.lexers[parser_state.position]
                yield lexer.next_token(lexer_state, parser_state)
        except EOFError:
            pass
        except UnexpectedCharacters as e:
            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
            # This tests the input against the global context, to provide a nicer error.
            try:
                last_token = lexer_state.last_token # Save last_token. Calling root_lexer.next_token will change this to the wrong token
                token = self.root_lexer.next_token(lexer_state, parser_state)
                raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name)
            except UnexpectedCharacters:
                raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.

###}