Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pycparser/c_lexer.py: 74%



# ------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
import re
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Dict, List, Optional, Tuple


@dataclass(slots=True)
class _Token:
    type: str
    value: str
    lineno: int
    column: int


class CLexer:
    """A standalone lexer for C.

    Parameters for construction:
        error_func:
            Called with (msg, line, column) on lexing errors.
        on_lbrace_func:
            Called when an LBRACE token is produced (used for scope tracking).
        on_rbrace_func:
            Called when an RBRACE token is produced (used for scope tracking).
        type_lookup_func:
            Called with an identifier name; expected to return True if it is
            a typedef name and should be tokenized as TYPEID.

    Call input(text) to initialize lexing, and then keep calling token() to
    get the next token, until it returns None (at end of input).
    """
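    # A minimal usage sketch (illustrative only, not part of the original
    # module; the no-op lambda callbacks below are placeholder assumptions):
    #
    #   lexer = CLexer(
    #       error_func=lambda msg, line, col: print(f"{line}:{col}: {msg}"),
    #       on_lbrace_func=lambda: None,
    #       on_rbrace_func=lambda: None,
    #       type_lookup_func=lambda name: False,
    #   )
    #   lexer.input("int x = 42;")
    #   while (tok := lexer.token()) is not None:
    #       print(tok.type, tok.value)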

    def __init__(
        self,
        error_func: Callable[[str, int, int], None],
        on_lbrace_func: Callable[[], None],
        on_rbrace_func: Callable[[], None],
        type_lookup_func: Callable[[str], bool],
    ) -> None:
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self._init_state()

    def input(self, text: str, filename: str = "") -> None:
        """Initialize the lexer to the given input text.

        filename is an optional name identifying the file from which the input
        comes. The lexer can modify it if #line directives are encountered.
        """
        self._init_state()
        self._lexdata = text
        self._filename = filename

    def _init_state(self) -> None:
        self._lexdata = ""
        self._filename = ""
        self._pos = 0
        self._line_start = 0
        self._pending_tok: Optional[_Token] = None
        self._lineno = 1

    @property
    def filename(self) -> str:
        return self._filename

    def token(self) -> Optional[_Token]:
        # Lexing strategy overview:
        #
        # - We maintain a current position (self._pos), line number, and the
        #   byte offset of the current line start. The lexer is a simple loop
        #   that skips whitespace/newlines and emits one token per call.
        # - A small amount of logic is handled manually before regex matching:
        #
        #   * Preprocessor-style directives: if we see '#', we check whether
        #     it's a #line or #pragma directive and consume it inline. #line
        #     updates lineno/filename and produces no tokens. #pragma can yield
        #     both PPPRAGMA and PPPRAGMASTR, but token() returns a single token,
        #     so we stash the PPPRAGMASTR as _pending_tok to return on the next
        #     token() call. Otherwise we return PPHASH.
        #   * Newlines update lineno/line-start tracking so tokens can record
        #     accurate columns.
        #
        # - The bulk of tokens are recognized in _match_token:
        #
        #   * _regex_rules: regex patterns for identifiers, literals, and other
        #     complex tokens (including error-producing patterns). The lexer
        #     uses a combined _regex_master so that all patterns are tried in
        #     a single scan.
        #   * _fixed_tokens: exact string matches for operators and punctuation,
        #     resolved by longest match.
        #
        # - Error patterns call the error callback and advance minimally, which
        #   keeps lexing resilient while reporting useful diagnostics.
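        # Illustrative example: lexing "#pragma once\nint x;" yields PPPRAGMA
        # from one token() call, the stashed PPPRAGMASTR("once") from the next,
        # and then INT, ID("x"), SEMI from the second line (assuming
        # type_lookup_func does not report "x" as a typedef name).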

        text = self._lexdata
        n = len(text)

        if self._pending_tok is not None:
            tok = self._pending_tok
            self._pending_tok = None
            return tok

        while self._pos < n:
            match text[self._pos]:
                case " " | "\t":
                    self._pos += 1
                case "\n":
                    self._lineno += 1
                    self._pos += 1
                    self._line_start = self._pos
                case "#":
                    if _line_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        self._handle_ppline()
                        continue
                    if _pragma_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        toks = self._handle_pppragma()
                        if len(toks) > 1:
                            self._pending_tok = toks[1]
                        if len(toks) > 0:
                            return toks[0]
                        continue
                    tok = self._make_token("PPHASH", "#", self._pos)
                    self._pos += 1
                    return tok
                case _:
                    if tok := self._match_token():
                        return tok
                    else:
                        continue

    def _match_token(self) -> Optional[_Token]:
        """Match one token at the current position.

        Returns a Token on success, or None if no token could be matched and
        an error was reported. This method always advances _pos by the matched
        length, or by 1 on error/no-match.
        """
        text = self._lexdata
        pos = self._pos
        # We pick the longest match between:
        # - the master regex (identifiers, literals, error patterns, etc.)
        # - fixed operator/punctuator literals from the bucket for text[pos]
        #
        # The longest match is required to ensure we properly lex something
        # like ".123" (a floating-point constant) as a single entity (with
        # FLOAT_CONST), rather than a PERIOD followed by a number.
        #
        # The fixed-literal buckets are already length-sorted, so within that
        # bucket we can take the first match. However, we still compare its
        # length to the regex match because the regex may have matched a longer
        # token that should take precedence.
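        # Illustrative example: at ">>=" the '>' bucket yields RSHIFTEQUAL
        # (length 3) and the master regex has no alternative matching '>', so
        # the fixed literal wins; at ".123" the master regex yields FLOAT_CONST
        # (length 4), which beats PERIOD (length 1) from the '.' bucket.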

        best = None

        if m := _regex_master.match(text, pos):
            tok_type = m.lastgroup
            # All master-regex alternatives are named; lastgroup shouldn't be None.
            assert tok_type is not None
            value = m.group(tok_type)
            length = len(value)
            action, msg = _regex_actions[tok_type]
            best = (length, tok_type, value, action, msg)

        if bucket := _fixed_tokens_by_first.get(text[pos]):
            for entry in bucket:
                if text.startswith(entry.literal, pos):
                    length = len(entry.literal)
                    if best is None or length > best[0]:
                        best = (
                            length,
                            entry.tok_type,
                            entry.literal,
                            _RegexAction.TOKEN,
                            None,
                        )
                    break

        if best is None:
            msg = f"Illegal character {repr(text[pos])}"
            self._error(msg, pos)
            self._pos += 1
            return None

        length, tok_type, value, action, msg = best
        if action == _RegexAction.ERROR:
            if tok_type == "BAD_CHAR_CONST":
                msg = f"Invalid char constant {value}"
            # All other ERROR rules provide a message.
            assert msg is not None
            self._error(msg, pos)
            self._pos += max(1, length)
            return None

        if action == _RegexAction.ID:
            tok_type = _keyword_map.get(value, "ID")
            if tok_type == "ID" and self.type_lookup_func(value):
                tok_type = "TYPEID"

        tok = self._make_token(tok_type, value, pos)
        self._pos += length

        if tok.type == "LBRACE":
            self.on_lbrace_func()
        elif tok.type == "RBRACE":
            self.on_rbrace_func()

        return tok

    def _make_token(self, tok_type: str, value: str, pos: int) -> _Token:
        """Create a Token at an absolute input position.

        Expects tok_type/value and the absolute byte offset pos in the current
        input. Does not advance lexer state; callers manage _pos themselves.
        Returns a Token with lineno/column computed from current line tracking.
        """
        column = pos - self._line_start + 1
        tok = _Token(tok_type, value, self._lineno, column)
        return tok

    def _error(self, msg: str, pos: int) -> None:
        column = pos - self._line_start + 1
        self.error_func(msg, self._lineno, column)

    def _handle_ppline(self) -> None:
        # Since #line directives aren't supposed to return tokens but should
        # only affect the lexer's state (update line/filename for coords), this
        # method does a bit of parsing on its own. It doesn't return anything,
        # but its side effect is to update self._pos past the directive, and
        # potentially update self._lineno and self._filename, based on the
        # directive's contents.
        #
        # Accepted #line forms from preprocessors:
        #   - "#line 66 \"kwas\\df.h\""
        #   - "# 9"
        #   - "#line 10 \"include/me.h\" 1 2 3"  (extra numeric flags)
        #   - "# 1 \"file.h\" 3"
        # Errors we must report:
        #   - "#line \"file.h\""  (filename before line number)
        #   - "#line df"  (garbage instead of number/string)
        #
        # We scan the directive line once (after an optional 'line' keyword),
        # validating the order: NUMBER, optional STRING, then any NUMBERs.
        # The NUMBERs tail is only accepted if a filename STRING was present.
        text = self._lexdata
        n = len(text)
        line_end = text.find("\n", self._pos)
        if line_end == -1:
            line_end = n
        line = text[self._pos : line_end]
        pos = 0
        line_len = len(line)

        def skip_ws() -> None:
            nonlocal pos
            while pos < line_len and line[pos] in " \t":
                pos += 1

        skip_ws()
        if line.startswith("line", pos):
            pos += 4

        def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None:
            if pp_line is None:
                self._error("line number missing in #line", self._pos + line_len)
            else:
                self._lineno = int(pp_line)
                if pp_filename is not None:
                    self._filename = pp_filename
            self._pos = line_end + 1
            self._line_start = self._pos

        def fail(msg: str, offset: int) -> None:
            self._error(msg, self._pos + offset)
            self._pos = line_end + 1
            self._line_start = self._pos

        skip_ws()
        if pos >= line_len:
            success(None, None)
            return
        if line[pos] == '"':
            fail("filename before line number in #line", pos)
            return

        m = re.match(_decimal_constant, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_line = m.group(0)
        pos += len(pp_line)
        skip_ws()
        if pos >= line_len:
            success(pp_line, None)
            return

        if line[pos] != '"':
            fail("invalid #line directive", pos)
            return

        m = re.match(_string_literal, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_filename = m.group(0).lstrip('"').rstrip('"')
        pos += len(m.group(0))

        # Consume arbitrary sequence of numeric flags after the directive
        while True:
            skip_ws()
            if pos >= line_len:
                break
            m = re.match(_decimal_constant, line[pos:])
            if not m:
                fail("invalid #line directive", pos)
                return
            pos += len(m.group(0))

        success(pp_line, pp_filename)

    def _handle_pppragma(self) -> List[_Token]:
        # Parse a full #pragma line; returns a list of 1 or 2 tokens:
        # PPPRAGMA and an optional PPPRAGMASTR. An empty list means an error
        # occurred, or that we're at the end of input.
        #
        # Examples:
        #   - "#pragma" -> PPPRAGMA only
        #   - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
        #   - "# pragma omp parallel private(th_id)"
        #       -> PPPRAGMA, PPPRAGMASTR("omp parallel private(th_id)")
        #   - "#\tpragma {pack: 2, smack: 3}"
        #       -> PPPRAGMA, PPPRAGMASTR("{pack: 2, smack: 3}")
        text = self._lexdata
        n = len(text)
        pos = self._pos

        while pos < n and text[pos] in " \t":
            pos += 1
        if pos >= n:
            self._pos = pos
            return []

        if not text.startswith("pragma", pos):
            self._error("invalid #pragma directive", pos)
            self._pos = pos + 1
            return []

        pragma_pos = pos
        pos += len("pragma")
        toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]

        while pos < n and text[pos] in " \t":
            pos += 1

        start = pos
        while pos < n and text[pos] != "\n":
            pos += 1
        if pos > start:
            toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
        if pos < n and text[pos] == "\n":
            self._lineno += 1
            pos += 1
            self._line_start = pos
        self._pos = pos
        return toks

377## Reserved keywords 

378## 

379_keywords: Tuple[str, ...] = ( 

380 "AUTO", 

381 "BREAK", 

382 "CASE", 

383 "CHAR", 

384 "CONST", 

385 "CONTINUE", 

386 "DEFAULT", 

387 "DO", 

388 "DOUBLE", 

389 "ELSE", 

390 "ENUM", 

391 "EXTERN", 

392 "FLOAT", 

393 "FOR", 

394 "GOTO", 

395 "IF", 

396 "INLINE", 

397 "INT", 

398 "LONG", 

399 "REGISTER", 

400 "OFFSETOF", 

401 "RESTRICT", 

402 "RETURN", 

403 "SHORT", 

404 "SIGNED", 

405 "SIZEOF", 

406 "STATIC", 

407 "STRUCT", 

408 "SWITCH", 

409 "TYPEDEF", 

410 "UNION", 

411 "UNSIGNED", 

412 "VOID", 

413 "VOLATILE", 

414 "WHILE", 

415 "__INT128", 

416 "_BOOL", 

417 "_COMPLEX", 

418 "_NORETURN", 

419 "_THREAD_LOCAL", 

420 "_STATIC_ASSERT", 

421 "_ATOMIC", 

422 "_ALIGNOF", 

423 "_ALIGNAS", 

424 "_PRAGMA", 

425) 

426 

427_keyword_map: Dict[str, str] = {} 

428 

429for keyword in _keywords: 

430 # Keywords from new C standard are mixed-case, like _Bool, _Alignas, etc. 

431 if keyword.startswith("_") and len(keyword) > 1 and keyword[1].isalpha(): 

432 _keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword 

433 else: 

434 _keyword_map[keyword.lower()] = keyword 
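# Illustrative examples of the mapping built above: "while" -> "WHILE",
# "_Bool" -> "_BOOL", and "__int128" -> "__INT128" (the second character of
# "__INT128" is '_', not a letter, so the whole keyword is lower-cased).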

##
## Regexes for use in tokens
##

# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
_identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"

_hex_prefix = "0[xX]"
_hex_digits = "[0-9a-fA-F]+"
_bin_prefix = "0[bB]"
_bin_digits = "[01]+"

# integer constants (K&R2: A.2.5.1)
_integer_suffix_opt = (
    r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
_decimal_constant = (
    "(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
)
_octal_constant = "0[0-7]*" + _integer_suffix_opt
_hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
_bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt

_bad_octal_constant = "0[0-7]*[89]"

# comments are not supported
_unsupported_c_style_comment = r"\/\*"
_unsupported_cxx_style_comment = r"\/\/"

# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file)
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
#   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
#   decimal_escape = r"""(\d+)"""
#   hex_escape = r"""(x[0-9a-fA-F]+)"""
#   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed
# backtracking: (https://github.com/eliben/pycparser/issues/61)
#
# - \x was removed from simple_escape, unless it was not followed by a hex
#   digit, to avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next
#   character (if any) is not hex
# - decimal_escape allows one or more decimal characters, but requires that the
#   next character (if any) is not a decimal
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with the
#   permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing each
# ambiguous escape sequence in multiple ways. e.g. `\123` could be parsed as
# `\1`+`23`, `\12`+`3`, and `\123`.

_simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
_decimal_escape = r"""(\d+)(?!\d)"""
_hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
_bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

_escape_sequence = (
    r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
)

# This complicated regex with lookahead might be slow when applied to whole
# strings. Because every valid escape (including \x) allows zero or more
# non-escaped characters after its first character,
# simple_escape + decimal_escape + hex_escape can be simplified, for scanning
# inside strings, to:

_escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

_cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
_char_const = "'" + _cconst_char + "'"
_wchar_const = "L" + _char_const
_u8char_const = "u8" + _char_const
_u16char_const = "u" + _char_const
_u32char_const = "U" + _char_const
_multicharacter_constant = "'" + _cconst_char + "{2,4}'"
_unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
_bad_char_const = (
    r"""('""" + _cconst_char + """[^'\n]+')|('')|('""" + _bad_escape + r"""[^'\n]*')"""
)

# string literals (K&R2: A.2.6)
_string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
_string_literal = '"' + _string_char + '*"'
_wstring_literal = "L" + _string_literal
_u8string_literal = "u8" + _string_literal
_u16string_literal = "u" + _string_literal
_u32string_literal = "U" + _string_literal
_bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'

# floating constants (K&R2: A.2.5.3)
_exponent_part = r"""([eE][-+]?[0-9]+)"""
_fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
_floating_constant = (
    "(((("
    + _fractional_constant
    + ")"
    + _exponent_part
    + "?)|([0-9]+"
    + _exponent_part
    + "))[FfLl]?)"
)
_binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
_hex_fractional_constant = (
    "(((" + _hex_digits + r""")?\.""" + _hex_digits + ")|(" + _hex_digits + r"""\.))"""
)
_hex_floating_constant = (
    "("
    + _hex_prefix
    + "("
    + _hex_digits
    + "|"
    + _hex_fractional_constant
    + ")"
    + _binary_exponent_part
    + "[FfLl]?)"
)
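# Illustrative examples: _floating_constant matches ".123", "3.14f" and "1e10";
# _hex_floating_constant matches "0x1.8p3" (a hex mantissa with a mandatory
# binary exponent).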

class _RegexAction(Enum):
    TOKEN = 0
    ID = 1
    ERROR = 2


@dataclass(frozen=True)
class _RegexRule:
    # tok_type: name of the token emitted for a match
    # regex_pattern: the raw regex (no anchors) to match at the current position
    # action: TOKEN for normal tokens, ID for identifiers, ERROR to report
    # error_message: message used for ERROR entries
    tok_type: str
    regex_pattern: str
    action: _RegexAction
    error_message: Optional[str]


_regex_rules: List[_RegexRule] = [
    _RegexRule(
        "UNSUPPORTED_C_STYLE_COMMENT",
        _unsupported_c_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "UNSUPPORTED_CXX_STYLE_COMMENT",
        _unsupported_cxx_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "BAD_STRING_LITERAL",
        _bad_string_literal,
        _RegexAction.ERROR,
        "String contains invalid escape code",
    ),
    _RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None),
    _RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None),
    _RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None),
    _RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None),
    _RegexRule(
        "BAD_CONST_OCT",
        _bad_octal_constant,
        _RegexAction.ERROR,
        "Invalid octal constant",
    ),
    _RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None),
    _RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None),
    _RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None),
    _RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None),
    _RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None),
    _RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None),
    _RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"),
    _RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None),
    _RegexRule("ID", _identifier, _RegexAction.ID, None),
]

_regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {}
_regex_pattern_parts: List[str] = []
for _rule in _regex_rules:
    _regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message)
    _regex_pattern_parts.append(f"(?P<{_rule.tok_type}>{_rule.regex_pattern})")
# The master regex is a single alternation of all token patterns, each wrapped
# in a named group. We match once at the current position and then use
# `lastgroup` to recover which token kind fired; this avoids iterating over all
# regexes on every character while keeping the same token-level semantics.
_regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))
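# Illustrative examples of the master regex in action:
#   _regex_master.match("0x1Fu").lastgroup == "INT_CONST_HEX"
#   _regex_master.match('"hi"').lastgroup == "STRING_LITERAL"
# Rule order matters because re alternation takes the first alternative that
# matches: HEX_FLOAT_CONST precedes INT_CONST_HEX so "0x1.8p3" isn't cut at
# the '.', and BAD_CONST_OCT precedes INT_CONST_OCT so "079" is reported as an
# invalid octal constant.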

@dataclass(frozen=True)
class _FixedToken:
    tok_type: str
    literal: str


_fixed_tokens: List[_FixedToken] = [
    _FixedToken("ELLIPSIS", "..."),
    _FixedToken("LSHIFTEQUAL", "<<="),
    _FixedToken("RSHIFTEQUAL", ">>="),
    _FixedToken("PLUSPLUS", "++"),
    _FixedToken("MINUSMINUS", "--"),
    _FixedToken("ARROW", "->"),
    _FixedToken("LAND", "&&"),
    _FixedToken("LOR", "||"),
    _FixedToken("LSHIFT", "<<"),
    _FixedToken("RSHIFT", ">>"),
    _FixedToken("LE", "<="),
    _FixedToken("GE", ">="),
    _FixedToken("EQ", "=="),
    _FixedToken("NE", "!="),
    _FixedToken("TIMESEQUAL", "*="),
    _FixedToken("DIVEQUAL", "/="),
    _FixedToken("MODEQUAL", "%="),
    _FixedToken("PLUSEQUAL", "+="),
    _FixedToken("MINUSEQUAL", "-="),
    _FixedToken("ANDEQUAL", "&="),
    _FixedToken("OREQUAL", "|="),
    _FixedToken("XOREQUAL", "^="),
    _FixedToken("EQUALS", "="),
    _FixedToken("PLUS", "+"),
    _FixedToken("MINUS", "-"),
    _FixedToken("TIMES", "*"),
    _FixedToken("DIVIDE", "/"),
    _FixedToken("MOD", "%"),
    _FixedToken("OR", "|"),
    _FixedToken("AND", "&"),
    _FixedToken("NOT", "~"),
    _FixedToken("XOR", "^"),
    _FixedToken("LNOT", "!"),
    _FixedToken("LT", "<"),
    _FixedToken("GT", ">"),
    _FixedToken("CONDOP", "?"),
    _FixedToken("LPAREN", "("),
    _FixedToken("RPAREN", ")"),
    _FixedToken("LBRACKET", "["),
    _FixedToken("RBRACKET", "]"),
    _FixedToken("LBRACE", "{"),
    _FixedToken("RBRACE", "}"),
    _FixedToken("COMMA", ","),
    _FixedToken("PERIOD", "."),
    _FixedToken("SEMI", ";"),
    _FixedToken("COLON", ":"),
]

# To avoid scanning all fixed tokens on every character, we bucket them by the
# first character. When matching at position i, we only look at the bucket for
# text[i], and we pre-sort that bucket by token length so the first match is
# also the longest. This preserves longest-match semantics (e.g. '>>=' before
# '>>' before '>') while reducing the number of comparisons.
_fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}
for _entry in _fixed_tokens:
    _fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)
for _bucket in _fixed_tokens_by_first.values():
    _bucket.sort(key=lambda item: len(item.literal), reverse=True)
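# Illustrative example: the '>' bucket ends up ordered [">>=", ">>", ">=", ">"],
# so the startswith scan in _match_token finds the longest operator first
# (the sort is stable, so equal-length literals keep their original order).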

_line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line\W)|([ \t]*\d+)")
_pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma\W")