Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pycparser/c_lexer.py: 75%


# ------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
import re
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Dict, List, Optional, Tuple


@dataclass(slots=True)
class Token:
    type: str
    value: str
    lineno: int
    column: int


class CLexer:
    """A standalone lexer for C.

    The lexer takes the following callback functions as parameters during
    construction:
    error_func:
        Called with (msg, line, column) on lexing errors.
    on_lbrace_func:
        Called when an LBRACE token is produced (used for scope tracking).
    on_rbrace_func:
        Called when an RBRACE token is produced (used for scope tracking).
    type_lookup_func:
        Called with an identifier name; expected to return True if it is
        a typedef name and should be tokenized as TYPEID.

    Call input(text) to initialize lexing, and then keep calling token() to
    get the next token, until it returns None (at end of input).
    """

    def __init__(
        self,
        error_func: Callable[[str, int, int], None],
        on_lbrace_func: Callable[[], None],
        on_rbrace_func: Callable[[], None],
        type_lookup_func: Callable[[str], bool],
    ) -> None:
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self._init_state()

    def input(self, text: str, filename: str = "") -> None:
        """Initialize the lexer to the given input text.

        filename is an optional name identifying the initial file from which
        the input comes. The lexer may modify it if #line directives are
        encountered.
        """
        self._init_state()
        self._lexdata = text
        self._filename = filename

    def _init_state(self) -> None:
        self._lexdata = ""
        self._filename = ""
        self._pos = 0
        self._line_start = 0
        self._pending_tok: Optional[Token] = None
        self._lineno = 1

    @property
    def filename(self) -> str:
        return self._filename

    def token(self) -> Optional[Token]:
        # Lexing strategy overview:
        #
        # - We maintain a current position (self._pos), line number, and the
        #   byte offset of the current line start. The lexer is a simple loop
        #   that skips whitespace/newlines and emits one token per call.
        #
        # - A small amount of logic is handled manually before regex matching:
        #
        #   * Preprocessor-style directives: if we see '#', we check whether
        #     it's a #line or #pragma directive and consume it inline. #line
        #     updates lineno/filename and produces no tokens. #pragma can
        #     yield both PPPRAGMA and PPPRAGMASTR, but token() returns a
        #     single token, so we stash the PPPRAGMASTR as _pending_tok to
        #     return on the next token() call. Otherwise we return PPHASH.
        #   * Newlines update lineno/line-start tracking so tokens can record
        #     accurate columns.
        #
        # - The bulk of tokens are recognized in _match_token:
        #
        #   * _regex_rules: regex patterns for identifiers, literals, and
        #     other complex tokens (including error-producing patterns). The
        #     lexer uses a combined _regex_master to scan all options at the
        #     same time.
        #   * _fixed_tokens: exact string matches for operators and
        #     punctuation, resolved by longest match.
        #
        # - Error patterns call the error callback and advance minimally,
        #   which keeps lexing resilient while reporting useful diagnostics.
        text = self._lexdata
        n = len(text)

        if self._pending_tok is not None:
            tok = self._pending_tok
            self._pending_tok = None
            return tok

        while self._pos < n:
            match text[self._pos]:
                case " " | "\t":
                    self._pos += 1
                case "\n":
                    self._lineno += 1
                    self._pos += 1
                    self._line_start = self._pos
                case "#":
                    if _line_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        self._handle_ppline()
                        continue
                    if _pragma_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        toks = self._handle_pppragma()
                        if len(toks) > 1:
                            self._pending_tok = toks[1]
                        if len(toks) > 0:
                            return toks[0]
                        continue
                    tok = self._make_token("PPHASH", "#", self._pos)
                    self._pos += 1
                    return tok
                case _:
                    if tok := self._match_token():
                        return tok
                    else:
                        continue

    def _match_token(self) -> Optional[Token]:
        """Match one token at the current position.

        Returns a Token on success, or None if no token could be matched and
        an error was reported. This method always advances _pos by the matched
        length, or by 1 on error/no-match.
        """
        text = self._lexdata
        pos = self._pos
        # We pick the longest match between:
        # - the master regex (identifiers, literals, error patterns, etc.)
        # - fixed operator/punctuator literals from the bucket for text[pos]
        #
        # The longest match is required to ensure we properly lex something
        # like ".123" (a floating-point constant) as a single entity (with
        # FLOAT_CONST), rather than a PERIOD followed by a number.
        #
        # The fixed-literal buckets are already length-sorted, so within that
        # bucket we can take the first match. However, we still compare its
        # length to the regex match because the regex may have matched a
        # longer token that should take precedence.
        best = None

        if m := _regex_master.match(text, pos):
            tok_type = m.lastgroup
            # All master-regex alternatives are named; lastgroup shouldn't be None.
            assert tok_type is not None
            value = m.group(tok_type)
            length = len(value)
            action, msg = _regex_actions[tok_type]
            best = (length, tok_type, value, action, msg)

        if bucket := _fixed_tokens_by_first.get(text[pos]):
            for entry in bucket:
                if text.startswith(entry.literal, pos):
                    length = len(entry.literal)
                    if best is None or length > best[0]:
                        best = (
                            length,
                            entry.tok_type,
                            entry.literal,
                            _RegexAction.TOKEN,
                            None,
                        )
                    break

        if best is None:
            self._error(f"Illegal character {repr(text[pos])}", pos)
            self._pos += 1
            return None

        length, tok_type, value, action, msg = best
        match action:
            case _RegexAction.TOKEN:
                pass
            case _RegexAction.ERROR:
                if tok_type == "BAD_CHAR_CONST":
                    msg = f"Invalid char constant {value}"
                # All other ERROR rules provide a message.
                assert msg is not None
                self._error(msg, pos)
                self._pos += max(1, length)
                return None
            case _RegexAction.ID:
                tok_type = _keyword_map.get(value, "ID")
                if tok_type == "ID" and self.type_lookup_func(value):
                    tok_type = "TYPEID"
            case _:
                raise RuntimeError("unreachable")

        tok = self._make_token(tok_type, value, pos)
        self._pos += length

        if tok.type == "LBRACE":
            self.on_lbrace_func()
        elif tok.type == "RBRACE":
            self.on_rbrace_func()

        return tok

    def _make_token(self, tok_type: str, value: str, pos: int) -> Token:
        """Create a Token at an absolute input position.

        Expects tok_type/value and the absolute byte offset pos in the current
        input. Does not advance lexer state; callers manage _pos themselves.
        Returns a Token with lineno/column computed from current line tracking.
        """
        column = pos - self._line_start + 1
        tok = Token(tok_type, value, self._lineno, column)
        return tok

    def _error(self, msg: str, pos: int) -> None:
        column = pos - self._line_start + 1
        self.error_func(msg, self._lineno, column)

    def _handle_ppline(self) -> None:
        # Since #line directives aren't supposed to return tokens but should
        # only affect the lexer's state (update line/filename for coords),
        # this method does a bit of parsing on its own. It doesn't return
        # anything, but its side effect is to update self._pos past the
        # directive, and potentially update self._lineno and self._filename,
        # based on the directive's contents.
        #
        # Accepted #line forms from preprocessors:
        #   - "#line 66 \"kwas\\df.h\""
        #   - "# 9"
        #   - "#line 10 \"include/me.h\" 1 2 3" (extra numeric flags)
        #   - "# 1 \"file.h\" 3"
        # Errors we must report:
        #   - "#line \"file.h\"" (filename before line number)
        #   - "#line df" (garbage instead of number/string)
        #
        # We scan the directive line once (after an optional 'line' keyword),
        # validating the order: NUMBER, optional STRING, then any NUMBERs.
        # The NUMBERs tail is only accepted if a filename STRING was present.
        text = self._lexdata
        n = len(text)
        line_end = text.find("\n", self._pos)
        if line_end == -1:
            line_end = n
        line = text[self._pos : line_end]
        pos = 0
        line_len = len(line)

        def skip_ws() -> None:
            nonlocal pos
            while pos < line_len and line[pos] in " \t":
                pos += 1

        skip_ws()
        if line.startswith("line", pos):
            pos += 4

        def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None:
            if pp_line is None:
                self._error("line number missing in #line", self._pos + line_len)
            else:
                self._lineno = int(pp_line)
                if pp_filename is not None:
                    self._filename = pp_filename
            self._pos = line_end + 1
            self._line_start = self._pos

        def fail(msg: str, offset: int) -> None:
            self._error(msg, self._pos + offset)
            self._pos = line_end + 1
            self._line_start = self._pos

        skip_ws()
        if pos >= line_len:
            success(None, None)
            return
        if line[pos] == '"':
            fail("filename before line number in #line", pos)
            return

        m = re.match(_decimal_constant, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_line = m.group(0)
        pos += len(pp_line)
        skip_ws()
        if pos >= line_len:
            success(pp_line, None)
            return

        if line[pos] != '"':
            fail("invalid #line directive", pos)
            return

        m = re.match(_string_literal, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_filename = m.group(0).lstrip('"').rstrip('"')
        pos += len(m.group(0))

        # Consume arbitrary sequence of numeric flags after the directive
        while True:
            skip_ws()
            if pos >= line_len:
                break
            m = re.match(_decimal_constant, line[pos:])
            if not m:
                fail("invalid #line directive", pos)
                return
            pos += len(m.group(0))

        success(pp_line, pp_filename)

    def _handle_pppragma(self) -> List[Token]:
        # Parse a full #pragma line; returns a list of 1 or 2 tokens -
        # PPPRAGMA and an optional PPPRAGMASTR. If an empty list is returned,
        # it means an error occurred, or we're at the end of input.
        #
        # Examples:
        #   - "#pragma" -> PPPRAGMA only
        #   - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
        #   - "# pragma omp parallel private(th_id)" ->
        #         PPPRAGMA, PPPRAGMASTR("omp parallel private(th_id)")
        #   - "#\tpragma {pack: 2, smack: 3}" ->
        #         PPPRAGMA, PPPRAGMASTR("{pack: 2, smack: 3}")
        text = self._lexdata
        n = len(text)
        pos = self._pos

        while pos < n and text[pos] in " \t":
            pos += 1
        if pos >= n:
            self._pos = pos
            return []

        if not text.startswith("pragma", pos):
            self._error("invalid #pragma directive", pos)
            self._pos = pos + 1
            return []

        pragma_pos = pos
        pos += len("pragma")
        toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]

        while pos < n and text[pos] in " \t":
            pos += 1

        start = pos
        while pos < n and text[pos] != "\n":
            pos += 1
        if pos > start:
            toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
        if pos < n and text[pos] == "\n":
            self._lineno += 1
            pos += 1
            self._line_start = pos
        self._pos = pos
        return toks

##
## Reserved keywords
##
_keywords: Tuple[str, ...] = (
    "AUTO",
    "BREAK",
    "CASE",
    "CHAR",
    "CONST",
    "CONTINUE",
    "DEFAULT",
    "DO",
    "DOUBLE",
    "ELSE",
    "ENUM",
    "EXTERN",
    "FLOAT",
    "FOR",
    "GOTO",
    "IF",
    "INLINE",
    "INT",
    "LONG",
    "REGISTER",
    "OFFSETOF",
    "RESTRICT",
    "RETURN",
    "SHORT",
    "SIGNED",
    "SIZEOF",
    "STATIC",
    "STRUCT",
    "SWITCH",
    "TYPEDEF",
    "UNION",
    "UNSIGNED",
    "VOID",
    "VOLATILE",
    "WHILE",
    "__INT128",
    "_BOOL",
    "_COMPLEX",
    "_NORETURN",
    "_THREAD_LOCAL",
    "_STATIC_ASSERT",
    "_ATOMIC",
    "_ALIGNOF",
    "_ALIGNAS",
    "_PRAGMA",
)

_keyword_map: Dict[str, str] = {}

for keyword in _keywords:
    # Keywords from the newer C standards are mixed-case, like _Bool, _Alignas, etc.
    if keyword.startswith("_") and len(keyword) > 1 and keyword[1].isalpha():
        _keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
    else:
        _keyword_map[keyword.lower()] = keyword
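
# Illustrative self-checks (added here as an example; not part of the original
# module): the mixed-case rule above maps "_BOOL" to the source spelling
# "_Bool", while "__INT128" (second character not alphabetic) and ordinary
# keywords are simply lowercased.
assert _keyword_map["_Bool"] == "_BOOL"
assert _keyword_map["__int128"] == "__INT128"
assert _keyword_map["while"] == "WHILE"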

##
## Regexes for use in tokens
##

# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
_identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"

_hex_prefix = "0[xX]"
_hex_digits = "[0-9a-fA-F]+"
_bin_prefix = "0[bB]"
_bin_digits = "[01]+"

# integer constants (K&R2: A.2.5.1)
_integer_suffix_opt = (
    r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
_decimal_constant = (
    "(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
)
_octal_constant = "0[0-7]*" + _integer_suffix_opt
_hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
_bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt

_bad_octal_constant = "0[0-7]*[89]"

# comments are not supported
_unsupported_c_style_comment = r"\/\*"
_unsupported_cxx_style_comment = r"\/\/"

# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file)
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
#   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
#   decimal_escape = r"""(\d+)"""
#   hex_escape = r"""(x[0-9a-fA-F]+)"""
#   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed
# backtracking (https://github.com/eliben/pycparser/issues/61):
#
# - \x was removed from simple_escape, unless it was not followed by a hex
#   digit, to avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next
#   character (if any) is not hex.
# - decimal_escape allows one or more decimal characters, but requires that
#   the next character (if any) is not a decimal.
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with
#   the permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing
# each ambiguous escape sequence in multiple ways. e.g. `\123` could be
# parsed as `\1`+`23`, `\12`+`3`, and `\123`.

_simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
_decimal_escape = r"""(\d+)(?!\d)"""
_hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
_bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

_escape_sequence = (
    r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
)

# The complicated regex with lookahead above might be slow for strings;
# because all of the valid escapes (including \x) allow 0 or more non-escaped
# characters after the first character, simple_escape + decimal_escape +
# hex_escape is simplified for strings to:
_escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

_cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
_char_const = "'" + _cconst_char + "'"
_wchar_const = "L" + _char_const
_u8char_const = "u8" + _char_const
_u16char_const = "u" + _char_const
_u32char_const = "U" + _char_const
_multicharacter_constant = "'" + _cconst_char + "{2,4}'"
_unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
_bad_char_const = (
    r"""('""" + _cconst_char + """[^'\n]+')|('')|('""" + _bad_escape + r"""[^'\n]*')"""
)

# string literals (K&R2: A.2.6)
_string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
_string_literal = '"' + _string_char + '*"'
_wstring_literal = "L" + _string_literal
_u8string_literal = "u8" + _string_literal
_u16string_literal = "u" + _string_literal
_u32string_literal = "U" + _string_literal
_bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'
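
# Illustrative self-checks (an added example, not part of the original
# module): the character-constant pattern accepts a plain or escaped
# character, multicharacter constants take 2-4 characters, and string
# literals accept the simplified in-string escapes.
assert re.fullmatch(_char_const, "'a'")
assert re.fullmatch(_char_const, r"'\n'")
assert re.fullmatch(_multicharacter_constant, "'ab'")
assert re.fullmatch(_string_literal, r'"hi\tthere"')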

# floating constants (K&R2: A.2.5.3)
_exponent_part = r"""([eE][-+]?[0-9]+)"""
_fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
_floating_constant = (
    "(((("
    + _fractional_constant
    + ")"
    + _exponent_part
    + "?)|([0-9]+"
    + _exponent_part
    + "))[FfLl]?)"
)
_binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
_hex_fractional_constant = (
    "(((" + _hex_digits + r""")?\.""" + _hex_digits + ")|(" + _hex_digits + r"""\.))"""
)
_hex_floating_constant = (
    "("
    + _hex_prefix
    + "("
    + _hex_digits
    + "|"
    + _hex_fractional_constant
    + ")"
    + _binary_exponent_part
    + "[FfLl]?)"
)
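
# Illustrative self-checks (an added example, not part of the original
# module): decimal floats need a fraction or an exponent, while hex floats
# always need a binary exponent introduced by 'p' or 'P'.
assert re.fullmatch(_floating_constant, "1.5e-3f")
assert re.fullmatch(_floating_constant, ".123")
assert re.fullmatch(_hex_floating_constant, "0x1.8p3")
assert not re.fullmatch(_hex_floating_constant, "0x1.8")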

class _RegexAction(Enum):
    TOKEN = 0
    ID = 1
    ERROR = 2


@dataclass(frozen=True)
class _RegexRule:
    # tok_type: name of the token emitted for a match
    # regex_pattern: the raw regex (no anchors) to match at the current position
    # action: TOKEN for normal tokens, ID for identifiers, ERROR to report an error
    # error_message: message used for ERROR entries
    tok_type: str
    regex_pattern: str
    action: _RegexAction
    error_message: Optional[str]


_regex_rules: List[_RegexRule] = [
    _RegexRule(
        "UNSUPPORTED_C_STYLE_COMMENT",
        _unsupported_c_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "UNSUPPORTED_CXX_STYLE_COMMENT",
        _unsupported_cxx_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "BAD_STRING_LITERAL",
        _bad_string_literal,
        _RegexAction.ERROR,
        "String contains invalid escape code",
    ),
    _RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None),
    _RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None),
    _RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None),
    _RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None),
    _RegexRule(
        "BAD_CONST_OCT",
        _bad_octal_constant,
        _RegexAction.ERROR,
        "Invalid octal constant",
    ),
    _RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None),
    _RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None),
    _RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None),
    _RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None),
    _RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None),
    _RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None),
    _RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"),
    _RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None),
    _RegexRule("ID", _identifier, _RegexAction.ID, None),
]

_regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {}
_regex_pattern_parts: List[str] = []
for _rule in _regex_rules:
    _regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message)
    _regex_pattern_parts.append(f"(?P<{_rule.tok_type}>{_rule.regex_pattern})")
# The master regex is a single alternation of all token patterns, each wrapped
# in a named group. We match once at the current position and then use
# `lastgroup` to recover which token kind fired; this avoids iterating over all
# regexes on every character while keeping the same token-level semantics.
_regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))
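
# Illustrative self-checks (an added example, not part of the original
# module): matching at a position and reading `lastgroup` recovers the rule
# that fired; keyword/typedef resolution happens later, in _match_token.
assert _regex_master.match("0x1Fu").lastgroup == "INT_CONST_HEX"
assert _regex_master.match("while (x)").lastgroup == "ID"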

@dataclass(frozen=True)
class _FixedToken:
    tok_type: str
    literal: str


_fixed_tokens: List[_FixedToken] = [
    _FixedToken("ELLIPSIS", "..."),
    _FixedToken("LSHIFTEQUAL", "<<="),
    _FixedToken("RSHIFTEQUAL", ">>="),
    _FixedToken("PLUSPLUS", "++"),
    _FixedToken("MINUSMINUS", "--"),
    _FixedToken("ARROW", "->"),
    _FixedToken("LAND", "&&"),
    _FixedToken("LOR", "||"),
    _FixedToken("LSHIFT", "<<"),
    _FixedToken("RSHIFT", ">>"),
    _FixedToken("LE", "<="),
    _FixedToken("GE", ">="),
    _FixedToken("EQ", "=="),
    _FixedToken("NE", "!="),
    _FixedToken("TIMESEQUAL", "*="),
    _FixedToken("DIVEQUAL", "/="),
    _FixedToken("MODEQUAL", "%="),
    _FixedToken("PLUSEQUAL", "+="),
    _FixedToken("MINUSEQUAL", "-="),
    _FixedToken("ANDEQUAL", "&="),
    _FixedToken("OREQUAL", "|="),
    _FixedToken("XOREQUAL", "^="),
    _FixedToken("EQUALS", "="),
    _FixedToken("PLUS", "+"),
    _FixedToken("MINUS", "-"),
    _FixedToken("TIMES", "*"),
    _FixedToken("DIVIDE", "/"),
    _FixedToken("MOD", "%"),
    _FixedToken("OR", "|"),
    _FixedToken("AND", "&"),
    _FixedToken("NOT", "~"),
    _FixedToken("XOR", "^"),
    _FixedToken("LNOT", "!"),
    _FixedToken("LT", "<"),
    _FixedToken("GT", ">"),
    _FixedToken("CONDOP", "?"),
    _FixedToken("LPAREN", "("),
    _FixedToken("RPAREN", ")"),
    _FixedToken("LBRACKET", "["),
    _FixedToken("RBRACKET", "]"),
    _FixedToken("LBRACE", "{"),
    _FixedToken("RBRACE", "}"),
    _FixedToken("COMMA", ","),
    _FixedToken("PERIOD", "."),
    _FixedToken("SEMI", ";"),
    _FixedToken("COLON", ":"),
]

# To avoid scanning all fixed tokens on every character, we bucket them by the
# first character. When matching at position i, we only look at the bucket for
# text[i], and we pre-sort that bucket by token length so the first match is
# also the longest. This preserves longest-match semantics (e.g. '>>=' before
# '>>' before '>') while reducing the number of comparisons.
_fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}
for _entry in _fixed_tokens:
    _fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)
for _bucket in _fixed_tokens_by_first.values():
    _bucket.sort(key=lambda item: len(item.literal), reverse=True)
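
# Illustrative self-check (an added example, not part of the original
# module): within the '>' bucket, longer literals come first, so '>>=' is
# tried before '>>', '>=' and '>'.
assert [t.literal for t in _fixed_tokens_by_first[">"]] == [">>=", ">>", ">=", ">"]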

_line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line\W)|([ \t]*\d+)")
_pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma\W")
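

# A minimal usage sketch (added as an example; not part of pycparser's public
# API). It wires the lexer up with permissive placeholder callbacks and dumps
# the tokens of a small C snippet; the helper and callback bodies below are
# hypothetical.
if __name__ == "__main__":
    def _print_error(msg: str, line: int, column: int) -> None:
        print(f"lex error at {line}:{column}: {msg}")

    lexer = CLexer(
        error_func=_print_error,
        on_lbrace_func=lambda: None,          # no scope tracking in this demo
        on_rbrace_func=lambda: None,
        type_lookup_func=lambda name: False,  # treat every identifier as ID
    )
    lexer.input("int main(void) { return 42; }")
    while (tok := lexer.token()) is not None:
        print(tok)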