Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pycparser/c_lexer.py: 75%


# ------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
import re
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Dict, List, Optional, Tuple


@dataclass(slots=True)
class Token:
    type: str
    value: str
    lineno: int
    column: int


class CLexer:
    """A standalone lexer for C.

    The lexer takes the following callback functions as parameters during
    construction:
    error_func:
        Called with (msg, line, column) on lexing errors.
    on_lbrace_func:
        Called when an LBRACE token is produced (used for scope tracking).
    on_rbrace_func:
        Called when an RBRACE token is produced (used for scope tracking).
    type_lookup_func:
        Called with an identifier name; expected to return True if it is
        a typedef name and should be tokenized as TYPEID.

    Call input(text) to initialize lexing, and then keep calling token() to
    get the next token, until it returns None (at end of input).
    """

    def __init__(
        self,
        error_func: Callable[[str, int, int], None],
        on_lbrace_func: Callable[[], None],
        on_rbrace_func: Callable[[], None],
        type_lookup_func: Callable[[str], bool],
    ) -> None:
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self._init_state()

    def input(self, text: str, filename: str = "") -> None:
        """Initialize the lexer to the given input text.

        filename is an optional name identifying the initial file from which
        the input comes. The lexer may modify it if #line directives are
        encountered.
        """
        self._init_state()
        self._lexdata = text
        self._filename = filename

    def _init_state(self) -> None:
        self._lexdata = ""
        self._filename = ""
        self._pos = 0
        self._line_start = 0
        self._pending_tok: Optional[Token] = None
        self._lineno = 1

    @property
    def filename(self) -> str:
        return self._filename

    def token(self) -> Optional[Token]:
        # Lexing strategy overview:
        #
        # - We maintain a current position (self._pos), line number, and the
        #   byte offset of the current line start. The lexer is a simple loop
        #   that skips whitespace/newlines and emits one token per call.
        #
        # - A small amount of logic is handled manually before regex matching:
        #
        #   * Preprocessor-style directives: if we see '#', we check whether
        #     it's a #line or #pragma directive and consume it inline. #line
        #     updates lineno/filename and produces no tokens. #pragma can
        #     yield both PPPRAGMA and PPPRAGMASTR, but token() returns a
        #     single token, so we stash the PPPRAGMASTR as _pending_tok to
        #     return on the next token() call. Otherwise we return PPHASH.
        #   * Newlines update lineno/line-start tracking so tokens can record
        #     accurate columns.
        #
        # - The bulk of tokens are recognized in _match_token:
        #
        #   * _regex_rules: regex patterns for identifiers, literals, and
        #     other complex tokens (including error-producing patterns). The
        #     lexer uses a combined _regex_master to scan all options at the
        #     same time.
        #   * _fixed_tokens: exact string matches for operators and
        #     punctuation, resolved by longest match.
        #
        # - Error patterns call the error callback and advance minimally,
        #   which keeps lexing resilient while reporting useful diagnostics.
        text = self._lexdata
        n = len(text)

        if self._pending_tok is not None:
            tok = self._pending_tok
            self._pending_tok = None
            return tok

        while self._pos < n:
            match text[self._pos]:
                case " " | "\t":
                    self._pos += 1
                case "\n":
                    self._lineno += 1
                    self._pos += 1
                    self._line_start = self._pos
                case "#":
                    if _line_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        self._handle_ppline()
                        continue
                    if _pragma_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        toks = self._handle_pppragma()
                        if len(toks) > 1:
                            self._pending_tok = toks[1]
                        if len(toks) > 0:
                            return toks[0]
                        continue
                    tok = self._make_token("PPHASH", "#", self._pos)
                    self._pos += 1
                    return tok
                case _:
                    if tok := self._match_token():
                        return tok
                    else:
                        continue

    def _match_token(self) -> Optional[Token]:
        """Match one token at the current position.

        Returns a Token on success, or None if no token could be matched and
        an error was reported. This method always advances _pos by the matched
        length, or by 1 on error/no-match.
        """
        text = self._lexdata
        pos = self._pos
        # We pick the longest match between:
        # - the master regex (identifiers, literals, error patterns, etc.)
        # - fixed operator/punctuator literals from the bucket for text[pos]
        #
        # The longest match is required to ensure we properly lex something
        # like ".123" (a floating-point constant) as a single entity (with
        # FLOAT_CONST), rather than a PERIOD followed by a number.
        #
        # The fixed-literal buckets are already length-sorted, so within that
        # bucket we can take the first match. However, we still compare its
        # length to the regex match because the regex may have matched a
        # longer token that should take precedence.
        best = None

        if m := _regex_master.match(text, pos):
            tok_type = m.lastgroup
            # All master-regex alternatives are named; lastgroup shouldn't be None.
            assert tok_type is not None
            value = m.group(tok_type)
            length = len(value)
            action, msg = _regex_actions[tok_type]
            best = (length, tok_type, value, action, msg)

        if bucket := _fixed_tokens_by_first.get(text[pos]):
            for entry in bucket:
                if text.startswith(entry.literal, pos):
                    length = len(entry.literal)
                    if best is None or length > best[0]:
                        best = (
                            length,
                            entry.tok_type,
                            entry.literal,
                            _RegexAction.TOKEN,
                            None,
                        )
                    break

        if best is None:
            self._error(f"Illegal character {repr(text[pos])}", pos)
            self._pos += 1
            return None

        length, tok_type, value, action, msg = best
        match action:
            case _RegexAction.TOKEN:
                pass
            case _RegexAction.ERROR:
                if tok_type == "BAD_CHAR_CONST":
                    msg = f"Invalid char constant {value}"
                # All other ERROR rules provide a message.
                assert msg is not None
                self._error(msg, pos)
                self._pos += max(1, length)
                return None
            case _RegexAction.ID:
                tok_type = _keyword_map.get(value, "ID")
                if tok_type == "ID" and self.type_lookup_func(value):
                    tok_type = "TYPEID"
            case _:
                raise RuntimeError("unreachable")

        tok = self._make_token(tok_type, value, pos)
        self._pos += length

        if tok.type == "LBRACE":
            self.on_lbrace_func()
        elif tok.type == "RBRACE":
            self.on_rbrace_func()

        return tok

    def _make_token(self, tok_type: str, value: str, pos: int) -> Token:
        """Create a Token at an absolute input position.

        Expects tok_type/value and the absolute byte offset pos in the current
        input. Does not advance lexer state; callers manage _pos themselves.
        Returns a Token with lineno/column computed from current line tracking.
        """
        column = pos - self._line_start + 1
        tok = Token(tok_type, value, self._lineno, column)
        return tok

    def _error(self, msg: str, pos: int) -> None:
        column = pos - self._line_start + 1
        self.error_func(msg, self._lineno, column)

    def _handle_ppline(self) -> None:
        # Since #line directives aren't supposed to return tokens but should
        # only affect the lexer's state (update line/filename for coords),
        # this method does a bit of parsing on its own. It doesn't return
        # anything, but its side effect is to update self._pos past the
        # directive, and potentially update self._lineno and self._filename,
        # based on the directive's contents.
        #
        # Accepted #line forms from preprocessors:
        #   - "#line 66 \"kwas\\df.h\""
        #   - "# 9"
        #   - "#line 10 \"include/me.h\" 1 2 3" (extra numeric flags)
        #   - "# 1 \"file.h\" 3"
        # Errors we must report:
        #   - "#line \"file.h\"" (filename before line number)
        #   - "#line df" (garbage instead of number/string)
        #
        # We scan the directive line once (after an optional 'line' keyword),
        # validating the order: NUMBER, optional STRING, then any NUMBERs.
        # The NUMBERs tail is only accepted if a filename STRING was present.
        text = self._lexdata
        n = len(text)
        line_end = text.find("\n", self._pos)
        if line_end == -1:
            line_end = n
        line = text[self._pos : line_end]
        pos = 0
        line_len = len(line)

        def skip_ws() -> None:
            nonlocal pos
            while pos < line_len and line[pos] in " \t":
                pos += 1

        skip_ws()
        if line.startswith("line", pos):
            pos += 4

        def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None:
            if pp_line is None:
                self._error("line number missing in #line", self._pos + line_len)
            else:
                self._lineno = int(pp_line)
                if pp_filename is not None:
                    self._filename = pp_filename
            self._pos = line_end + 1
            self._line_start = self._pos

        def fail(msg: str, offset: int) -> None:
            self._error(msg, self._pos + offset)
            self._pos = line_end + 1
            self._line_start = self._pos

        skip_ws()
        if pos >= line_len:
            success(None, None)
            return
        if line[pos] == '"':
            fail("filename before line number in #line", pos)
            return

        m = re.match(_decimal_constant, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_line = m.group(0)
        pos += len(pp_line)
        skip_ws()
        if pos >= line_len:
            success(pp_line, None)
            return

        if line[pos] != '"':
            fail("invalid #line directive", pos)
            return

        m = re.match(_string_literal, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_filename = m.group(0).lstrip('"').rstrip('"')
        pos += len(m.group(0))

        # Consume arbitrary sequence of numeric flags after the directive
        while True:
            skip_ws()
            if pos >= line_len:
                break
            m = re.match(_decimal_constant, line[pos:])
            if not m:
                fail("invalid #line directive", pos)
                return
            pos += len(m.group(0))

        success(pp_line, pp_filename)

    def _handle_pppragma(self) -> List[Token]:
        # Parse a full #pragma line; returns a list of 1 or 2 tokens -
        # PPPRAGMA and an optional PPPRAGMASTR. If an empty list is returned,
        # it means an error occurred, or we're at the end of input.
        #
        # Examples:
        #   - "#pragma" -> PPPRAGMA only
        #   - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
        #   - "# pragma omp parallel private(th_id)" ->
        #         PPPRAGMA, PPPRAGMASTR("omp parallel private(th_id)")
        #   - "#\tpragma {pack: 2, smack: 3}" ->
        #         PPPRAGMA, PPPRAGMASTR("{pack: 2, smack: 3}")
        text = self._lexdata
        n = len(text)
        pos = self._pos

        while pos < n and text[pos] in " \t":
            pos += 1
        if pos >= n:
            self._pos = pos
            return []

        if not text.startswith("pragma", pos):
            self._error("invalid #pragma directive", pos)
            self._pos = pos + 1
            return []

        pragma_pos = pos
        pos += len("pragma")
        toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]

        while pos < n and text[pos] in " \t":
            pos += 1

        start = pos
        while pos < n and text[pos] != "\n":
            pos += 1
        if pos > start:
            toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
        if pos < n and text[pos] == "\n":
            self._lineno += 1
            pos += 1
            self._line_start = pos
        self._pos = pos
        return toks

##
## Reserved keywords
##
_keywords: Tuple[str, ...] = (
    "AUTO",
    "BREAK",
    "CASE",
    "CHAR",
    "CONST",
    "CONTINUE",
    "DEFAULT",
    "DO",
    "DOUBLE",
    "ELSE",
    "ENUM",
    "EXTERN",
    "FLOAT",
    "FOR",
    "GOTO",
    "IF",
    "INLINE",
    "INT",
    "LONG",
    "REGISTER",
    "OFFSETOF",
    "RESTRICT",
    "RETURN",
    "SHORT",
    "SIGNED",
    "SIZEOF",
    "STATIC",
    "STRUCT",
    "SWITCH",
    "TYPEDEF",
    "UNION",
    "UNSIGNED",
    "VOID",
    "VOLATILE",
    "WHILE",
    "__INT128",
    "_BOOL",
    "_COMPLEX",
    "_NORETURN",
    "_THREAD_LOCAL",
    "_STATIC_ASSERT",
    "_ATOMIC",
    "_ALIGNOF",
    "_ALIGNAS",
    "_PRAGMA",
)

_keyword_map: Dict[str, str] = {}

for keyword in _keywords:
    # Keywords from the newer C standards are mixed-case, like _Bool, _Alignas, etc.
    if keyword.startswith("_") and len(keyword) > 1 and keyword[1].isalpha():
        _keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
    else:
        _keyword_map[keyword.lower()] = keyword
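
# Illustrative self-checks (added here as an example; not part of the original
# module): the mixed-case rule above maps "_BOOL" to the source spelling
# "_Bool", while "__INT128" (second character not alphabetic) and ordinary
# keywords are simply lowercased.
assert _keyword_map["_Bool"] == "_BOOL"
assert _keyword_map["__int128"] == "__INT128"
assert _keyword_map["while"] == "WHILE"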

##
## Regexes for use in tokens
##

# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
_identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"

_hex_prefix = "0[xX]"
_hex_digits = "[0-9a-fA-F]+"
_bin_prefix = "0[bB]"
_bin_digits = "[01]+"

# integer constants (K&R2: A.2.5.1)
_integer_suffix_opt = (
    r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
_decimal_constant = (
    "(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
)
_octal_constant = "0[0-7]*" + _integer_suffix_opt
_hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
_bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt

_bad_octal_constant = "0[0-7]*[89]"

# comments are not supported
_unsupported_c_style_comment = r"\/\*"
_unsupported_cxx_style_comment = r"\/\/"

# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file)
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
#   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
#   decimal_escape = r"""(\d+)"""
#   hex_escape = r"""(x[0-9a-fA-F]+)"""
#   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed
# backtracking (https://github.com/eliben/pycparser/issues/61):
#
# - \x was removed from simple_escape, unless it was not followed by a hex
#   digit, to avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next
#   character (if any) is not hex.
# - decimal_escape allows one or more decimal characters, but requires that
#   the next character (if any) is not a decimal.
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with
#   the permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing
# each ambiguous escape sequence in multiple ways. e.g. `\123` could be
# parsed as `\1`+`23`, `\12`+`3`, and `\123`.

_simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
_decimal_escape = r"""(\d+)(?!\d)"""
_hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
_bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

_escape_sequence = (
    r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
)

# The complicated regex with lookahead above might be slow for strings;
# because all of the valid escapes (including \x) allow 0 or more non-escaped
# characters after the first character, simple_escape + decimal_escape +
# hex_escape is simplified for strings to:
_escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

_cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
_char_const = "'" + _cconst_char + "'"
_wchar_const = "L" + _char_const
_u8char_const = "u8" + _char_const
_u16char_const = "u" + _char_const
_u32char_const = "U" + _char_const
_multicharacter_constant = "'" + _cconst_char + "{2,4}'"
_unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
_bad_char_const = (
    r"""('""" + _cconst_char + """[^'\n]+')|('')|('""" + _bad_escape + r"""[^'\n]*')"""
)

# string literals (K&R2: A.2.6)
_string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
_string_literal = '"' + _string_char + '*"'
_wstring_literal = "L" + _string_literal
_u8string_literal = "u8" + _string_literal
_u16string_literal = "u" + _string_literal
_u32string_literal = "U" + _string_literal
_bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'
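
# Illustrative self-checks (an added example, not part of the original
# module): the character-constant pattern accepts a plain or escaped
# character, multicharacter constants take 2-4 characters, and string
# literals accept the simplified in-string escapes.
assert re.fullmatch(_char_const, "'a'")
assert re.fullmatch(_char_const, r"'\n'")
assert re.fullmatch(_multicharacter_constant, "'ab'")
assert re.fullmatch(_string_literal, r'"hi\tthere"')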

# floating constants (K&R2: A.2.5.3)
_exponent_part = r"""([eE][-+]?[0-9]+)"""
_fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
_floating_constant = (
    "(((("
    + _fractional_constant
    + ")"
    + _exponent_part
    + "?)|([0-9]+"
    + _exponent_part
    + "))[FfLl]?)"
)
_binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
_hex_fractional_constant = (
    "(((" + _hex_digits + r""")?\.""" + _hex_digits + ")|(" + _hex_digits + r"""\.))"""
)
_hex_floating_constant = (
    "("
    + _hex_prefix
    + "("
    + _hex_digits
    + "|"
    + _hex_fractional_constant
    + ")"
    + _binary_exponent_part
    + "[FfLl]?)"
)
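
# Illustrative self-checks (an added example, not part of the original
# module): decimal floats need a fraction or an exponent, while hex floats
# always need a binary exponent introduced by 'p' or 'P'.
assert re.fullmatch(_floating_constant, "1.5e-3f")
assert re.fullmatch(_floating_constant, ".123")
assert re.fullmatch(_hex_floating_constant, "0x1.8p3")
assert not re.fullmatch(_hex_floating_constant, "0x1.8")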

class _RegexAction(Enum):
    TOKEN = 0
    ID = 1
    ERROR = 2


@dataclass(frozen=True)
class _RegexRule:
    # tok_type: name of the token emitted for a match
    # regex_pattern: the raw regex (no anchors) to match at the current position
    # action: TOKEN for normal tokens, ID for identifiers, ERROR to report an error
    # error_message: message used for ERROR entries
    tok_type: str
    regex_pattern: str
    action: _RegexAction
    error_message: Optional[str]


_regex_rules: List[_RegexRule] = [
    _RegexRule(
        "UNSUPPORTED_C_STYLE_COMMENT",
        _unsupported_c_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "UNSUPPORTED_CXX_STYLE_COMMENT",
        _unsupported_cxx_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "BAD_STRING_LITERAL",
        _bad_string_literal,
        _RegexAction.ERROR,
        "String contains invalid escape code",
    ),
    _RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None),
    _RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None),
    _RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None),
    _RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None),
    _RegexRule(
        "BAD_CONST_OCT",
        _bad_octal_constant,
        _RegexAction.ERROR,
        "Invalid octal constant",
    ),
    _RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None),
    _RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None),
    _RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None),
    _RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None),
    _RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None),
    _RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None),
    _RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"),
    _RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None),
    _RegexRule("ID", _identifier, _RegexAction.ID, None),
]

_regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {}
_regex_pattern_parts: List[str] = []
for _rule in _regex_rules:
    _regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message)
    _regex_pattern_parts.append(f"(?P<{_rule.tok_type}>{_rule.regex_pattern})")
# The master regex is a single alternation of all token patterns, each wrapped
# in a named group. We match once at the current position and then use
# `lastgroup` to recover which token kind fired; this avoids iterating over all
# regexes on every character while keeping the same token-level semantics.
_regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))
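
# Illustrative self-checks (an added example, not part of the original
# module): matching at a position and reading `lastgroup` recovers the rule
# that fired; keyword/typedef resolution happens later, in _match_token.
assert _regex_master.match("0x1Fu").lastgroup == "INT_CONST_HEX"
assert _regex_master.match("while (x)").lastgroup == "ID"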

@dataclass(frozen=True)
class _FixedToken:
    tok_type: str
    literal: str


_fixed_tokens: List[_FixedToken] = [
    _FixedToken("ELLIPSIS", "..."),
    _FixedToken("LSHIFTEQUAL", "<<="),
    _FixedToken("RSHIFTEQUAL", ">>="),
    _FixedToken("PLUSPLUS", "++"),
    _FixedToken("MINUSMINUS", "--"),
    _FixedToken("ARROW", "->"),
    _FixedToken("LAND", "&&"),
    _FixedToken("LOR", "||"),
    _FixedToken("LSHIFT", "<<"),
    _FixedToken("RSHIFT", ">>"),
    _FixedToken("LE", "<="),
    _FixedToken("GE", ">="),
    _FixedToken("EQ", "=="),
    _FixedToken("NE", "!="),
    _FixedToken("TIMESEQUAL", "*="),
    _FixedToken("DIVEQUAL", "/="),
    _FixedToken("MODEQUAL", "%="),
    _FixedToken("PLUSEQUAL", "+="),
    _FixedToken("MINUSEQUAL", "-="),
    _FixedToken("ANDEQUAL", "&="),
    _FixedToken("OREQUAL", "|="),
    _FixedToken("XOREQUAL", "^="),
    _FixedToken("EQUALS", "="),
    _FixedToken("PLUS", "+"),
    _FixedToken("MINUS", "-"),
    _FixedToken("TIMES", "*"),
    _FixedToken("DIVIDE", "/"),
    _FixedToken("MOD", "%"),
    _FixedToken("OR", "|"),
    _FixedToken("AND", "&"),
    _FixedToken("NOT", "~"),
    _FixedToken("XOR", "^"),
    _FixedToken("LNOT", "!"),
    _FixedToken("LT", "<"),
    _FixedToken("GT", ">"),
    _FixedToken("CONDOP", "?"),
    _FixedToken("LPAREN", "("),
    _FixedToken("RPAREN", ")"),
    _FixedToken("LBRACKET", "["),
    _FixedToken("RBRACKET", "]"),
    _FixedToken("LBRACE", "{"),
    _FixedToken("RBRACE", "}"),
    _FixedToken("COMMA", ","),
    _FixedToken("PERIOD", "."),
    _FixedToken("SEMI", ";"),
    _FixedToken("COLON", ":"),
]

# To avoid scanning all fixed tokens on every character, we bucket them by the
# first character. When matching at position i, we only look at the bucket for
# text[i], and we pre-sort that bucket by token length so the first match is
# also the longest. This preserves longest-match semantics (e.g. '>>=' before
# '>>' before '>') while reducing the number of comparisons.
_fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}
for _entry in _fixed_tokens:
    _fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)
for _bucket in _fixed_tokens_by_first.values():
    _bucket.sort(key=lambda item: len(item.literal), reverse=True)
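
# Illustrative self-check (an added example, not part of the original
# module): within the '>' bucket, longer literals come first, so '>>=' is
# tried before '>>', '>=' and '>'.
assert [t.literal for t in _fixed_tokens_by_first[">"]] == [">>=", ">>", ">=", ">"]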

_line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line\W)|([ \t]*\d+)")
_pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma\W")
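

# A minimal usage sketch (added as an example; not part of pycparser's public
# API). It wires the lexer up with permissive placeholder callbacks and dumps
# the tokens of a small C snippet; the helper and callback bodies below are
# hypothetical.
if __name__ == "__main__":
    def _print_error(msg: str, line: int, column: int) -> None:
        print(f"lex error at {line}:{column}: {msg}")

    lexer = CLexer(
        error_func=_print_error,
        on_lbrace_func=lambda: None,          # no scope tracking in this demo
        on_rbrace_func=lambda: None,
        type_lookup_func=lambda name: False,  # treat every identifier as ID
    )
    lexer.input("int main(void) { return 42; }")
    while (tok := lexer.token()) is not None:
        print(tok)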