Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pycparser/c_lexer.py: 74%



# ------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
import re
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Dict, List, Optional, Tuple


@dataclass(slots=True)
class _Token:
    type: str
    value: str
    lineno: int
    column: int


class CLexer:
    """A standalone lexer for C.

    Parameters for construction:
        error_func:
            Called with (msg, line, column) on lexing errors.
        on_lbrace_func:
            Called when an LBRACE token is produced (used for scope tracking).
        on_rbrace_func:
            Called when an RBRACE token is produced (used for scope tracking).
        type_lookup_func:
            Called with an identifier name; expected to return True if it is
            a typedef name and should be tokenized as TYPEID.

    Call input(text) to initialize lexing, and then keep calling token() to
    get the next token, until it returns None (at end of input).
    """
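    # A minimal usage sketch (illustrative only, not part of the original
    # module; the no-op lambda callbacks below are placeholder assumptions):
    #
    #   lexer = CLexer(
    #       error_func=lambda msg, line, col: print(f"{line}:{col}: {msg}"),
    #       on_lbrace_func=lambda: None,
    #       on_rbrace_func=lambda: None,
    #       type_lookup_func=lambda name: False,
    #   )
    #   lexer.input("int x = 42;")
    #   while (tok := lexer.token()) is not None:
    #       print(tok.type, tok.value)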

    def __init__(
        self,
        error_func: Callable[[str, int, int], None],
        on_lbrace_func: Callable[[], None],
        on_rbrace_func: Callable[[], None],
        type_lookup_func: Callable[[str], bool],
    ) -> None:
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self._init_state()

    def input(self, text: str, filename: str = "") -> None:
        """Initialize the lexer to the given input text.

        filename is an optional name identifying the file from which the input
        comes. The lexer can modify it if #line directives are encountered.
        """
        self._init_state()
        self._lexdata = text
        self._filename = filename

    def _init_state(self) -> None:
        self._lexdata = ""
        self._filename = ""
        self._pos = 0
        self._line_start = 0
        self._pending_tok: Optional[_Token] = None
        self._lineno = 1

    @property
    def filename(self) -> str:
        return self._filename

    def token(self) -> Optional[_Token]:
        # Lexing strategy overview:
        #
        # - We maintain a current position (self._pos), line number, and the
        #   byte offset of the current line start. The lexer is a simple loop
        #   that skips whitespace/newlines and emits one token per call.
        # - A small amount of logic is handled manually before regex matching:
        #
        #   * Preprocessor-style directives: if we see '#', we check whether
        #     it's a #line or #pragma directive and consume it inline. #line
        #     updates lineno/filename and produces no tokens. #pragma can yield
        #     both PPPRAGMA and PPPRAGMASTR, but token() returns a single token,
        #     so we stash the PPPRAGMASTR as _pending_tok to return on the next
        #     token() call. Otherwise we return PPHASH.
        #   * Newlines update lineno/line-start tracking so tokens can record
        #     accurate columns.
        #
        # - The bulk of tokens are recognized in _match_token:
        #
        #   * _regex_rules: regex patterns for identifiers, literals, and other
        #     complex tokens (including error-producing patterns). The lexer
        #     uses a combined _regex_master so that all patterns are tried in
        #     a single scan.
        #   * _fixed_tokens: exact string matches for operators and punctuation,
        #     resolved by longest match.
        #
        # - Error patterns call the error callback and advance minimally, which
        #   keeps lexing resilient while reporting useful diagnostics.
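        # Illustrative example: lexing "#pragma once\nint x;" yields PPPRAGMA
        # from one token() call, the stashed PPPRAGMASTR("once") from the next,
        # and then INT, ID("x"), SEMI from the second line (assuming
        # type_lookup_func does not report "x" as a typedef name).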

        text = self._lexdata
        n = len(text)

        if self._pending_tok is not None:
            tok = self._pending_tok
            self._pending_tok = None
            return tok

        while self._pos < n:
            match text[self._pos]:
                case " " | "\t":
                    self._pos += 1
                case "\n":
                    self._lineno += 1
                    self._pos += 1
                    self._line_start = self._pos
                case "#":
                    if _line_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        self._handle_ppline()
                        continue
                    if _pragma_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        toks = self._handle_pppragma()
                        if len(toks) > 1:
                            self._pending_tok = toks[1]
                        if len(toks) > 0:
                            return toks[0]
                        continue
                    tok = self._make_token("PPHASH", "#", self._pos)
                    self._pos += 1
                    return tok
                case _:
                    if tok := self._match_token():
                        return tok
                    else:
                        continue

    def _match_token(self) -> Optional[_Token]:
        """Match one token at the current position.

        Returns a Token on success, or None if no token could be matched and
        an error was reported. This method always advances _pos by the matched
        length, or by 1 on error/no-match.
        """
        text = self._lexdata
        pos = self._pos
        # We pick the longest match between:
        # - the master regex (identifiers, literals, error patterns, etc.)
        # - fixed operator/punctuator literals from the bucket for text[pos]
        #
        # The longest match is required to ensure we properly lex something
        # like ".123" (a floating-point constant) as a single entity (with
        # FLOAT_CONST), rather than a PERIOD followed by a number.
        #
        # The fixed-literal buckets are already length-sorted, so within that
        # bucket we can take the first match. However, we still compare its
        # length to the regex match because the regex may have matched a longer
        # token that should take precedence.
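        # Illustrative example: at ">>=" the '>' bucket yields RSHIFTEQUAL
        # (length 3) and the master regex has no alternative matching '>', so
        # the fixed literal wins; at ".123" the master regex yields FLOAT_CONST
        # (length 4), which beats PERIOD (length 1) from the '.' bucket.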

        best = None

        if m := _regex_master.match(text, pos):
            tok_type = m.lastgroup
            # All master-regex alternatives are named; lastgroup shouldn't be None.
            assert tok_type is not None
            value = m.group(tok_type)
            length = len(value)
            action, msg = _regex_actions[tok_type]
            best = (length, tok_type, value, action, msg)

        if bucket := _fixed_tokens_by_first.get(text[pos]):
            for entry in bucket:
                if text.startswith(entry.literal, pos):
                    length = len(entry.literal)
                    if best is None or length > best[0]:
                        best = (
                            length,
                            entry.tok_type,
                            entry.literal,
                            _RegexAction.TOKEN,
                            None,
                        )
                    break

        if best is None:
            msg = f"Illegal character {repr(text[pos])}"
            self._error(msg, pos)
            self._pos += 1
            return None

        length, tok_type, value, action, msg = best
        if action == _RegexAction.ERROR:
            if tok_type == "BAD_CHAR_CONST":
                msg = f"Invalid char constant {value}"
            # All other ERROR rules provide a message.
            assert msg is not None
            self._error(msg, pos)
            self._pos += max(1, length)
            return None

        if action == _RegexAction.ID:
            tok_type = _keyword_map.get(value, "ID")
            if tok_type == "ID" and self.type_lookup_func(value):
                tok_type = "TYPEID"

        tok = self._make_token(tok_type, value, pos)
        self._pos += length

        if tok.type == "LBRACE":
            self.on_lbrace_func()
        elif tok.type == "RBRACE":
            self.on_rbrace_func()

        return tok

    def _make_token(self, tok_type: str, value: str, pos: int) -> _Token:
        """Create a Token at an absolute input position.

        Expects tok_type/value and the absolute byte offset pos in the current
        input. Does not advance lexer state; callers manage _pos themselves.
        Returns a Token with lineno/column computed from current line tracking.
        """
        column = pos - self._line_start + 1
        tok = _Token(tok_type, value, self._lineno, column)
        return tok

    def _error(self, msg: str, pos: int) -> None:
        column = pos - self._line_start + 1
        self.error_func(msg, self._lineno, column)

    def _handle_ppline(self) -> None:
        # Since #line directives aren't supposed to return tokens but should
        # only affect the lexer's state (update line/filename for coords), this
        # method does a bit of parsing on its own. It doesn't return anything,
        # but its side effect is to update self._pos past the directive, and
        # potentially update self._lineno and self._filename, based on the
        # directive's contents.
        #
        # Accepted #line forms from preprocessors:
        #   - "#line 66 \"kwas\\df.h\""
        #   - "# 9"
        #   - "#line 10 \"include/me.h\" 1 2 3"  (extra numeric flags)
        #   - "# 1 \"file.h\" 3"
        # Errors we must report:
        #   - "#line \"file.h\""  (filename before line number)
        #   - "#line df"  (garbage instead of number/string)
        #
        # We scan the directive line once (after an optional 'line' keyword),
        # validating the order: NUMBER, optional STRING, then any NUMBERs.
        # The NUMBERs tail is only accepted if a filename STRING was present.
        text = self._lexdata
        n = len(text)
        line_end = text.find("\n", self._pos)
        if line_end == -1:
            line_end = n
        line = text[self._pos : line_end]
        pos = 0
        line_len = len(line)

        def skip_ws() -> None:
            nonlocal pos
            while pos < line_len and line[pos] in " \t":
                pos += 1

        skip_ws()
        if line.startswith("line", pos):
            pos += 4

        def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None:
            if pp_line is None:
                self._error("line number missing in #line", self._pos + line_len)
            else:
                self._lineno = int(pp_line)
                if pp_filename is not None:
                    self._filename = pp_filename
            self._pos = line_end + 1
            self._line_start = self._pos

        def fail(msg: str, offset: int) -> None:
            self._error(msg, self._pos + offset)
            self._pos = line_end + 1
            self._line_start = self._pos

        skip_ws()
        if pos >= line_len:
            success(None, None)
            return
        if line[pos] == '"':
            fail("filename before line number in #line", pos)
            return

        m = re.match(_decimal_constant, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_line = m.group(0)
        pos += len(pp_line)
        skip_ws()
        if pos >= line_len:
            success(pp_line, None)
            return

        if line[pos] != '"':
            fail("invalid #line directive", pos)
            return

        m = re.match(_string_literal, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_filename = m.group(0).lstrip('"').rstrip('"')
        pos += len(m.group(0))

        # Consume arbitrary sequence of numeric flags after the directive
        while True:
            skip_ws()
            if pos >= line_len:
                break
            m = re.match(_decimal_constant, line[pos:])
            if not m:
                fail("invalid #line directive", pos)
                return
            pos += len(m.group(0))

        success(pp_line, pp_filename)

    def _handle_pppragma(self) -> List[_Token]:
        # Parse a full #pragma line; returns a list of 1 or 2 tokens:
        # PPPRAGMA and an optional PPPRAGMASTR. An empty list means an error
        # occurred, or that we're at the end of input.
        #
        # Examples:
        #   - "#pragma" -> PPPRAGMA only
        #   - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
        #   - "# pragma omp parallel private(th_id)"
        #       -> PPPRAGMA, PPPRAGMASTR("omp parallel private(th_id)")
        #   - "#\tpragma {pack: 2, smack: 3}"
        #       -> PPPRAGMA, PPPRAGMASTR("{pack: 2, smack: 3}")
        text = self._lexdata
        n = len(text)
        pos = self._pos

        while pos < n and text[pos] in " \t":
            pos += 1
        if pos >= n:
            self._pos = pos
            return []

        if not text.startswith("pragma", pos):
            self._error("invalid #pragma directive", pos)
            self._pos = pos + 1
            return []

        pragma_pos = pos
        pos += len("pragma")
        toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]

        while pos < n and text[pos] in " \t":
            pos += 1

        start = pos
        while pos < n and text[pos] != "\n":
            pos += 1
        if pos > start:
            toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
        if pos < n and text[pos] == "\n":
            self._lineno += 1
            pos += 1
            self._line_start = pos
        self._pos = pos
        return toks

377## Reserved keywords 

378## 

379_keywords: Tuple[str, ...] = ( 

380 "AUTO", 

381 "BREAK", 

382 "CASE", 

383 "CHAR", 

384 "CONST", 

385 "CONTINUE", 

386 "DEFAULT", 

387 "DO", 

388 "DOUBLE", 

389 "ELSE", 

390 "ENUM", 

391 "EXTERN", 

392 "FLOAT", 

393 "FOR", 

394 "GOTO", 

395 "IF", 

396 "INLINE", 

397 "INT", 

398 "LONG", 

399 "REGISTER", 

400 "OFFSETOF", 

401 "RESTRICT", 

402 "RETURN", 

403 "SHORT", 

404 "SIGNED", 

405 "SIZEOF", 

406 "STATIC", 

407 "STRUCT", 

408 "SWITCH", 

409 "TYPEDEF", 

410 "UNION", 

411 "UNSIGNED", 

412 "VOID", 

413 "VOLATILE", 

414 "WHILE", 

415 "__INT128", 

416 "_BOOL", 

417 "_COMPLEX", 

418 "_NORETURN", 

419 "_THREAD_LOCAL", 

420 "_STATIC_ASSERT", 

421 "_ATOMIC", 

422 "_ALIGNOF", 

423 "_ALIGNAS", 

424 "_PRAGMA", 

425) 

426 

427_keyword_map: Dict[str, str] = {} 

428 

429for keyword in _keywords: 

430 # Keywords from new C standard are mixed-case, like _Bool, _Alignas, etc. 

431 if keyword.startswith("_") and len(keyword) > 1 and keyword[1].isalpha(): 

432 _keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword 

433 else: 

434 _keyword_map[keyword.lower()] = keyword 
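# Illustrative examples of the mapping built above: "while" -> "WHILE",
# "_Bool" -> "_BOOL", and "__int128" -> "__INT128" (the second character of
# "__INT128" is '_', not a letter, so the whole keyword is lower-cased).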

##
## Regexes for use in tokens
##

# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
_identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"

_hex_prefix = "0[xX]"
_hex_digits = "[0-9a-fA-F]+"
_bin_prefix = "0[bB]"
_bin_digits = "[01]+"

# integer constants (K&R2: A.2.5.1)
_integer_suffix_opt = (
    r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
_decimal_constant = (
    "(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
)
_octal_constant = "0[0-7]*" + _integer_suffix_opt
_hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
_bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt

_bad_octal_constant = "0[0-7]*[89]"

# comments are not supported
_unsupported_c_style_comment = r"\/\*"
_unsupported_cxx_style_comment = r"\/\/"

# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file)
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
#   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
#   decimal_escape = r"""(\d+)"""
#   hex_escape = r"""(x[0-9a-fA-F]+)"""
#   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed
# backtracking: (https://github.com/eliben/pycparser/issues/61)
#
# - \x was removed from simple_escape, unless it was not followed by a hex
#   digit, to avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next
#   character (if any) is not hex
# - decimal_escape allows one or more decimal characters, but requires that the
#   next character (if any) is not a decimal
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with the
#   permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing each
# ambiguous escape sequence in multiple ways. e.g. `\123` could be parsed as
# `\1`+`23`, `\12`+`3`, and `\123`.

_simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
_decimal_escape = r"""(\d+)(?!\d)"""
_hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
_bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

_escape_sequence = (
    r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
)

# This complicated regex with lookahead might be slow when applied to whole
# strings. Because every valid escape (including \x) allows zero or more
# non-escaped characters after its first character,
# simple_escape + decimal_escape + hex_escape can be simplified, for scanning
# inside strings, to:

_escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

_cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
_char_const = "'" + _cconst_char + "'"
_wchar_const = "L" + _char_const
_u8char_const = "u8" + _char_const
_u16char_const = "u" + _char_const
_u32char_const = "U" + _char_const
_multicharacter_constant = "'" + _cconst_char + "{2,4}'"
_unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
_bad_char_const = (
    r"""('""" + _cconst_char + """[^'\n]+')|('')|('""" + _bad_escape + r"""[^'\n]*')"""
)

# string literals (K&R2: A.2.6)
_string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
_string_literal = '"' + _string_char + '*"'
_wstring_literal = "L" + _string_literal
_u8string_literal = "u8" + _string_literal
_u16string_literal = "u" + _string_literal
_u32string_literal = "U" + _string_literal
_bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'

# floating constants (K&R2: A.2.5.3)
_exponent_part = r"""([eE][-+]?[0-9]+)"""
_fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
_floating_constant = (
    "(((("
    + _fractional_constant
    + ")"
    + _exponent_part
    + "?)|([0-9]+"
    + _exponent_part
    + "))[FfLl]?)"
)
_binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
_hex_fractional_constant = (
    "(((" + _hex_digits + r""")?\.""" + _hex_digits + ")|(" + _hex_digits + r"""\.))"""
)
_hex_floating_constant = (
    "("
    + _hex_prefix
    + "("
    + _hex_digits
    + "|"
    + _hex_fractional_constant
    + ")"
    + _binary_exponent_part
    + "[FfLl]?)"
)
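# Illustrative examples: _floating_constant matches ".123", "3.14f" and "1e10";
# _hex_floating_constant matches "0x1.8p3" (a hex mantissa with a mandatory
# binary exponent).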

class _RegexAction(Enum):
    TOKEN = 0
    ID = 1
    ERROR = 2


@dataclass(frozen=True)
class _RegexRule:
    # tok_type: name of the token emitted for a match
    # regex_pattern: the raw regex (no anchors) to match at the current position
    # action: TOKEN for normal tokens, ID for identifiers, ERROR to report
    # error_message: message used for ERROR entries
    tok_type: str
    regex_pattern: str
    action: _RegexAction
    error_message: Optional[str]


_regex_rules: List[_RegexRule] = [
    _RegexRule(
        "UNSUPPORTED_C_STYLE_COMMENT",
        _unsupported_c_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "UNSUPPORTED_CXX_STYLE_COMMENT",
        _unsupported_cxx_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "BAD_STRING_LITERAL",
        _bad_string_literal,
        _RegexAction.ERROR,
        "String contains invalid escape code",
    ),
    _RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None),
    _RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None),
    _RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None),
    _RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None),
    _RegexRule(
        "BAD_CONST_OCT",
        _bad_octal_constant,
        _RegexAction.ERROR,
        "Invalid octal constant",
    ),
    _RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None),
    _RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None),
    _RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None),
    _RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None),
    _RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None),
    _RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None),
    _RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"),
    _RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None),
    _RegexRule("ID", _identifier, _RegexAction.ID, None),
]

_regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {}
_regex_pattern_parts: List[str] = []
for _rule in _regex_rules:
    _regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message)
    _regex_pattern_parts.append(f"(?P<{_rule.tok_type}>{_rule.regex_pattern})")
# The master regex is a single alternation of all token patterns, each wrapped
# in a named group. We match once at the current position and then use
# `lastgroup` to recover which token kind fired; this avoids iterating over all
# regexes on every character while keeping the same token-level semantics.
_regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))
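# Illustrative examples of the master regex in action:
#   _regex_master.match("0x1Fu").lastgroup == "INT_CONST_HEX"
#   _regex_master.match('"hi"').lastgroup == "STRING_LITERAL"
# Rule order matters because re alternation takes the first alternative that
# matches: HEX_FLOAT_CONST precedes INT_CONST_HEX so "0x1.8p3" isn't cut at
# the '.', and BAD_CONST_OCT precedes INT_CONST_OCT so "079" is reported as an
# invalid octal constant.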

@dataclass(frozen=True)
class _FixedToken:
    tok_type: str
    literal: str


_fixed_tokens: List[_FixedToken] = [
    _FixedToken("ELLIPSIS", "..."),
    _FixedToken("LSHIFTEQUAL", "<<="),
    _FixedToken("RSHIFTEQUAL", ">>="),
    _FixedToken("PLUSPLUS", "++"),
    _FixedToken("MINUSMINUS", "--"),
    _FixedToken("ARROW", "->"),
    _FixedToken("LAND", "&&"),
    _FixedToken("LOR", "||"),
    _FixedToken("LSHIFT", "<<"),
    _FixedToken("RSHIFT", ">>"),
    _FixedToken("LE", "<="),
    _FixedToken("GE", ">="),
    _FixedToken("EQ", "=="),
    _FixedToken("NE", "!="),
    _FixedToken("TIMESEQUAL", "*="),
    _FixedToken("DIVEQUAL", "/="),
    _FixedToken("MODEQUAL", "%="),
    _FixedToken("PLUSEQUAL", "+="),
    _FixedToken("MINUSEQUAL", "-="),
    _FixedToken("ANDEQUAL", "&="),
    _FixedToken("OREQUAL", "|="),
    _FixedToken("XOREQUAL", "^="),
    _FixedToken("EQUALS", "="),
    _FixedToken("PLUS", "+"),
    _FixedToken("MINUS", "-"),
    _FixedToken("TIMES", "*"),
    _FixedToken("DIVIDE", "/"),
    _FixedToken("MOD", "%"),
    _FixedToken("OR", "|"),
    _FixedToken("AND", "&"),
    _FixedToken("NOT", "~"),
    _FixedToken("XOR", "^"),
    _FixedToken("LNOT", "!"),
    _FixedToken("LT", "<"),
    _FixedToken("GT", ">"),
    _FixedToken("CONDOP", "?"),
    _FixedToken("LPAREN", "("),
    _FixedToken("RPAREN", ")"),
    _FixedToken("LBRACKET", "["),
    _FixedToken("RBRACKET", "]"),
    _FixedToken("LBRACE", "{"),
    _FixedToken("RBRACE", "}"),
    _FixedToken("COMMA", ","),
    _FixedToken("PERIOD", "."),
    _FixedToken("SEMI", ";"),
    _FixedToken("COLON", ":"),
]

# To avoid scanning all fixed tokens on every character, we bucket them by the
# first character. When matching at position i, we only look at the bucket for
# text[i], and we pre-sort that bucket by token length so the first match is
# also the longest. This preserves longest-match semantics (e.g. '>>=' before
# '>>' before '>') while reducing the number of comparisons.
_fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}
for _entry in _fixed_tokens:
    _fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)
for _bucket in _fixed_tokens_by_first.values():
    _bucket.sort(key=lambda item: len(item.literal), reverse=True)
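# Illustrative example: the '>' bucket ends up ordered [">>=", ">>", ">=", ">"],
# so the startswith scan in _match_token finds the longest operator first
# (the sort is stable, so equal-length literals keep their original order).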

_line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line\W)|([ \t]*\d+)")
_pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma\W")