# ------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
import re
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Dict, List, Optional, Tuple


@dataclass(slots=True)
class _Token:
    type: str
    value: str
    lineno: int
    column: int


class CLexer:
    """A standalone lexer for C.

    Parameters for construction:

    error_func:
        Called with (msg, line, column) on lexing errors.
    on_lbrace_func:
        Called when an LBRACE token is produced (used for scope tracking).
    on_rbrace_func:
        Called when an RBRACE token is produced (used for scope tracking).
    type_lookup_func:
        Called with an identifier name; expected to return True if it is
        a typedef name and should be tokenized as TYPEID.

    Call input(text) to initialize lexing, and then keep calling token() to
    get the next token, until it returns None (at end of input).
    """

    def __init__(
        self,
        error_func: Callable[[str, int, int], None],
        on_lbrace_func: Callable[[], None],
        on_rbrace_func: Callable[[], None],
        type_lookup_func: Callable[[str], bool],
    ) -> None:
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self._init_state()

    def input(self, text: str, filename: str = "") -> None:
        """Initialize the lexer to the given input text.

        filename is an optional name identifying the file from which the input
        comes. The lexer can modify it if #line directives are encountered.
        """
        self._init_state()
        self._lexdata = text
        self._filename = filename

    def _init_state(self) -> None:
        self._lexdata = ""
        self._filename = ""
        self._pos = 0
        self._line_start = 0
        self._pending_tok: Optional[_Token] = None
        self._lineno = 1

    @property
    def filename(self) -> str:
        return self._filename

    def token(self) -> Optional[_Token]:
        # Lexing strategy overview:
        #
        # - We maintain a current position (self._pos), line number, and the
        #   byte offset of the current line start. The lexer is a simple loop
        #   that skips whitespace/newlines and emits one token per call.
        # - A small amount of logic is handled manually before regex matching:
        #
        #   * Preprocessor-style directives: if we see '#', we check whether
        #     it's a #line or #pragma directive and consume it inline. #line
        #     updates lineno/filename and produces no tokens. #pragma can yield
        #     both PPPRAGMA and PPPRAGMASTR, but token() returns a single token,
        #     so we stash the PPPRAGMASTR as _pending_tok to return on the next
        #     token() call. Otherwise we return PPHASH.
        #   * Newlines update lineno/line-start tracking so tokens can record
        #     accurate columns.
        #
        # - The bulk of tokens are recognized in _match_token:
        #
        #   * _regex_rules: regex patterns for identifiers, literals, and other
        #     complex tokens (including error-producing patterns). The lexer
        #     uses a single combined _regex_master so all of these alternatives
        #     are tried in one scan.
        #   * _fixed_tokens: exact string matches for operators and punctuation,
        #     resolved by longest match.
        #
        # - Error patterns call the error callback and advance minimally, which
        #   keeps lexing resilient while reporting useful diagnostics.
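        # Illustrative example (assumed behavior): for the input
        #     "#pragma once\nx;"
        # the first token() call returns PPPRAGMA and stashes PPPRAGMASTR
        # ("once") in _pending_tok; the next call returns the stashed token
        # before resuming the normal scan (ID "x", then SEMI).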
        text = self._lexdata
        n = len(text)

        if self._pending_tok is not None:
            tok = self._pending_tok
            self._pending_tok = None
            return tok

        while self._pos < n:
            match text[self._pos]:
                case " " | "\t":
                    self._pos += 1
                case "\n":
                    self._lineno += 1
                    self._pos += 1
                    self._line_start = self._pos
                case "#":
                    if _line_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        self._handle_ppline()
                        continue
                    if _pragma_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        toks = self._handle_pppragma()
                        if len(toks) > 1:
                            self._pending_tok = toks[1]
                        if len(toks) > 0:
                            return toks[0]
                        continue
                    tok = self._make_token("PPHASH", "#", self._pos)
                    self._pos += 1
                    return tok
                case _:
                    if tok := self._match_token():
                        return tok
                    else:
                        continue

    def _match_token(self) -> Optional[_Token]:
        """Match one token at the current position.

        Returns a Token on success, or None if no token could be matched and
        an error was reported. This method always advances _pos by the matched
        length, or by 1 on error/no-match.
        """
        text = self._lexdata
        pos = self._pos
        # We pick the longest match between:
        # - the master regex (identifiers, literals, error patterns, etc.)
        # - fixed operator/punctuator literals from the bucket for text[pos]
        #
        # The longest match is required to ensure we properly lex something
        # like ".123" (a floating-point constant) as a single entity (with
        # FLOAT_CONST), rather than a PERIOD followed by a number.
        #
        # The fixed-literal buckets are already length-sorted, so within that
        # bucket we can take the first match. However, we still compare its
        # length to the regex match because the regex may have matched a longer
        # token that should take precedence.
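        #
        # Worked example (illustrative): at ">>=" no master-regex rule applies,
        # so the '>' bucket supplies its longest literal and we emit
        # RSHIFTEQUAL rather than RSHIFT or GT.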
        best = None

        if m := _regex_master.match(text, pos):
            tok_type = m.lastgroup
            # All master-regex alternatives are named; lastgroup shouldn't be None.
            assert tok_type is not None
            value = m.group(tok_type)
            length = len(value)
            action, msg = _regex_actions[tok_type]
            best = (length, tok_type, value, action, msg)

        if bucket := _fixed_tokens_by_first.get(text[pos]):
            for entry in bucket:
                if text.startswith(entry.literal, pos):
                    length = len(entry.literal)
                    if best is None or length > best[0]:
                        best = (
                            length,
                            entry.tok_type,
                            entry.literal,
                            _RegexAction.TOKEN,
                            None,
                        )
                    break

        if best is None:
            msg = f"Illegal character {repr(text[pos])}"
            self._error(msg, pos)
            self._pos += 1
            return None

        length, tok_type, value, action, msg = best
        if action == _RegexAction.ERROR:
            if tok_type == "BAD_CHAR_CONST":
                msg = f"Invalid char constant {value}"
            # All other ERROR rules provide a message.
            assert msg is not None
            self._error(msg, pos)
            self._pos += max(1, length)
            return None

        if action == _RegexAction.ID:
            tok_type = _keyword_map.get(value, "ID")
            if tok_type == "ID" and self.type_lookup_func(value):
                tok_type = "TYPEID"

        tok = self._make_token(tok_type, value, pos)
        self._pos += length

        if tok.type == "LBRACE":
            self.on_lbrace_func()
        elif tok.type == "RBRACE":
            self.on_rbrace_func()

        return tok

    def _make_token(self, tok_type: str, value: str, pos: int) -> _Token:
        """Create a Token at an absolute input position.

        Expects tok_type/value and the absolute byte offset pos in the current
        input. Does not advance lexer state; callers manage _pos themselves.
        Returns a Token with lineno/column computed from current line tracking.
        """
        column = pos - self._line_start + 1
        tok = _Token(tok_type, value, self._lineno, column)
        return tok

    def _error(self, msg: str, pos: int) -> None:
        column = pos - self._line_start + 1
        self.error_func(msg, self._lineno, column)

    def _handle_ppline(self) -> None:
        # Since #line directives aren't supposed to return tokens but should
        # only affect the lexer's state (update line/filename for coords), this
        # method does a bit of parsing on its own. It doesn't return anything,
        # but its side effect is to update self._pos past the directive, and
        # potentially update self._lineno and self._filename, based on the
        # directive's contents.
        #
        # Accepted #line forms from preprocessors:
        # - "#line 66 \"kwas\\df.h\""
        # - "# 9"
        # - "#line 10 \"include/me.h\" 1 2 3" (extra numeric flags)
        # - "# 1 \"file.h\" 3"
        # Errors we must report:
        # - "#line \"file.h\"" (filename before line number)
        # - "#line df" (garbage instead of number/string)
        #
        # We scan the directive line once (after an optional 'line' keyword),
        # validating the order: NUMBER, optional STRING, then any NUMBERs.
        # The NUMBERs tail is only accepted if a filename STRING was present.
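        # Illustrative effect (assumed): consuming '# 9' sets self._lineno to 9
        # and leaves the filename unchanged, while '#line 66 "dir/f.h"' sets
        # self._lineno to 66 and self._filename to "dir/f.h".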
        text = self._lexdata
        n = len(text)
        line_end = text.find("\n", self._pos)
        if line_end == -1:
            line_end = n
        line = text[self._pos : line_end]
        pos = 0
        line_len = len(line)

        def skip_ws() -> None:
            nonlocal pos
            while pos < line_len and line[pos] in " \t":
                pos += 1

        skip_ws()
        if line.startswith("line", pos):
            pos += 4

        def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None:
            if pp_line is None:
                self._error("line number missing in #line", self._pos + line_len)
            else:
                self._lineno = int(pp_line)
                if pp_filename is not None:
                    self._filename = pp_filename
            self._pos = line_end + 1
            self._line_start = self._pos

        def fail(msg: str, offset: int) -> None:
            self._error(msg, self._pos + offset)
            self._pos = line_end + 1
            self._line_start = self._pos

        skip_ws()
        if pos >= line_len:
            success(None, None)
            return
        if line[pos] == '"':
            fail("filename before line number in #line", pos)
            return

        m = re.match(_decimal_constant, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_line = m.group(0)
        pos += len(pp_line)
        skip_ws()
        if pos >= line_len:
            success(pp_line, None)
            return

        if line[pos] != '"':
            fail("invalid #line directive", pos)
            return

        m = re.match(_string_literal, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_filename = m.group(0).lstrip('"').rstrip('"')
        pos += len(m.group(0))

        # Consume arbitrary sequence of numeric flags after the directive
        while True:
            skip_ws()
            if pos >= line_len:
                break
            m = re.match(_decimal_constant, line[pos:])
            if not m:
                fail("invalid #line directive", pos)
                return
            pos += len(m.group(0))

        success(pp_line, pp_filename)

    def _handle_pppragma(self) -> List[_Token]:
        # Parse a full #pragma line; returns a list of 1 or 2 tokens: PPPRAGMA
        # and an optional PPPRAGMASTR. An empty list means an error occurred,
        # or we're at the end of input.
        #
        # Examples:
        # - "#pragma" -> PPPRAGMA only
        # - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
        # - "# pragma omp parallel private(th_id)" -> PPPRAGMA, PPPRAGMASTR("omp parallel private(th_id)")
        # - "#\tpragma {pack: 2, smack: 3}" -> PPPRAGMA, PPPRAGMASTR("{pack: 2, smack: 3}")
        text = self._lexdata
        n = len(text)
        pos = self._pos

        while pos < n and text[pos] in " \t":
            pos += 1
        if pos >= n:
            self._pos = pos
            return []

        if not text.startswith("pragma", pos):
            self._error("invalid #pragma directive", pos)
            self._pos = pos + 1
            return []

        pragma_pos = pos
        pos += len("pragma")
        toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]

        while pos < n and text[pos] in " \t":
            pos += 1

        start = pos
        while pos < n and text[pos] != "\n":
            pos += 1
        if pos > start:
            toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
        if pos < n and text[pos] == "\n":
            self._lineno += 1
            pos += 1
            self._line_start = pos
        self._pos = pos
        return toks


##
## Reserved keywords
##
_keywords: Tuple[str, ...] = (
    "AUTO",
    "BREAK",
    "CASE",
    "CHAR",
    "CONST",
    "CONTINUE",
    "DEFAULT",
    "DO",
    "DOUBLE",
    "ELSE",
    "ENUM",
    "EXTERN",
    "FLOAT",
    "FOR",
    "GOTO",
    "IF",
    "INLINE",
    "INT",
    "LONG",
    "REGISTER",
    "OFFSETOF",
    "RESTRICT",
    "RETURN",
    "SHORT",
    "SIGNED",
    "SIZEOF",
    "STATIC",
    "STRUCT",
    "SWITCH",
    "TYPEDEF",
    "UNION",
    "UNSIGNED",
    "VOID",
    "VOLATILE",
    "WHILE",
    "__INT128",
    "_BOOL",
    "_COMPLEX",
    "_NORETURN",
    "_THREAD_LOCAL",
    "_STATIC_ASSERT",
    "_ATOMIC",
    "_ALIGNOF",
    "_ALIGNAS",
    "_PRAGMA",
)

_keyword_map: Dict[str, str] = {}

for keyword in _keywords:
    # Keywords from the newer C standards are mixed-case, like _Bool, _Alignas, etc.
    if keyword.startswith("_") and len(keyword) > 1 and keyword[1].isalpha():
        _keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
    else:
        _keyword_map[keyword.lower()] = keyword
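# For example (illustrative), the resulting map contains
# _keyword_map["int"] == "INT" and _keyword_map["_Bool"] == "_BOOL"; identifier
# spellings not in this map lex as ID (or TYPEID via type_lookup_func).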

##
## Regexes for use in tokens
##

# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
_identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"

_hex_prefix = "0[xX]"
_hex_digits = "[0-9a-fA-F]+"
_bin_prefix = "0[bB]"
_bin_digits = "[01]+"

# integer constants (K&R2: A.2.5.1)
_integer_suffix_opt = (
    r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
_decimal_constant = (
    "(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
)
_octal_constant = "0[0-7]*" + _integer_suffix_opt
_hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
_bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt

_bad_octal_constant = "0[0-7]*[89]"

# comments are not supported
_unsupported_c_style_comment = r"\/\*"
_unsupported_cxx_style_comment = r"\/\/"

# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file).
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if it means sometimes also parsing incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
# simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
# decimal_escape = r"""(\d+)"""
# hex_escape = r"""(x[0-9a-fA-F]+)"""
# bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed
# backtracking (https://github.com/eliben/pycparser/issues/61):
#
# - \x was removed from simple_escape, unless it is not followed by a hex
#   digit, to avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next
#   character (if any) is not hex.
# - decimal_escape allows one or more decimal characters, but requires that the
#   next character (if any) is not a decimal.
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with the
#   permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing each
# ambiguous escape sequence in multiple ways. e.g. `\123` could be parsed as
# `\1`+`23`, `\12`+`3`, and `\123`.
_simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
_decimal_escape = r"""(\d+)(?!\d)"""
_hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
_bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

_escape_sequence = (
    r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
)

# This complicated regex with lookahead might be slow for strings, but because
# all of the valid escapes (including \x) allow 0 or more non-escaped
# characters after the first character, simple_escape + decimal_escape +
# hex_escape can be simplified, for use inside strings, to:
_escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

_cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
_char_const = "'" + _cconst_char + "'"
_wchar_const = "L" + _char_const
_u8char_const = "u8" + _char_const
_u16char_const = "u" + _char_const
_u32char_const = "U" + _char_const
_multicharacter_constant = "'" + _cconst_char + "{2,4}'"
_unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
_bad_char_const = (
    r"""('""" + _cconst_char + """[^'\n]+')|('')|('""" + _bad_escape + r"""[^'\n]*')"""
)

# string literals (K&R2: A.2.6)
_string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
_string_literal = '"' + _string_char + '*"'
_wstring_literal = "L" + _string_literal
_u8string_literal = "u8" + _string_literal
_u16string_literal = "u" + _string_literal
_u32string_literal = "U" + _string_literal
_bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'

# floating constants (K&R2: A.2.5.3)
_exponent_part = r"""([eE][-+]?[0-9]+)"""
_fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
_floating_constant = (
    "(((("
    + _fractional_constant
    + ")"
    + _exponent_part
    + "?)|([0-9]+"
    + _exponent_part
    + "))[FfLl]?)"
)
_binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
_hex_fractional_constant = (
    "(((" + _hex_digits + r""")?\.""" + _hex_digits + ")|(" + _hex_digits + r"""\.))"""
)
_hex_floating_constant = (
    "("
    + _hex_prefix
    + "("
    + _hex_digits
    + "|"
    + _hex_fractional_constant
    + ")"
    + _binary_exponent_part
    + "[FfLl]?)"
)


class _RegexAction(Enum):
    TOKEN = 0
    ID = 1
    ERROR = 2


@dataclass(frozen=True)
class _RegexRule:
    # tok_type: name of the token emitted for a match
    # regex_pattern: the raw regex (no anchors) to match at the current position
    # action: TOKEN for normal tokens, ID for identifiers, ERROR to report an error
    # error_message: message used for ERROR entries
    tok_type: str
    regex_pattern: str
    action: _RegexAction
    error_message: Optional[str]


_regex_rules: List[_RegexRule] = [
    _RegexRule(
        "UNSUPPORTED_C_STYLE_COMMENT",
        _unsupported_c_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "UNSUPPORTED_CXX_STYLE_COMMENT",
        _unsupported_cxx_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "BAD_STRING_LITERAL",
        _bad_string_literal,
        _RegexAction.ERROR,
        "String contains invalid escape code",
    ),
    _RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None),
    _RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None),
    _RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None),
    _RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None),
    _RegexRule(
        "BAD_CONST_OCT",
        _bad_octal_constant,
        _RegexAction.ERROR,
        "Invalid octal constant",
    ),
    _RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None),
    _RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None),
    _RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None),
    _RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None),
    _RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None),
    _RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None),
    _RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"),
    _RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None),
    _RegexRule("ID", _identifier, _RegexAction.ID, None),
]
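# Note (illustrative): because Python's regex alternation takes the first
# alternative that matches, the order of rules above matters. For example,
# HEX_FLOAT_CONST precedes INT_CONST_HEX so that "0x1.8p3" is lexed as one
# hex float rather than stopping after "0x1".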

_regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {}
_regex_pattern_parts: List[str] = []
for _rule in _regex_rules:
    _regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message)
    _regex_pattern_parts.append(f"(?P<{_rule.tok_type}>{_rule.regex_pattern})")
# The master regex is a single alternation of all token patterns, each wrapped
# in a named group. We match once at the current position and then use
# `lastgroup` to recover which token kind fired; this avoids iterating over all
# regexes on every character while keeping the same token-level semantics.
_regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))
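# Illustrative check (an assumption, not executed at import time):
#
#     m = _regex_master.match("0x1Fu + y")
#     # m.lastgroup == "INT_CONST_HEX" and m.group(m.lastgroup) == "0x1Fu"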


@dataclass(frozen=True)
class _FixedToken:
    tok_type: str
    literal: str


_fixed_tokens: List[_FixedToken] = [
    _FixedToken("ELLIPSIS", "..."),
    _FixedToken("LSHIFTEQUAL", "<<="),
    _FixedToken("RSHIFTEQUAL", ">>="),
    _FixedToken("PLUSPLUS", "++"),
    _FixedToken("MINUSMINUS", "--"),
    _FixedToken("ARROW", "->"),
    _FixedToken("LAND", "&&"),
    _FixedToken("LOR", "||"),
    _FixedToken("LSHIFT", "<<"),
    _FixedToken("RSHIFT", ">>"),
    _FixedToken("LE", "<="),
    _FixedToken("GE", ">="),
    _FixedToken("EQ", "=="),
    _FixedToken("NE", "!="),
    _FixedToken("TIMESEQUAL", "*="),
    _FixedToken("DIVEQUAL", "/="),
    _FixedToken("MODEQUAL", "%="),
    _FixedToken("PLUSEQUAL", "+="),
    _FixedToken("MINUSEQUAL", "-="),
    _FixedToken("ANDEQUAL", "&="),
    _FixedToken("OREQUAL", "|="),
    _FixedToken("XOREQUAL", "^="),
    _FixedToken("EQUALS", "="),
    _FixedToken("PLUS", "+"),
    _FixedToken("MINUS", "-"),
    _FixedToken("TIMES", "*"),
    _FixedToken("DIVIDE", "/"),
    _FixedToken("MOD", "%"),
    _FixedToken("OR", "|"),
    _FixedToken("AND", "&"),
    _FixedToken("NOT", "~"),
    _FixedToken("XOR", "^"),
    _FixedToken("LNOT", "!"),
    _FixedToken("LT", "<"),
    _FixedToken("GT", ">"),
    _FixedToken("CONDOP", "?"),
    _FixedToken("LPAREN", "("),
    _FixedToken("RPAREN", ")"),
    _FixedToken("LBRACKET", "["),
    _FixedToken("RBRACKET", "]"),
    _FixedToken("LBRACE", "{"),
    _FixedToken("RBRACE", "}"),
    _FixedToken("COMMA", ","),
    _FixedToken("PERIOD", "."),
    _FixedToken("SEMI", ";"),
    _FixedToken("COLON", ":"),
]

# To avoid scanning all fixed tokens on every character, we bucket them by the
# first character. When matching at position i, we only look at the bucket for
# text[i], and we pre-sort that bucket by token length so the first match is
# also the longest. This preserves longest-match semantics (e.g. '>>=' before
# '>>' before '>') while reducing the number of comparisons.
_fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}
for _entry in _fixed_tokens:
    _fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)
for _bucket in _fixed_tokens_by_first.values():
    _bucket.sort(key=lambda item: len(item.literal), reverse=True)
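# For example (illustrative), the bucket for '>' ends up ordered
# ['>>=', '>>', '>=', '>'], so the longest literal is always tried first.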

_line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line\W)|([ \t]*\d+)")
_pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma\W")