# ------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
# ------------------------------------------------------------------------------
import re
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Dict, List, Optional, Tuple


@dataclass(slots=True)
class Token:
    type: str
    value: str
    lineno: int
    column: int
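

# For example (a sketch of typical output, assuming type_lookup_func returns
# False for "x"), lexing the input "int x;" would yield
# Token("INT", "int", 1, 1), Token("ID", "x", 1, 5) and Token("SEMI", ";", 1, 6).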


class CLexer:
    """A standalone lexer for C.

    The lexer takes the following callback functions as parameters during
    construction:

    error_func:
        Called with (msg, line, column) on lexing errors.
    on_lbrace_func:
        Called when an LBRACE token is produced (used for scope tracking).
    on_rbrace_func:
        Called when an RBRACE token is produced (used for scope tracking).
    type_lookup_func:
        Called with an identifier name; expected to return True if it is
        a typedef name and should be tokenized as TYPEID.

    Call input(text) to initialize lexing, and then keep calling token() to
    get the next token, until it returns None (at end of input).
    """
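
    # A minimal usage sketch (illustrative only; the callbacks shown here are
    # placeholders, while real ones normally come from the parser driving this
    # lexer):
    #
    #   lexer = CLexer(
    #       error_func=lambda msg, line, col: print(f"{line}:{col}: {msg}"),
    #       on_lbrace_func=lambda: None,
    #       on_rbrace_func=lambda: None,
    #       type_lookup_func=lambda name: False,
    #   )
    #   lexer.input("int x = 1;")
    #   while (tok := lexer.token()) is not None:
    #       print(tok.type, tok.value)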

    def __init__(
        self,
        error_func: Callable[[str, int, int], None],
        on_lbrace_func: Callable[[], None],
        on_rbrace_func: Callable[[], None],
        type_lookup_func: Callable[[str], bool],
    ) -> None:
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self._init_state()

    def input(self, text: str, filename: str = "") -> None:
        """Initialize the lexer to the given input text.

        filename is an optional name identifying the initial file from which
        the input comes. The lexer may modify it if #line directives are
        encountered.
        """
        self._init_state()
        self._lexdata = text
        self._filename = filename

    def _init_state(self) -> None:
        self._lexdata = ""
        self._filename = ""
        self._pos = 0
        self._line_start = 0
        self._pending_tok: Optional[Token] = None
        self._lineno = 1

    @property
    def filename(self) -> str:
        return self._filename

    def token(self) -> Optional[Token]:
        # Lexing strategy overview:
        #
        # - We maintain a current position (self._pos), line number, and the
        #   byte offset of the current line start. The lexer is a simple loop
        #   that skips whitespace/newlines and emits one token per call.
        #
        # - A small amount of logic is handled manually before regex matching:
        #
        #   * Preprocessor-style directives: if we see '#', we check whether
        #     it's a #line or #pragma directive and consume it inline. #line
        #     updates lineno/filename and produces no tokens. #pragma can yield
        #     both PPPRAGMA and PPPRAGMASTR, but token() returns a single token,
        #     so we stash the PPPRAGMASTR as _pending_tok to return on the next
        #     token() call. Otherwise we return PPHASH.
        #   * Newlines update lineno/line-start tracking so tokens can record
        #     accurate columns.
        #
        # - The bulk of tokens are recognized in _match_token:
        #
        #   * _regex_rules: regex patterns for identifiers, literals, and other
        #     complex tokens (including error-producing patterns). The lexer
        #     tries all of them at once through the combined _regex_master
        #     pattern.
        #   * _fixed_tokens: exact string matches for operators and punctuation,
        #     resolved by longest match.
        #
        # - Error patterns call the error callback and advance minimally, which
        #   keeps lexing resilient while reporting useful diagnostics.
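        #
        # Worked example (assuming the rules defined below): for the input
        # "#pragma once\n", the first token() call returns PPPRAGMA and stashes
        # PPPRAGMASTR("once") in _pending_tok; the next call returns that
        # pending token before scanning any further input.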
        text = self._lexdata
        n = len(text)

        if self._pending_tok is not None:
            tok = self._pending_tok
            self._pending_tok = None
            return tok

        while self._pos < n:
            match text[self._pos]:
                case " " | "\t":
                    self._pos += 1
                case "\n":
                    self._lineno += 1
                    self._pos += 1
                    self._line_start = self._pos
                case "#":
                    if _line_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        self._handle_ppline()
                        continue
                    if _pragma_pattern.match(text, self._pos + 1):
                        self._pos += 1
                        toks = self._handle_pppragma()
                        if len(toks) > 1:
                            self._pending_tok = toks[1]
                        if len(toks) > 0:
                            return toks[0]
                        continue
                    tok = self._make_token("PPHASH", "#", self._pos)
                    self._pos += 1
                    return tok
                case _:
                    if tok := self._match_token():
                        return tok
                    else:
                        continue

    def _match_token(self) -> Optional[Token]:
        """Match one token at the current position.

        Returns a Token on success, or None if no token could be matched and
        an error was reported. This method always advances _pos by the matched
        length, or by 1 on error/no-match.
        """
        text = self._lexdata
        pos = self._pos
        # We pick the longest match between:
        # - the master regex (identifiers, literals, error patterns, etc.)
        # - fixed operator/punctuator literals from the bucket for text[pos]
        #
        # The longest match is required to ensure we properly lex something
        # like ".123" (a floating-point constant) as a single entity (with
        # FLOAT_CONST), rather than a PERIOD followed by a number.
        #
        # The fixed-literal buckets are already length-sorted, so within that
        # bucket we can take the first match. However, we still compare its
        # length to the regex match because the regex may have matched a longer
        # token that should take precedence.
        best = None

        if m := _regex_master.match(text, pos):
            tok_type = m.lastgroup
            # All master-regex alternatives are named; lastgroup shouldn't be None.
            assert tok_type is not None
            value = m.group(tok_type)
            length = len(value)
            action, msg = _regex_actions[tok_type]
            best = (length, tok_type, value, action, msg)

        if bucket := _fixed_tokens_by_first.get(text[pos]):
            for entry in bucket:
                if text.startswith(entry.literal, pos):
                    length = len(entry.literal)
                    if best is None or length > best[0]:
                        best = (
                            length,
                            entry.tok_type,
                            entry.literal,
                            _RegexAction.TOKEN,
                            None,
                        )
                    break

        if best is None:
            self._error(f"Illegal character {repr(text[pos])}", pos)
            self._pos += 1
            return None

        length, tok_type, value, action, msg = best
        match action:
            case _RegexAction.TOKEN:
                pass
            case _RegexAction.ERROR:
                if tok_type == "BAD_CHAR_CONST":
                    msg = f"Invalid char constant {value}"
                # All other ERROR rules provide a message.
                assert msg is not None
                self._error(msg, pos)
                self._pos += max(1, length)
                return None
            case _RegexAction.ID:
                tok_type = _keyword_map.get(value, "ID")
                if tok_type == "ID" and self.type_lookup_func(value):
                    tok_type = "TYPEID"
            case _:
                raise RuntimeError("unreachable")

        tok = self._make_token(tok_type, value, pos)
        self._pos += length

        if tok.type == "LBRACE":
            self.on_lbrace_func()
        elif tok.type == "RBRACE":
            self.on_rbrace_func()

        return tok

    def _make_token(self, tok_type: str, value: str, pos: int) -> Token:
        """Create a Token at an absolute input position.

        Expects tok_type/value and the absolute byte offset pos in the current
        input. Does not advance lexer state; callers manage _pos themselves.
        Returns a Token with lineno/column computed from current line tracking.
        """
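        # For example, if _line_start is 40 and a token begins at absolute
        # offset 46, its 1-based column is 46 - 40 + 1 == 7.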
        column = pos - self._line_start + 1
        tok = Token(tok_type, value, self._lineno, column)
        return tok

    def _error(self, msg: str, pos: int) -> None:
        column = pos - self._line_start + 1
        self.error_func(msg, self._lineno, column)

    def _handle_ppline(self) -> None:
        # Since #line directives aren't supposed to return tokens but should
        # only affect the lexer's state (update line/filename for coords), this
        # method does a bit of parsing on its own. It doesn't return anything,
        # but its side effect is to update self._pos past the directive, and
        # potentially update self._lineno and self._filename, based on the
        # directive's contents.
        #
        # Accepted #line forms from preprocessors:
        #   - "#line 66 \"kwas\\df.h\""
        #   - "# 9"
        #   - "#line 10 \"include/me.h\" 1 2 3" (extra numeric flags)
        #   - "# 1 \"file.h\" 3"
        # Errors we must report:
        #   - "#line \"file.h\"" (filename before line number)
        #   - "#line df" (garbage instead of number/string)
        #
        # We scan the directive line once (after an optional 'line' keyword),
        # validating the order: NUMBER, optional STRING, then any NUMBERs.
        # The NUMBERs tail is only accepted if a filename STRING was present.
        text = self._lexdata
        n = len(text)
        line_end = text.find("\n", self._pos)
        if line_end == -1:
            line_end = n
        line = text[self._pos : line_end]
        pos = 0
        line_len = len(line)

        def skip_ws() -> None:
            nonlocal pos
            while pos < line_len and line[pos] in " \t":
                pos += 1

        skip_ws()
        if line.startswith("line", pos):
            pos += 4

        def success(pp_line: Optional[str], pp_filename: Optional[str]) -> None:
            if pp_line is None:
                self._error("line number missing in #line", self._pos + line_len)
            else:
                self._lineno = int(pp_line)
                if pp_filename is not None:
                    self._filename = pp_filename
            self._pos = line_end + 1
            self._line_start = self._pos

        def fail(msg: str, offset: int) -> None:
            self._error(msg, self._pos + offset)
            self._pos = line_end + 1
            self._line_start = self._pos

        skip_ws()
        if pos >= line_len:
            success(None, None)
            return
        if line[pos] == '"':
            fail("filename before line number in #line", pos)
            return

        m = re.match(_decimal_constant, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_line = m.group(0)
        pos += len(pp_line)
        skip_ws()
        if pos >= line_len:
            success(pp_line, None)
            return

        if line[pos] != '"':
            fail("invalid #line directive", pos)
            return

        m = re.match(_string_literal, line[pos:])
        if not m:
            fail("invalid #line directive", pos)
            return

        pp_filename = m.group(0).lstrip('"').rstrip('"')
        pos += len(m.group(0))

        # Consume arbitrary sequence of numeric flags after the directive
        while True:
            skip_ws()
            if pos >= line_len:
                break
            m = re.match(_decimal_constant, line[pos:])
            if not m:
                fail("invalid #line directive", pos)
                return
            pos += len(m.group(0))

        success(pp_line, pp_filename)

    def _handle_pppragma(self) -> List[Token]:
        # Parse a full #pragma line; returns a list of 1 or 2 tokens - PPPRAGMA
        # and an optional PPPRAGMASTR. An empty list means an error occurred,
        # or we're at the end of input.
        #
        # Examples:
        #   - "#pragma" -> PPPRAGMA only
        #   - "#pragma once" -> PPPRAGMA, PPPRAGMASTR("once")
        #   - "# pragma omp parallel private(th_id)" -> PPPRAGMA,
        #     PPPRAGMASTR("omp parallel private(th_id)")
        #   - "#\tpragma {pack: 2, smack: 3}" -> PPPRAGMA,
        #     PPPRAGMASTR("{pack: 2, smack: 3}")
        text = self._lexdata
        n = len(text)
        pos = self._pos

        while pos < n and text[pos] in " \t":
            pos += 1
        if pos >= n:
            self._pos = pos
            return []

        if not text.startswith("pragma", pos):
            self._error("invalid #pragma directive", pos)
            self._pos = pos + 1
            return []

        pragma_pos = pos
        pos += len("pragma")
        toks = [self._make_token("PPPRAGMA", "pragma", pragma_pos)]

        while pos < n and text[pos] in " \t":
            pos += 1

        start = pos
        while pos < n and text[pos] != "\n":
            pos += 1
        if pos > start:
            toks.append(self._make_token("PPPRAGMASTR", text[start:pos], start))
        if pos < n and text[pos] == "\n":
            self._lineno += 1
            pos += 1
            self._line_start = pos
        self._pos = pos
        return toks


##
## Reserved keywords
##
_keywords: Tuple[str, ...] = (
    "AUTO",
    "BREAK",
    "CASE",
    "CHAR",
    "CONST",
    "CONTINUE",
    "DEFAULT",
    "DO",
    "DOUBLE",
    "ELSE",
    "ENUM",
    "EXTERN",
    "FLOAT",
    "FOR",
    "GOTO",
    "IF",
    "INLINE",
    "INT",
    "LONG",
    "REGISTER",
    "OFFSETOF",
    "RESTRICT",
    "RETURN",
    "SHORT",
    "SIGNED",
    "SIZEOF",
    "STATIC",
    "STRUCT",
    "SWITCH",
    "TYPEDEF",
    "UNION",
    "UNSIGNED",
    "VOID",
    "VOLATILE",
    "WHILE",
    "__INT128",
    "_BOOL",
    "_COMPLEX",
    "_NORETURN",
    "_THREAD_LOCAL",
    "_STATIC_ASSERT",
    "_ATOMIC",
    "_ALIGNOF",
    "_ALIGNAS",
    "_PRAGMA",
)

_keyword_map: Dict[str, str] = {}

for keyword in _keywords:
    # Keywords from the newer C standards are mixed-case, like _Bool, _Alignas, etc.
    if keyword.startswith("_") and len(keyword) > 1 and keyword[1].isalpha():
        _keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
    else:
        _keyword_map[keyword.lower()] = keyword
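
# For example, the loop above should produce mappings like
# _keyword_map["while"] == "WHILE", _keyword_map["_Bool"] == "_BOOL", and
# _keyword_map["__int128"] == "__INT128".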

##
## Regexes for use in tokens
##

# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
_identifier = r"[a-zA-Z_$][0-9a-zA-Z_$]*"

_hex_prefix = "0[xX]"
_hex_digits = "[0-9a-fA-F]+"
_bin_prefix = "0[bB]"
_bin_digits = "[01]+"

# integer constants (K&R2: A.2.5.1)
_integer_suffix_opt = (
    r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
)
_decimal_constant = (
    "(0" + _integer_suffix_opt + ")|([1-9][0-9]*" + _integer_suffix_opt + ")"
)
_octal_constant = "0[0-7]*" + _integer_suffix_opt
_hex_constant = _hex_prefix + _hex_digits + _integer_suffix_opt
_bin_constant = _bin_prefix + _bin_digits + _integer_suffix_opt
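
# For example, these patterns are meant to accept integer constants such as
# "0x1Fu", "0b1010UL", "0777ll" and "42u" (suffixes per _integer_suffix_opt).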

_bad_octal_constant = "0[0-7]*[89]"

# comments are not supported
_unsupported_c_style_comment = r"\/\*"
_unsupported_cxx_style_comment = r"\/\/"

# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file)
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
#   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
#   decimal_escape = r"""(\d+)"""
#   hex_escape = r"""(x[0-9a-fA-F]+)"""
#   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed
# backtracking (https://github.com/eliben/pycparser/issues/61):
#
# - simple_escape only matches \x when it is not followed by a hex digit, to
#   avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next
#   character (if any) is not hex
# - decimal_escape allows one or more decimal characters, but requires that the
#   next character (if any) is not a decimal
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with the
#   permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing each
# ambiguous escape sequence in multiple ways. e.g. `\123` could be parsed as
# `\1`+`23`, `\12`+`3`, and `\123`.

_simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
_decimal_escape = r"""(\d+)(?!\d)"""
_hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
_bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

_escape_sequence = (
    r"""(\\(""" + _simple_escape + "|" + _decimal_escape + "|" + _hex_escape + "))"
)

# The lookahead-heavy _escape_sequence above could be slow when applied to
# whole strings. Because every valid escape (including \x) allows zero or more
# non-escaped characters after its first character,
# simple_escape + decimal_escape + hex_escape simplify, for strings, to:
_escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

_cconst_char = r"""([^'\\\n]|""" + _escape_sequence + ")"
_char_const = "'" + _cconst_char + "'"
_wchar_const = "L" + _char_const
_u8char_const = "u8" + _char_const
_u16char_const = "u" + _char_const
_u32char_const = "U" + _char_const
_multicharacter_constant = "'" + _cconst_char + "{2,4}'"
_unmatched_quote = "('" + _cconst_char + "*\\n)|('" + _cconst_char + "*$)"
_bad_char_const = (
    r"""('""" + _cconst_char + """[^'\n]+')|('')|('""" + _bad_escape + r"""[^'\n]*')"""
)

# string literals (K&R2: A.2.6)
_string_char = r"""([^"\\\n]|""" + _escape_sequence_start_in_string + ")"
_string_literal = '"' + _string_char + '*"'
_wstring_literal = "L" + _string_literal
_u8string_literal = "u8" + _string_literal
_u16string_literal = "u" + _string_literal
_u32string_literal = "U" + _string_literal
_bad_string_literal = '"' + _string_char + "*" + _bad_escape + _string_char + '*"'

# floating constants (K&R2: A.2.5.3)
_exponent_part = r"""([eE][-+]?[0-9]+)"""
_fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
_floating_constant = (
    "(((("
    + _fractional_constant
    + ")"
    + _exponent_part
    + "?)|([0-9]+"
    + _exponent_part
    + "))[FfLl]?)"
)
_binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
_hex_fractional_constant = (
    "(((" + _hex_digits + r""")?\.""" + _hex_digits + ")|(" + _hex_digits + r"""\.))"""
)
_hex_floating_constant = (
    "("
    + _hex_prefix
    + "("
    + _hex_digits
    + "|"
    + _hex_fractional_constant
    + ")"
    + _binary_exponent_part
    + "[FfLl]?)"
)
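
# For example, these patterns are meant to accept floating constants such as
# ".123", "1e5f" and "3.14L", and hexadecimal floats such as "0x1.8p3" or
# "0xAp-2" (hex floats require a binary exponent).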


class _RegexAction(Enum):
    TOKEN = 0
    ID = 1
    ERROR = 2


@dataclass(frozen=True)
class _RegexRule:
    # tok_type: name of the token emitted for a match
    # regex_pattern: the raw regex (no anchors) to match at the current position
    # action: TOKEN for normal tokens, ID for identifiers, ERROR to report a lexing error
    # error_message: message used for ERROR entries
    tok_type: str
    regex_pattern: str
    action: _RegexAction
    error_message: Optional[str]


_regex_rules: List[_RegexRule] = [
    _RegexRule(
        "UNSUPPORTED_C_STYLE_COMMENT",
        _unsupported_c_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "UNSUPPORTED_CXX_STYLE_COMMENT",
        _unsupported_cxx_style_comment,
        _RegexAction.ERROR,
        "Comments are not supported, see https://github.com/eliben/pycparser#3using.",
    ),
    _RegexRule(
        "BAD_STRING_LITERAL",
        _bad_string_literal,
        _RegexAction.ERROR,
        "String contains invalid escape code",
    ),
    _RegexRule("WSTRING_LITERAL", _wstring_literal, _RegexAction.TOKEN, None),
    _RegexRule("U8STRING_LITERAL", _u8string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U16STRING_LITERAL", _u16string_literal, _RegexAction.TOKEN, None),
    _RegexRule("U32STRING_LITERAL", _u32string_literal, _RegexAction.TOKEN, None),
    _RegexRule("STRING_LITERAL", _string_literal, _RegexAction.TOKEN, None),
    _RegexRule("HEX_FLOAT_CONST", _hex_floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("FLOAT_CONST", _floating_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_HEX", _hex_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_BIN", _bin_constant, _RegexAction.TOKEN, None),
    _RegexRule(
        "BAD_CONST_OCT",
        _bad_octal_constant,
        _RegexAction.ERROR,
        "Invalid octal constant",
    ),
    _RegexRule("INT_CONST_OCT", _octal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_DEC", _decimal_constant, _RegexAction.TOKEN, None),
    _RegexRule("INT_CONST_CHAR", _multicharacter_constant, _RegexAction.TOKEN, None),
    _RegexRule("CHAR_CONST", _char_const, _RegexAction.TOKEN, None),
    _RegexRule("WCHAR_CONST", _wchar_const, _RegexAction.TOKEN, None),
    _RegexRule("U8CHAR_CONST", _u8char_const, _RegexAction.TOKEN, None),
    _RegexRule("U16CHAR_CONST", _u16char_const, _RegexAction.TOKEN, None),
    _RegexRule("U32CHAR_CONST", _u32char_const, _RegexAction.TOKEN, None),
    _RegexRule("UNMATCHED_QUOTE", _unmatched_quote, _RegexAction.ERROR, "Unmatched '"),
    _RegexRule("BAD_CHAR_CONST", _bad_char_const, _RegexAction.ERROR, None),
    _RegexRule("ID", _identifier, _RegexAction.ID, None),
]
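# Rule order matters because the master regex tries these alternatives left to
# right. For example, BAD_CONST_OCT is listed before INT_CONST_OCT so that
# "0778" is reported as an invalid octal constant instead of being lexed as
# "077" followed by a separate "8".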

_regex_actions: Dict[str, Tuple[_RegexAction, Optional[str]]] = {}
_regex_pattern_parts: List[str] = []
for _rule in _regex_rules:
    _regex_actions[_rule.tok_type] = (_rule.action, _rule.error_message)
    _regex_pattern_parts.append(f"(?P<{_rule.tok_type}>{_rule.regex_pattern})")
# The master regex is a single alternation of all token patterns, each wrapped
# in a named group. We match once at the current position and then use
# `lastgroup` to recover which token kind fired; this avoids iterating over all
# regexes on every character while keeping the same token-level semantics.
_regex_master: re.Pattern[str] = re.compile("|".join(_regex_pattern_parts))
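
# For example, _regex_master.match('"abc"').lastgroup should be
# "STRING_LITERAL", and _regex_master.match("0x1Fu").lastgroup should be
# "INT_CONST_HEX" (the float alternatives listed above require a '.' or an
# exponent, so they do not fire on a plain hex constant).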


@dataclass(frozen=True)
class _FixedToken:
    tok_type: str
    literal: str


_fixed_tokens: List[_FixedToken] = [
    _FixedToken("ELLIPSIS", "..."),
    _FixedToken("LSHIFTEQUAL", "<<="),
    _FixedToken("RSHIFTEQUAL", ">>="),
    _FixedToken("PLUSPLUS", "++"),
    _FixedToken("MINUSMINUS", "--"),
    _FixedToken("ARROW", "->"),
    _FixedToken("LAND", "&&"),
    _FixedToken("LOR", "||"),
    _FixedToken("LSHIFT", "<<"),
    _FixedToken("RSHIFT", ">>"),
    _FixedToken("LE", "<="),
    _FixedToken("GE", ">="),
    _FixedToken("EQ", "=="),
    _FixedToken("NE", "!="),
    _FixedToken("TIMESEQUAL", "*="),
    _FixedToken("DIVEQUAL", "/="),
    _FixedToken("MODEQUAL", "%="),
    _FixedToken("PLUSEQUAL", "+="),
    _FixedToken("MINUSEQUAL", "-="),
    _FixedToken("ANDEQUAL", "&="),
    _FixedToken("OREQUAL", "|="),
    _FixedToken("XOREQUAL", "^="),
    _FixedToken("EQUALS", "="),
    _FixedToken("PLUS", "+"),
    _FixedToken("MINUS", "-"),
    _FixedToken("TIMES", "*"),
    _FixedToken("DIVIDE", "/"),
    _FixedToken("MOD", "%"),
    _FixedToken("OR", "|"),
    _FixedToken("AND", "&"),
    _FixedToken("NOT", "~"),
    _FixedToken("XOR", "^"),
    _FixedToken("LNOT", "!"),
    _FixedToken("LT", "<"),
    _FixedToken("GT", ">"),
    _FixedToken("CONDOP", "?"),
    _FixedToken("LPAREN", "("),
    _FixedToken("RPAREN", ")"),
    _FixedToken("LBRACKET", "["),
    _FixedToken("RBRACKET", "]"),
    _FixedToken("LBRACE", "{"),
    _FixedToken("RBRACE", "}"),
    _FixedToken("COMMA", ","),
    _FixedToken("PERIOD", "."),
    _FixedToken("SEMI", ";"),
    _FixedToken("COLON", ":"),
]

# To avoid scanning all fixed tokens on every character, we bucket them by the
# first character. When matching at position i, we only look at the bucket for
# text[i], and we pre-sort that bucket by token length so the first match is
# also the longest. This preserves longest-match semantics (e.g. '>>=' before
# '>>' before '>') while reducing the number of comparisons.
_fixed_tokens_by_first: Dict[str, List[_FixedToken]] = {}
for _entry in _fixed_tokens:
    _fixed_tokens_by_first.setdefault(_entry.literal[0], []).append(_entry)
for _bucket in _fixed_tokens_by_first.values():
    _bucket.sort(key=lambda item: len(item.literal), reverse=True)
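
# For example, the '>' bucket should come out as ['>>=', '>>', '>=', '>'], so
# scanning ">>=" matches the three-character operator first and stops there.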

_line_pattern: re.Pattern[str] = re.compile(r"([ \t]*line\W)|([ \t]*\d+)")
_pragma_pattern: re.Pattern[str] = re.compile(r"[ \t]*pragma\W")