# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""

from collections.abc import Callable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Optional, Union, cast

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import NL, Context, Leaf, Node, RawNode, convert

# Local imports
from . import grammar, token, tokenize

if TYPE_CHECKING:
    from blib2to3.pgen2.driver import TokenProxy


Results = dict[str, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = list[list[tuple[int, int]]]
DFAS = tuple[DFA, dict[int, int]]
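
# Shapes of the table types above, as used by _addtoken() below:
# - a DFA is a list of states, where each state is a list of
#   (label, next_state) arcs;
# - a DFAS pairs a symbol's DFA with that symbol's "first" set, keyed
#   by the ilabels that can begin the symbol.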


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when the parser is backtracking.
DUMMY_NODE = (-1, None, None, None)


def stack_copy(
    stack: list[tuple[DFAS, int, RawNode]],
) -> list[tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]


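# Recorder implements the speculative parsing used for soft keywords:
# it keeps one nodeless copy of the parser stack per candidate ilabel,
# feeds upcoming tokens to every copy, and marks copies dead as they
# raise ParseError, until a single survivor can be committed to.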
class Recorder:
    def __init__(self, parser: "Parser", ilabels: list[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # the exact context does not matter here

        self._dead_ilabels: set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> set[int]:
        """The candidate ilabels that are still alive (not yet marked dead)."""
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        """Temporarily make the stack copy for `ilabel` the active parser
        stack, marking the ilabel as dead if parsing fails on it."""
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Switch the basic parsing operations (push/pop/shift) to their
        node-invariant variants. These still operate on the stack, but they
        neither create new nodes nor modify the contents of existing ones.

        This saves a great deal of time when we are backtracking, since we
        want to restore the initial state as quickly as possible, which can
        only be done by performing as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking

    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
        """Feed the token to every candidate stack that is still alive."""
        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                if raw:
                    self.parser._addtoken(ilabel, tok_type, tok_val, self.context)
                else:
                    self.parser.addtoken(tok_type, tok_val, self.context)

    def determine_route(
        self, value: Optional[str] = None, force: bool = False
    ) -> Optional[int]:
        """Pick the ilabel to commit to, if possible.

        Returns the sole surviving ilabel (or an arbitrary survivor when
        `force` is set); returns None while several candidates are still
        alive. Raises ParseError once every candidate has died.
        """
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: str, type: Optional[int], value: Optional[str], context: Context
    ) -> None:
        Exception.__init__(
            self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser:
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter]) # create instance
    p.setup([start]) # prepare for parsing
    <for each input token>:
        if p.addtoken(...): # parse a token; may raise ParseError
            break
    root = p.rootnode # root of the abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable. When a syntax error occurs, addtoken() raises
    the ParseError exception. There is no error recovery; the parser
    cannot be used after a syntax error has been reported (but it can
    be reinitialized by calling setup()).

    """

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes. If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree. If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted. The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert. Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        # See note in docstring above. TL;DR this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False
        self.last_token: Optional[int] = None

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: list[tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: set[str] = set()
        self.proxy = proxy
        self.last_token = None

    def addtoken(self, type: int, value: str, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If there is only one state to advance to, take it directly.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states we could advance to (this can only
        # happen with soft keywords), then we try all of them in parallel,
        # and as soon as one state can reach further than the rest, we
        # choose that one. This is a pretty hacky and hopefully temporary
        # algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break

                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

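    # _addtoken is the core pushdown-automaton step: it walks the arcs of
    # the DFA on top of the stack, pushing a symbol's DFA whenever the
    # token's ilabel is in that symbol's first set, shifting the token when
    # an arc matches it directly, and popping states that can only accept.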
    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop

                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    self.last_token = type
                    return False

            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: str, context: Context) -> list[int]:
        """Turn a token into a label. (Internal)

        Depending on whether the value is a soft keyword or not,
        this function may return multiple labels to choose from."""
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                # Current soft keywords (match, case, type) can only appear at the
                # beginning of a statement. So as a shortcut, don't try to treat them
                # like keywords in any other context.
                # ('_' is also a soft keyword in the real grammar, but for our grammar
                # it's just an expression, so we don't need to treat it specially.)
                if self.last_token not in (
                    None,
                    token.INDENT,
                    token.DEDENT,
                    token.NEWLINE,
                    token.SEMI,
                    token.COLON,
                ):
                    return [self.grammar.tokens[type]]
                return [
                    self.grammar.tokens[type],
                    self.grammar.soft_keywords[value],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
        """Shift a token. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal. (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names