# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""
from collections.abc import Callable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Optional, Union, cast

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import NL, Context, Leaf, Node, RawNode, convert

# Local imports
from . import grammar, token, tokenize

if TYPE_CHECKING:
    from blib2to3.pgen2.driver import TokenProxy


Results = dict[str, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = list[list[tuple[int, int]]]
DFAS = tuple[DFA, dict[int, int]]
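
# Illustrative only (not part of the parser): a DFA is a list of
# states, each state a list of (label, next_state) arcs, and a DFAS
# pairs a DFA with its first set. A toy instance might look like:
#
#   toy_dfa: DFA = [
#       [(1, 1)],  # state 0: on label 1, move to state 1
#       [(0, 1)],  # state 1: accepting (label-0 arc back to itself)
#   ]
#   toy_dfas: DFAS = (toy_dfa, {1: 1})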


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)


def stack_copy(
    stack: list[tuple[DFAS, int, RawNode]],
) -> list[tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]


class Recorder:
    def __init__(self, parser: "Parser", ilabels: list[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # only matters for error reporting

        self._dead_ilabels: set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> set[int]:
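        """The candidate ilabels that are still alive.

        ``_dead_ilabels`` is always a subset of ``_ilabels``, so their
        symmetric difference is exactly the not-yet-failed candidates.
        """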
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Use the node-level invariant ones for basic parsing operations
        (push/pop/shift). These will still operate on the stack, but they
        won't create any new nodes or modify the contents of any existing
        nodes.

        This saves us a ton of time when we are backtracking, since we
        want to restore the initial state as quickly as possible, which
        can only be done with as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking

    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                if raw:
                    self.parser._addtoken(ilabel, tok_type, tok_val, self.context)
                else:
                    self.parser.addtoken(tok_type, tok_val, self.context)

    def determine_route(
        self, value: Optional[str] = None, force: bool = False
    ) -> Optional[int]:
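        """Decide which ilabel, if any, has won the race.

        Returns the single surviving ilabel once every other candidate
        has died, or one surviving candidate when ``force`` is true (no
        further lookahead is possible); returns None while several
        candidates are still alive. Raises ParseError when all of them
        are dead.
        """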
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: str, type: Optional[int], value: Optional[str], context: Context
    ) -> None:
        Exception.__init__(
            self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser:
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable. When a syntax error occurs, addtoken() raises
    the ParseError exception. There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes. If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree. If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted. The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert. Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.
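
        For example (illustrative values only), the NAME token "foo" might
        arrive as (1, "foo", ctx, None) for some opaque context ctx, while
        a symbol node collecting two children would look like
        (257, None, None, [child1, child2]); symbol numbers start at 256.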

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        # See note in docstring above. TL;DR: this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False
        self.last_token: Optional[int] = None

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: list[tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: set[str] = set()
        self.proxy = proxy
        self.last_token = None

    def addtoken(self, type: int, value: str, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If we have only one state to advance, we'll directly
        # take it as is.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states we can advance to (this only
        # happens with soft keywords), then we will try all of them
        # in parallel and, as soon as one state can reach further than
        # the rest, we'll choose that one. This is a pretty hacky
        # and hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html
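        #
        # For example, on the NAME token "match" the parser cannot yet
        # tell whether it starts a match statement or is just an ordinary
        # identifier (as in "match = 1"), so both candidate ilabels are
        # advanced in lockstep over the upcoming tokens until only one
        # of them survives.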

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break

                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
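                    # (Symbol numbers start at 256; anything below 256
                    # is a terminal token type.)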
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop

                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    self.last_token = type
                    return False

            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but more input remains
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: str, context: Context) -> list[int]:
        """Turn a token into a label. (Internal)

        Depending on whether the value is a soft keyword or not,
        this function may return multiple labels to choose from."""
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                # Current soft keywords (match, case, type) can only appear at the
                # beginning of a statement. So as a shortcut, don't try to treat them
                # like keywords in any other context.
                # ('_' is also a soft keyword in the real grammar, but for our grammar
                # it's just an expression, so we don't need to treat it specially.)
                if self.last_token not in (
                    None,
                    token.INDENT,
                    token.DEDENT,
                    token.NEWLINE,
                    token.SEMI,
                    token.COLON,
                ):
                    return [self.grammar.tokens[type]]
                return [
                    self.grammar.tokens[type],
                    self.grammar.soft_keywords[value],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
        """Shift a token. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal. (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names