# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""

from collections.abc import Callable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Optional, Union, cast

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import NL, Context, Leaf, Node, RawNode, convert

# Local imports
from . import grammar, token, tokenize

if TYPE_CHECKING:
    from blib2to3.pgen2.driver import TokenProxy


Results = dict[str, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = list[list[tuple[int, int]]]
DFAS = tuple[DFA, dict[int, int]]
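# A DFA here is a list of states, and each state is a list of
# (label, newstate) arcs; a DFAS pairs a DFA with its first set,
# keyed by ilabel (see Parser._addtoken, which unpacks these as
# ``states, first = dfa``).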


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)


def stack_copy(
    stack: list[tuple[DFAS, int, RawNode]],
) -> list[tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]


class Recorder:
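    """Records the parser's state for several candidate token labels.

    One nodeless copy of the parser stack is kept per candidate ilabel;
    a ParseError while advancing a candidate marks that ilabel as dead.
    Used by Parser.addtoken() to resolve soft keywords via lookahead.
    """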
    def __init__(self, parser: "Parser", ilabels: list[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # the exact context does not really matter here

        self._dead_ilabels: set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> set[int]:
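        # The candidates still alive: all ilabels minus the dead ones.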
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Switch the basic parsing operations (push/pop/shift) to their
        node-invariant variants. These still operate on the stack, but they
        neither create new nodes nor modify the contents of existing ones.

        This saves us a ton of time when we are backtracking, since we
        want to restore the initial state as quickly as possible, which
        can only be done by making as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking

    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                if raw:
                    self.parser._addtoken(ilabel, tok_type, tok_val, self.context)
                else:
                    self.parser.addtoken(tok_type, tok_val, self.context)

    def determine_route(
        self, value: Optional[str] = None, force: bool = False
    ) -> Optional[int]:
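        # Decide where to route: if every candidate died, report a ParseError
        # against one of the dead ilabels; if exactly one candidate survives
        # (or ``force`` is set), commit to it; otherwise return None so the
        # caller keeps feeding lookahead tokens.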
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: str, type: Optional[int], value: Optional[str], context: Context
    ) -> None:
        Exception.__init__(
            self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser:
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable. When a syntax error occurs, addtoken() raises
    the ParseError exception. There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes. If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree. If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted. The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert. Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        # See note in docstring above. TL;DR this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False
        self.last_token: Optional[int] = None

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: list[tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: set[str] = set()
        self.proxy = proxy
        self.last_token = None

    def addtoken(self, type: int, value: str, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If only one state can advance, take it directly.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states we can advance to (this only
        # happens with soft keywords), then we try all of them in
        # parallel, and as soon as one state can reach further than
        # the rest, we choose that one. This is a pretty hacky and
        # hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html
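        #
        # Illustrative example: for the statement ``match x:``, classify()
        # returns both the plain NAME label and the soft-keyword label for
        # ``match``. The recorder below advances a copy of the stack for each
        # candidate over the lookahead tokens until only one interpretation
        # survives (or we run out of lookahead and are forced to pick one).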

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break

                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
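        #
        # This is essentially a table-driven walk: shift the token if the
        # current DFA state has a matching arc, push a nonterminal's DFA when
        # the token is in its first set, and pop finished nonterminals.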
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop

                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    self.last_token = type
                    return False

            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: str, context: Context) -> list[int]:
        """Turn a token into a label. (Internal)

        Depending on whether the value is a soft-keyword or not,
        this function may return multiple labels to choose from."""
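        # For example, the soft keyword ``match`` at the start of a statement
        # yields two candidate labels (plain NAME and soft keyword); the
        # caller disambiguates between them via lookahead in addtoken().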
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                # Current soft keywords (match, case, type) can only appear at the
                # beginning of a statement. So as a shortcut, don't try to treat them
                # like keywords in any other context.
                # ('_' is also a soft keyword in the real grammar, but for our grammar
                # it's just an expression, so we don't need to treat it specially.)
                if self.last_token not in (
                    None,
                    token.INDENT,
                    token.DEDENT,
                    token.NEWLINE,
                    token.SEMI,
                    token.COLON,
                ):
                    return [self.grammar.tokens[type]]
                return [
                    self.grammar.tokens[type],
                    self.grammar.soft_keywords[value],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
        """Shift a token. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal. (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names