# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""
from collections.abc import Callable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Optional, Union, cast

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import NL, Context, Leaf, Node, RawNode, convert

# Local imports
from . import grammar, token, tokenize

if TYPE_CHECKING:
    from blib2to3.pgen2.driver import TokenProxy


Results = dict[str, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = list[list[tuple[int, int]]]
DFAS = tuple[DFA, dict[int, int]]
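
# Illustrative sketch only (not real grammar data): a DFA is a list of
# states, each state a list of (label, next_state) arcs, and a DFAS pairs
# a DFA with the symbol's first set. For a toy rule `pass_stmt: 'pass'`,
# assuming the keyword 'pass' was assigned ilabel 5, the tables could be:
#
#     _toy_dfa: DFA = [
#         [(5, 1)],  # state 0: on 'pass', go to state 1
#         [(0, 1)],  # state 1: accept-only (single EMPTY arc to itself)
#     ]
#     _toy_dfas: DFAS = (_toy_dfa, {5: 1})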


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)
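# DUMMY_NODE has the same shape as a RawNode -- (type, value, context,
# children) -- but uses an impossible type, so it can never be mistaken
# for a real node.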


def stack_copy(
    stack: list[tuple[DFAS, int, RawNode]],
) -> list[tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]


class Recorder:
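    """Replays candidate parses in parallel during soft-keyword backtracking.

    For each candidate ilabel a nodeless copy of the parser stack is kept;
    add_token() advances every copy that is still alive, and
    determine_route() reports the winner once the other candidates have
    died off.
    """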

    def __init__(self, parser: "Parser", ilabels: list[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # the exact value doesn't really matter

        self._dead_ilabels: set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> set[int]:
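        """The candidate ilabels that are still alive.

        _dead_ilabels is always a subset of _ilabels, so the symmetric
        difference is exactly the set of candidates not yet ruled out.
        """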

        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Switch the basic parsing operations (push/pop/shift) to their
        node-invariant variants. These still operate on the stack, but they
        create no new nodes and never modify the contents of existing nodes.

        This saves a lot of time when we are backtracking, since we want to
        restore the initial state as quickly as possible, which can only be
        done with as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking

    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                if raw:
                    self.parser._addtoken(ilabel, tok_type, tok_val, self.context)
                else:
                    self.parser.addtoken(tok_type, tok_val, self.context)

    def determine_route(
        self, value: Optional[str] = None, force: bool = False
    ) -> Optional[int]:
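        """Pick the winning ilabel, if there is one yet.

        Returns the sole surviving ilabel once all others have died, or an
        arbitrary surviving one when force is true; returns None while more
        than one candidate is still alive; raises ParseError if none is.
        """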

        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: str, type: Optional[int], value: Optional[str], context: Context
    ) -> None:
        Exception.__init__(
            self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser:
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup(proxy, [start])           # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable. When a syntax error occurs, addtoken() raises
    the ParseError exception. There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes. If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree. If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted. The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert. Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        # See note in docstring above. TL;DR: this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False
        self.last_token: Optional[int] = None

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: list[tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: set[str] = set()
        self.proxy = proxy
        self.last_token = None

    def addtoken(self, type: int, value: str, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If there is only one ilabel to advance on, take it directly.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple ilabels we could advance on (which can only
        # happen with soft keywords), try all of them in parallel and, as
        # soon as one candidate reaches further than the rest, choose it.
        # This is a pretty hacky and hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html
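        #
        # Example: after the soft keyword `match`, the input `match (x)`
        # could still be either a match-statement header (with subject
        # `(x)`) or a plain call expression `match(x)`; both candidate
        # labels are fed the upcoming tokens until only one can advance.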

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break

                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
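                # labels[i][0] is a token number (< 256) for terminals,
                # or a symbol number (>= 256) for nonterminals.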

                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop

                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
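                    # (An accept-only state is encoded as exactly one arc:
                    # the EMPTY label 0, looping back to the same state.)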

                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    self.last_token = type
                    return False

            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token was received
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: str, context: Context) -> list[int]:
        """Turn a token into a label. (Internal)

        Depending on whether the value is a soft keyword or not,
        this function may return multiple labels to choose from."""
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                # Current soft keywords (match, case, type) can only appear at the
                # beginning of a statement. So as a shortcut, don't try to treat them
                # like keywords in any other context.
                # ('_' is also a soft keyword in the real grammar, but for our grammar
                # it's just an expression, so we don't need to treat it specially.)
                if self.last_token not in (
                    None,
                    token.INDENT,
                    token.DEDENT,
                    token.NEWLINE,
                    token.SEMI,
                    token.COLON,
                ):
                    return [self.grammar.tokens[type]]
                return [
                    self.grammar.tokens[type],
                    self.grammar.soft_keywords[value],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
        """Shift a token. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal. (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names
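

# A minimal usage sketch, for orientation only -- blib2to3.pgen2.driver is
# the real entry point and wires up the tokenizer and TokenProxy; the names
# below are hypothetical:
#
#     parser = Parser(grammar)
#     parser.setup(token_proxy)
#     for tok_type, tok_value, context in token_stream:
#         if parser.addtoken(tok_type, tok_value, context):
#             break  # the start symbol was fully parsed
#     tree = parser.rootnode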