Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/blib2to3/pgen2/parse.py: 95%


# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""

from collections.abc import Callable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Optional, Union, cast

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import NL, Context, Leaf, Node, RawNode, convert

# Local imports
from . import grammar, token, tokenize

if TYPE_CHECKING:
    from blib2to3.pgen2.driver import TokenProxy


Results = dict[str, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = list[list[tuple[int, int]]]
DFAS = tuple[DFA, dict[int, int]]
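

# Illustrative sketch (an assumption added for this write-up, not in the
# original source): the shape of the tables these aliases describe. A DFA is
# a list of states; each state is a list of (label, next_state) arcs; a DFAS
# pairs the state list with the symbol's first set (mapping ilabel -> 1). An
# arc labeled 0 back to the same state marks it as accepting (see the
# accept-only check in _addtoken below).
_example_dfa: DFA = [
    [(1, 1)],  # state 0: on label 1, advance to state 1
    [(0, 1)],  # state 1: only the (0, 1) self-arc, i.e. accepting
]
_example_dfas: DFAS = (_example_dfa, {1: 1})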


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)


def stack_copy(
    stack: list[tuple[DFAS, int, RawNode]],
) -> list[tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]
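

def _example_stack_copy() -> None:
    # Hedged sketch (a hypothetical helper, not in the original source):
    # stack_copy discards node contents; only the DFA and state survive,
    # which is all that backtracking needs to replay transitions. Reuses the
    # assumed _example_dfas table defined above.
    stack = [(_example_dfas, 0, (256, None, None, []))]
    assert stack_copy(stack) == [(_example_dfas, 0, DUMMY_NODE)]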


class Recorder:
    def __init__(self, parser: "Parser", ilabels: list[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # does not really matter

        self._dead_ilabels: set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> set[int]:
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Use the node-level invariant operations for basic parsing
        (push/pop/shift). They still operate on the stack, but they create
        no new nodes and do not modify the contents of any existing nodes.

        This saves us a ton of time when we are backtracking, since we
        want to restore the initial state as quickly as possible, which
        can only be done with as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking

    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                if raw:
                    self.parser._addtoken(ilabel, tok_type, tok_val, self.context)
                else:
                    self.parser.addtoken(tok_type, tok_val, self.context)

    def determine_route(
        self, value: Optional[str] = None, force: bool = False
    ) -> Optional[int]:
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None
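

# Hedged sketch (a hypothetical helper, not in the original source) of the
# Recorder protocol that Parser.addtoken drives below for soft-keyword
# ambiguity: feed the ambiguous token to every candidate ilabel in parallel,
# then commit to one route. The real loop also eats lookahead tokens via the
# TokenProxy before forcing a decision; this sketch skips straight to the
# forced commit.
def _example_recorder_flow(
    parser: "Parser",
    ilabels: list[int],
    tok_type: int,
    tok_val: str,
    context: Context,
) -> Optional[int]:
    recorder = Recorder(parser, ilabels, context)
    # Try the ambiguous token against every candidate ilabel; routes that
    # raise ParseError inside switch_to() are recorded as dead.
    recorder.add_token(tok_type, tok_val, raw=True)
    # force=True commits to the first surviving ilabel without further
    # lookahead; raises ParseError if every route died.
    return recorder.determine_route(tok_val, force=True)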


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: str, type: Optional[int], value: Optional[str], context: Context
    ) -> None:
        Exception.__init__(
            self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser:
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, addtoken() raises
    the ParseError exception.  There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert. Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        # See note in docstring above. TL;DR this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False
        self.last_token: Optional[int] = None

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: list[tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: set[str] = set()
        self.proxy = proxy
        self.last_token = None

    def addtoken(self, type: int, value: str, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If there is only one state to advance to, take it directly.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states we can advance to (this can only
        # happen with soft keywords), then we try all of them in parallel
        # and, as soon as one state can reach further than the rest, we
        # choose that one. This is a pretty hacky and hopefully temporary
        # algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break

                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop

                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    self.last_token = type
                    return False

            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: str, context: Context) -> list[int]:
        """Turn a token into a label. (Internal)

        Depending on whether the value is a soft-keyword or not,
        this function may return multiple labels to choose from."""
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                # Current soft keywords (match, case, type) can only appear at the
                # beginning of a statement. So as a shortcut, don't try to treat them
                # like keywords in any other context.
                # ('_' is also a soft keyword in the real grammar, but for our grammar
                # it's just an expression, so we don't need to treat it specially.)
                if self.last_token not in (
                    None,
                    token.INDENT,
                    token.DEDENT,
                    token.NEWLINE,
                    token.SEMI,
                    token.COLON,
                ):
                    return [self.grammar.tokens[type]]
                return [
                    self.grammar.tokens[type],
                    self.grammar.soft_keywords[value],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
        """Shift a token. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal. (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names
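

# Hedged end-to-end sketch (a hypothetical helper, not in the original
# source): in practice this engine is driven by blib2to3.pgen2.driver.Driver,
# which tokenizes the input, wires up the TokenProxy, and feeds addtoken() as
# described in the Parser docstring above. The grammar object name below is an
# assumption based on blib2to3's pygram module.
def _example_parse_string(source: str) -> NL:
    # Local imports: driver imports this module, so importing it at module
    # load time here would be circular.
    from blib2to3 import pygram
    from blib2to3.pgen2 import driver

    d = driver.Driver(pygram.python_grammar_soft_keywords)
    # parse_string raises ParseError on bad input; the return value is the
    # root of the concrete syntax tree.
    return d.parse_string(source)


# Example use of the sketch above: _example_parse_string("x = 1\n")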