# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""Parser engine for the grammar tables generated by pgen.

The grammar table must be loaded first.

See Parser/parser.c in the Python distribution for additional info on
how this parsing engine works.

"""

from collections.abc import Callable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Optional, Union, cast

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pytree import NL, Context, Leaf, Node, RawNode, convert

# Local imports
from . import grammar, token, tokenize

if TYPE_CHECKING:
    from blib2to3.pgen2.driver import TokenProxy


Results = dict[str, NL]
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]
DFA = list[list[tuple[int, int]]]
DFAS = tuple[DFA, dict[int, int]]
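# A DFA here is a list of states, and each state is a list of
# (label, newstate) arcs; a DFAS pairs a DFA with its first set,
# keyed by ilabel (see Parser._addtoken, which unpacks these as
# ``states, first = dfa``).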


def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    assert node[3] is not None
    return Node(type=node[0], children=node[3], context=node[2])


# A placeholder node, used when parser is backtracking.
DUMMY_NODE = (-1, None, None, None)


def stack_copy(
    stack: list[tuple[DFAS, int, RawNode]],
) -> list[tuple[DFAS, int, RawNode]]:
    """Nodeless stack copy."""
    return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack]


class Recorder:
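    """Records the parser's state for several candidate token labels.

    One nodeless copy of the parser stack is kept per candidate ilabel;
    a ParseError while advancing a candidate marks that ilabel as dead.
    Used by Parser.addtoken() to resolve soft keywords via lookahead.
    """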
    def __init__(self, parser: "Parser", ilabels: list[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        self.context = context  # the exact context does not really matter here

        self._dead_ilabels: set[int] = set()
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> set[int]:
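        # The candidates still alive: all ilabels minus the dead ones.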
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Switch the basic parsing operations (push/pop/shift) to their
        node-invariant variants. These still operate on the stack, but they
        neither create new nodes nor modify the contents of existing ones.

        This saves us a ton of time when we are backtracking, since we
        want to restore the initial state as quickly as possible, which
        can only be done by making as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            self.parser.is_backtracking = is_backtracking

    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                if raw:
                    self.parser._addtoken(ilabel, tok_type, tok_val, self.context)
                else:
                    self.parser.addtoken(tok_type, tok_val, self.context)

    def determine_route(
        self, value: Optional[str] = None, force: bool = False
    ) -> Optional[int]:
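        # Decide where to route: if every candidate died, report a ParseError
        # against one of the dead ilabels; if exactly one candidate survives
        # (or ``force`` is set), commit to it; otherwise return None so the
        # caller keeps feeding lookahead tokens.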
        alive_ilabels = self.ilabels
        if len(alive_ilabels) == 0:
            *_, most_successful_ilabel = self._dead_ilabels
            raise ParseError("bad input", most_successful_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        else:
            return None


class ParseError(Exception):
    """Exception to signal the parser is stuck."""

    def __init__(
        self, msg: str, type: Optional[int], value: Optional[str], context: Context
    ) -> None:
        Exception.__init__(
            self, f"{msg}: type={type!r}, value={value!r}, context={context!r}"
        )
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context


class Parser:
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup([start])                  # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable. When a syntax error occurs, addtoken() raises
    the ParseError exception. There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes. If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree. If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted. The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert. Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        # See note in docstring above. TL;DR this is ignored.
        self.convert = convert or lam_sub
        self.is_backtracking = False
        self.last_token: Optional[int] = None

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse.

        The optional argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: list[tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: set[str] = set()
        self.proxy = proxy
        self.last_token = None

    def addtoken(self, type: int, value: str, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program."""
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If only one state can advance, take it directly.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states we can advance to (this only
        # happens with soft keywords), then we try all of them in
        # parallel, and as soon as one state can reach further than
        # the rest, we choose that one. This is a pretty hacky and
        # hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html
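        #
        # Illustrative example: for the statement ``match x:``, classify()
        # returns both the plain NAME label and the soft-keyword label for
        # ``match``. The recorder below advances a copy of the stack for each
        # candidate over the lookahead tokens until only one interpretation
        # survives (or we run out of lookahead and are forced to pick one).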

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    force = True
                    break

                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
        # Loop until the token is shifted; may raise exceptions
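        #
        # This is essentially a table-driven walk: shift the token if the
        # current DFA state has a matching arc, push a nonterminal's DFA when
        # the token is in its first set, and pop finished nonterminals.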
        while True:
            dfa, state, node = self.stack[-1]
            states, first = dfa
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    itsstates, itsfirst = itsdfa
                    if ilabel in itsfirst:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop

                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, node = self.stack[-1]
                        states, first = dfa
                    # Done with this token
                    self.last_token = type
                    return False

            else:
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: str, context: Context) -> list[int]:
        """Turn a token into a label. (Internal)

        Depending on whether the value is a soft-keyword or not,
        this function may return multiple labels to choose from."""
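        # For example, the soft keyword ``match`` at the start of a statement
        # yields two candidate labels (plain NAME and soft keyword); the
        # caller disambiguates between them via lookahead in addtoken().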
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                # Current soft keywords (match, case, type) can only appear at the
                # beginning of a statement. So as a shortcut, don't try to treat them
                # like keywords in any other context.
                # ('_' is also a soft keyword in the real grammar, but for our grammar
                # it's just an expression, so we don't need to treat it specially.)
                if self.last_token not in (
                    None,
                    token.INDENT,
                    token.DEDENT,
                    token.NEWLINE,
                    token.SEMI,
                    token.COLON,
                ):
                    return [self.grammar.tokens[type]]
                return [
                    self.grammar.tokens[type],
                    self.grammar.soft_keywords[value],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
        """Shift a token. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, state, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal. (Internal)"""
        if self.is_backtracking:
            dfa, state, _ = self.stack[-1]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, state, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal. (Internal)"""
        if self.is_backtracking:
            self.stack.pop()
        else:
            popdfa, popstate, popnode = self.stack.pop()
            newnode = convert(self.grammar, popnode)
            if self.stack:
                dfa, state, node = self.stack[-1]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names