Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/parse.py: 95%

2# Licensed to PSF under a Contributor Agreement.

4"""Parser engine for the grammar tables generated by pgen.

6The grammar table must be loaded first.

8See Parser/parser.c in the Python distribution for additional info on

9how this parsing engine works.

11"""

12from collections.abc import Callable, Iterator

13from contextlib import contextmanager

14from typing import TYPE_CHECKING, Any, Optional, Union, cast

16from blib2to3.pgen2.grammar import Grammar

17from blib2to3.pytree import NL, Context, Leaf, Node, RawNode, convert

19# Local imports

20from . import grammar, token, tokenize

22if TYPE_CHECKING:

23 from blib2to3.pgen2.driver import TokenProxy

# Mapping from a name to a tree node.  Not referenced elsewhere in this
# module -- presumably kept for importers; verify before removing.
Results = dict[str, NL]

# Signature of a converter: takes the grammar and a raw
# (type, value, context, children) node and returns a tree node.
Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]]

# A DFA is a list of states; each state is a list of (label, next state) arcs.
DFA = list[list[tuple[int, int]]]

# A DFA paired with its first set (a dict keyed by label index).
DFAS = tuple[DFA, dict[int, int]]

def lam_sub(grammar: Grammar, node: RawNode) -> NL:
    """Build a ``Node`` from a raw symbol node.

    *node* is a ``(type, value, context, children)`` tuple whose children
    list must be present, i.e. it must describe a symbol rather than a
    token.  *grammar* is accepted only to match the ``Convert`` signature
    and is not used.
    """
    node_type, _, node_context, children = node
    assert children is not None
    return Node(type=node_type, children=children, context=node_context)

# A placeholder node, used when the parser is backtracking: stack entries
# carry it instead of a real node so that no tree nodes are created or
# modified during lookahead.
DUMMY_NODE = (-1, None, None, None)

def stack_copy(
    stack: list[tuple[DFAS, int, RawNode]],
) -> list[tuple[DFAS, int, RawNode]]:
    """Return a nodeless copy of *stack*.

    The result is a new list with one entry per entry of *stack*; each keeps
    the original DFA and state but has its node replaced by ``DUMMY_NODE``.
    """
    return [(entry_dfa, entry_state, DUMMY_NODE) for entry_dfa, entry_state, _ in stack]

class Recorder:
    """Feed tokens to several candidate labels and find the one that survives.

    Each candidate label ("route") gets its own nodeless copy of the parser
    stack.  Tokens are replayed on every route that is still alive, with the
    parser switched into backtracking mode; a route that raises ``ParseError``
    is marked dead.  ``determine_route()`` reports the winner once it is known.
    """

    def __init__(self, parser: "Parser", ilabels: list[int], context: Context) -> None:
        self.parser = parser
        self._ilabels = ilabels
        # Passed through to the parser calls and to ParseError; it plays no
        # part in choosing a route.
        self.context = context

        self._dead_ilabels: set[int] = set()
        # The route that failed most recently.  Routes die as more tokens are
        # fed, so this is the one that got furthest; it is what gets reported
        # when every route is dead.  (A set has no usable order for this.)
        self._last_dead_ilabel: Optional[int] = None
        self._start_point = self.parser.stack
        self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels}

    @property
    def ilabels(self) -> set[int]:
        """The labels whose routes are still alive."""
        # Dead labels are always a subset of the candidates, so the symmetric
        # difference is exactly "candidates minus dead".
        return self._dead_ilabels.symmetric_difference(self._ilabels)

    @contextmanager
    def switch_to(self, ilabel: int) -> Iterator[None]:
        """Run the body on the stack copy that belongs to *ilabel*.

        A ``ParseError`` raised by the body is swallowed and marks the route
        dead.  The parser's original stack is restored on exit in all cases.
        """
        with self.backtrack():
            self.parser.stack = self._points[ilabel]
            try:
                yield
            except ParseError:
                self._dead_ilabels.add(ilabel)
                self._last_dead_ilabel = ilabel
            finally:
                self.parser.stack = self._start_point

    @contextmanager
    def backtrack(self) -> Iterator[None]:
        """
        Use the node-level invariant ones for basic parsing operations (push/pop/shift).
        These still will operate on the stack; but they won't create any new nodes, or
        modify the contents of any other existing nodes.

        This saves us a ton of time when we are backtracking, since we
        want to restore to the initial state as quick as possible, which
        can only be done by having as few mutations as possible.
        """
        is_backtracking = self.parser.is_backtracking
        try:
            self.parser.is_backtracking = True
            yield
        finally:
            # Restore the previous value rather than False so that nested
            # recorders keep the outer backtracking state intact.
            self.parser.is_backtracking = is_backtracking

    def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None:
        """Feed one token to every route that is still alive.

        With ``raw=True`` the token is added under the route's own label via
        ``Parser._addtoken``; otherwise it goes through ``Parser.addtoken``,
        which classifies the token itself.
        """
        for ilabel in self.ilabels:
            with self.switch_to(ilabel):
                if raw:
                    self.parser._addtoken(ilabel, tok_type, tok_val, self.context)
                else:
                    self.parser.addtoken(tok_type, tok_val, self.context)

    def determine_route(
        self, value: Optional[str] = None, force: bool = False
    ) -> Optional[int]:
        """Return the winning label, or None while the choice is still open.

        A label is returned when exactly one route is alive, or when *force*
        is true and at least one route is alive (an arbitrary live one is
        picked).  Raises ``ParseError`` when every route is dead, reporting
        the route that failed last together with *value*.
        """
        alive_ilabels = self.ilabels
        if not alive_ilabels:
            raise ParseError("bad input", self._last_dead_ilabel, value, self.context)

        ilabel, *rest = alive_ilabels
        if force or not rest:
            return ilabel
        return None

112

113

class ParseError(Exception):
    """Raised when the parser cannot make progress on the input.

    The pieces of the report are kept as the attributes ``msg``, ``type``,
    ``value`` and ``context``; the exception text combines all four.
    """

    def __init__(
        self, msg: str, type: Optional[int], value: Optional[str], context: Context
    ) -> None:
        self.msg = msg
        self.type = type
        self.value = value
        self.context = context
        message = f"{msg}: type={type!r}, value={value!r}, context={context!r}"
        super().__init__(message)

127

128

class Parser:
    """Parser engine.

    The proper usage sequence is:

    p = Parser(grammar, [converter])  # create instance
    p.setup(proxy, [start])           # prepare for parsing
    <for each input token>:
        if p.addtoken(...):           # parse a token; may raise ParseError
            break
    root = p.rootnode                 # root of abstract syntax tree

    A Parser instance may be reused by calling setup() repeatedly.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See driver.py for how to get input tokens by tokenizing a file or
    string.

    Parsing is complete when addtoken() returns True; the root of the
    abstract syntax tree can then be retrieved from the rootnode
    instance variable.  When a syntax error occurs, addtoken() raises
    the ParseError exception.  There is no error recovery; the parser
    cannot be used after a syntax error was reported (but it can be
    reinitialized by calling setup()).

    """

    def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None:
        """Constructor.

        The grammar argument is a grammar.Grammar instance; see the
        grammar module for more information.

        The parser is not ready yet for parsing; you must call the
        setup() method to get it started.

        The optional convert argument is a function mapping concrete
        syntax tree nodes to abstract syntax tree nodes.  If not
        given, no conversion is done and the syntax tree produced is
        the concrete syntax tree.  If given, it must be a function of
        two arguments, the first being the grammar (a grammar.Grammar
        instance), and the second being the concrete syntax tree node
        to be converted.  The syntax tree is converted from the bottom
        up.

        **post-note: the convert argument is ignored since for Black's
        usage, convert will always be blib2to3.pytree.convert. Allowing
        this to be dynamic hurts mypyc's ability to use early binding.
        These docs are left for historical and informational value.

        A concrete syntax tree node is a (type, value, context, nodes)
        tuple, where type is the node type (a token or symbol number),
        value is None for symbols and a string for tokens, context is
        None or an opaque value used for error reporting (typically a
        (lineno, offset) pair), and nodes is a list of children for
        symbols, and None for tokens.

        An abstract syntax tree node may be anything; this is entirely
        up to the converter function.

        """
        self.grammar = grammar
        # See note in docstring above. TL;DR this is ignored: shift() and
        # pop() call the module-level convert directly.
        self.convert = convert or lam_sub
        self.is_backtracking = False
        self.last_token: Optional[int] = None

    def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None:
        """Prepare for parsing.

        This *must* be called before starting to parse; it creates the
        stack, rootnode, used_names and proxy attributes that the other
        methods rely on.

        The proxy argument is the token source that addtoken() uses to
        look ahead when a token can be classified in more than one way.

        The optional start argument is an alternative start symbol; it
        defaults to the grammar's start symbol.

        You can use a Parser instance to parse any number of programs;
        each time you call setup() the parser is reset to an initial
        state determined by the (implicit or explicit) start symbol.

        """
        if start is None:
            start = self.grammar.start
        # Each stack entry is a tuple: (dfa, state, node).
        # A node is a tuple: (type, value, context, children),
        # where children is a list of nodes or None, and context may be None.
        newnode: RawNode = (start, None, None, [])
        stackentry = (self.grammar.dfas[start], 0, newnode)
        self.stack: list[tuple[DFAS, int, RawNode]] = [stackentry]
        self.rootnode: Optional[NL] = None
        self.used_names: set[str] = set()
        self.proxy = proxy
        self.last_token = None

    def addtoken(self, type: int, value: str, context: Context) -> bool:
        """Add a token; return True iff this is the end of the program.

        Raises ParseError when the token cannot be classified or no
        transition accepts it.
        """
        # Map from token to label
        ilabels = self.classify(type, value, context)
        assert len(ilabels) >= 1

        # If we have only one state to advance, we'll directly
        # take it as is.
        if len(ilabels) == 1:
            [ilabel] = ilabels
            return self._addtoken(ilabel, type, value, context)

        # If there are multiple states which we can advance (only
        # happen under soft-keywords), then we will try all of them
        # in parallel and as soon as one state can reach further than
        # the rest, we'll choose that one. This is a pretty hacky
        # and hopefully temporary algorithm.
        #
        # For a more detailed explanation, check out this post:
        # https://tree.science/what-the-backtracking.html

        with self.proxy.release() as proxy:
            counter, force = 0, False
            recorder = Recorder(self, ilabels, context)
            recorder.add_token(type, value, raw=True)

            next_token_value = value
            while recorder.determine_route(next_token_value) is None:
                if not proxy.can_advance(counter):
                    # Out of lookahead: settle for any route still alive.
                    force = True
                    break

                next_token_type, next_token_value, *_ = proxy.eat(counter)
                if next_token_type in (tokenize.COMMENT, tokenize.NL):
                    # These tokens are skipped; they cannot tell routes apart.
                    counter += 1
                    continue

                if next_token_type == tokenize.OP:
                    next_token_type = grammar.opmap[next_token_value]

                recorder.add_token(next_token_type, next_token_value)
                counter += 1

            ilabel = cast(int, recorder.determine_route(next_token_value, force=force))
            assert ilabel is not None

        return self._addtoken(ilabel, type, value, context)

    def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool:
        """Advance the parser with a token already mapped to *ilabel*.

        Returns True when the stack becomes empty after shifting the token
        (parsing is complete) and False otherwise.  Raises ParseError with
        "bad input" when no arc accepts the label, or "too much input" when
        the stack empties before the token could be shifted.
        """
        # Loop until the token is shifted; may raise exceptions
        while True:
            dfa, state, _ = self.stack[-1]
            states = dfa[0]
            arcs = states[state]
            # Look for a state with this label
            for i, newstate in arcs:
                t = self.grammar.labels[i][0]
                if t >= 256:
                    # See if it's a symbol and if we're in its first set
                    itsdfa = self.grammar.dfas[t]
                    if ilabel in itsdfa[1]:
                        # Push a symbol
                        self.push(t, itsdfa, newstate, context)
                        break  # To continue the outer while loop

                elif ilabel == i:
                    # Look it up in the list of labels
                    # Shift a token; we're done with it
                    self.shift(type, value, newstate, context)
                    # Pop while we are in an accept-only state
                    state = newstate
                    while states[state] == [(0, state)]:
                        self.pop()
                        if not self.stack:
                            # Done parsing!
                            return True
                        dfa, state, _ = self.stack[-1]
                        states = dfa[0]
                    # Done with this token
                    self.last_token = type
                    return False

            else:
                # No arc matched (the for loop ended without a break).
                if (0, state) in arcs:
                    # An accepting state, pop it and try something else
                    self.pop()
                    if not self.stack:
                        # Done parsing, but another token is input
                        raise ParseError("too much input", type, value, context)
                else:
                    # No success finding a transition
                    raise ParseError("bad input", type, value, context)

    def classify(self, type: int, value: str, context: Context) -> list[int]:
        """Turn a token into a label.  (Internal)

        Depending on whether the value is a soft-keyword or not,
        this function may return multiple labels to choose from.

        Raises ParseError ("bad token") when the token type has no label
        in the grammar.
        """
        if type == token.NAME:
            # Keep a listing of all used names
            self.used_names.add(value)
            # Check for reserved words
            if value in self.grammar.keywords:
                return [self.grammar.keywords[value]]
            elif value in self.grammar.soft_keywords:
                assert type in self.grammar.tokens
                # Current soft keywords (match, case, type) can only appear at the
                # beginning of a statement. So as a shortcut, don't try to treat them
                # like keywords in any other context.
                # ('_' is also a soft keyword in the real grammar, but for our grammar
                # it's just an expression, so we don't need to treat it specially.)
                if self.last_token not in (
                    None,
                    token.INDENT,
                    token.DEDENT,
                    token.NEWLINE,
                    token.SEMI,
                    token.COLON,
                ):
                    return [self.grammar.tokens[type]]
                return [
                    self.grammar.tokens[type],
                    self.grammar.soft_keywords[value],
                ]

        ilabel = self.grammar.tokens.get(type)
        if ilabel is None:
            raise ParseError("bad token", type, value, context)
        return [ilabel]

    def shift(self, type: int, value: str, newstate: int, context: Context) -> None:
        """Shift a token.  (Internal)

        Moves the top stack entry to *newstate*.  Outside backtracking the
        token is also converted and appended to the current node's children.
        """
        if self.is_backtracking:
            dfa = self.stack[-1][0]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
        else:
            dfa, _, node = self.stack[-1]
            rawnode: RawNode = (type, value, context, None)
            newnode = convert(self.grammar, rawnode)
            assert node[-1] is not None
            node[-1].append(newnode)
            self.stack[-1] = (dfa, newstate, node)

    def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None:
        """Push a nonterminal.  (Internal)

        Moves the top stack entry to *newstate* and pushes a new entry for
        *newdfa* in state 0.  Outside backtracking the new entry carries a
        fresh, childless raw node for the symbol *type*.
        """
        if self.is_backtracking:
            dfa = self.stack[-1][0]
            self.stack[-1] = (dfa, newstate, DUMMY_NODE)
            self.stack.append((newdfa, 0, DUMMY_NODE))
        else:
            dfa, _, node = self.stack[-1]
            newnode: RawNode = (type, None, context, [])
            self.stack[-1] = (dfa, newstate, node)
            self.stack.append((newdfa, 0, newnode))

    def pop(self) -> None:
        """Pop a nonterminal.  (Internal)

        Outside backtracking the popped node is converted and attached to
        its parent's children, or becomes rootnode (with used_names set on
        it) when the stack is now empty.
        """
        if self.is_backtracking:
            self.stack.pop()
        else:
            popnode = self.stack.pop()[2]
            newnode = convert(self.grammar, popnode)
            if self.stack:
                node = self.stack[-1][2]
                assert node[-1] is not None
                node[-1].append(newnode)
            else:
                self.rootnode = newnode
                self.rootnode.used_names = self.used_names