1"""This module implements an Earley parser.
2
3The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
4 https://www.sciencedirect.com/science/article/pii/S1571066108001497
5
6That is probably the best reference for understanding the algorithm here.
7
8The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
9is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
10"""

from typing import TYPE_CHECKING, Callable, Optional, List, Any
from collections import deque

from ..lexer import Token
from ..tree import Tree
from ..exceptions import UnexpectedEOF, UnexpectedToken
from ..utils import logger, OrderedSet, dedup_list
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item
from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree

if TYPE_CHECKING:
    from ..common import LexerConf, ParserConf

class Parser:
    lexer_conf: 'LexerConf'
    parser_conf: 'ParserConf'
    debug: bool

    def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
                 resolve_ambiguity: bool=True, debug: bool=False,
                 tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
        analysis = GrammarAnalyzer(parser_conf)
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.debug = debug
        self.Tree = tree_class
        self.Set = OrderedSet if ordered_sets else set
        self.SymbolNode = StableSymbolNode if ordered_sets else SymbolNode

        self.FIRST = analysis.FIRST
        self.NULLABLE = analysis.NULLABLE
        self.callbacks = parser_conf.callbacks
        # TODO add typing info
        self.predictions = {}   # type: ignore[var-annotated]

        ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
        # the slow 'isupper' in is_terminal.
        self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
        self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

        self.forest_sum_visitor = None
        for rule in parser_conf.rules:
            if rule.origin not in self.predictions:
                self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

            ## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
            # the priorities will be stripped from all rules/terminals before they reach us, allowing us to
            # skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
            # on any rules/terminals.
            if self.forest_sum_visitor is None and rule.options.priority is not None:
                self.forest_sum_visitor = ForestSumVisitor

        # Check terminals for priorities
        # Ignore terminal priorities if the basic lexer is used
        if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
            for term in self.lexer_conf.terminals:
                if term.priority:
                    self.forest_sum_visitor = ForestSumVisitor
                    break
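        # (In lark's grammar syntax, declarations such as  rule.2: ...  and
        # TERMINAL.5: "..."  are what attach these priorities.)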

        self.term_matcher = term_matcher


    def predict_and_complete(self, i, to_scan, columns, transitives, node_cache):
        """The core Earley Predictor and Completer.

        At each stage of the input, we handle any completed items (things
        that matched on the last cycle) and use those to predict what should
        come next in the input stream. The completions and any predicted
        non-terminals are recursively processed until we reach a set of
        terminals, which can be added to the scan list for the next scanner
        cycle."""
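        # Illustrative walk-through (grammar invented for this comment): with
        # rules  S -> A  and  A -> 'a'  at step i == 0, predicting S puts
        # (A -> .'a', 0) on the scan list; once 'a' has been scanned, the
        # completer sees (A -> 'a'., 0) and advances (S -> .A, 0) to
        # (S -> A., 0), which goes back onto R for further processing.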
        # Held Completions (H in E. Scott's paper).
        held_completions = {}

        column = columns[i]
        # R (items) = Ei (column.items)
        items = deque(column)
        while items:
            item = items.pop()    # remove an element, A say, from R

            ### The Earley completer
            if item.is_complete:   ### (item.s == string)
                if item.node is None:
                    label = (item.s, item.start, i)
                    item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    item.node.add_family(item.s, item.rule, item.start, None, None)

                # create_leo_transitives(item.rule.origin, item.start)

                ###R Joop Leo right recursion Completer
                if item.rule.origin in transitives[item.start]:
                    transitive = transitives[item.start][item.s]
                    if transitive.previous in transitives[transitive.column]:
                        root_transitive = transitives[transitive.column][transitive.previous]
                    else:
                        root_transitive = transitive

                    new_item = Item(transitive.rule, transitive.ptr, transitive.start)
                    label = (root_transitive.s, root_transitive.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_path(root_transitive, item.node)
                    if new_item.expect in self.TERMINALS:
                        # Add (B :: aC.B, h, y) to Q
                        to_scan.add(new_item)
                    elif new_item not in column:
                        # Add (B :: aC.B, h, y) to Ei and R
                        column.add(new_item)
                        items.append(new_item)
                ###R Regular Earley completer
                else:
                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E. Scott's paper.
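                    # For example, given rules  S -> A S'  and  A -> (empty):
                    # completing the empty A at step i records its node in H,
                    # so an item predicting A later in this same step is
                    # advanced immediately instead of waiting for another
                    # completer pass.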
                    is_empty_item = item.start == i
                    if is_empty_item:
                        held_completions[item.rule.origin] = item.node

                    originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                    for originator in originators:
                        new_item = originator.advance()
                        label = (new_item.s, originator.start, i)
                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                        new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
                        if new_item.expect in self.TERMINALS:
                            # Add (B :: aC.B, h, y) to Q
                            to_scan.add(new_item)
                        elif new_item not in column:
                            # Add (B :: aC.B, h, y) to Ei and R
                            column.add(new_item)
                            items.append(new_item)

            ### The Earley predictor
            elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                new_items = []
                for rule in self.predictions[item.expect]:
                    new_item = Item(rule, 0, i)
                    new_items.append(new_item)

                # Process any held completions (H).
                if item.expect in held_completions:
                    new_item = item.advance()
                    label = (new_item.s, item.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                    new_items.append(new_item)

                for new_item in new_items:
                    if new_item.expect in self.TERMINALS:
                        to_scan.add(new_item)
                    elif new_item not in column:
                        column.add(new_item)
                        items.append(new_item)

    def _parse(self, lexer, columns, to_scan, start_symbol=None):

        def is_quasi_complete(item):
            if item.is_complete:
                return True

            quasi = item.advance()
            while not quasi.is_complete:
                if quasi.expect not in self.NULLABLE:
                    return False
                if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
                    return False
                quasi = quasi.advance()
            return True
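        # (`is_quasi_complete` above, by example: with NULLABLE = {B, C}, an
        # item (A -> x.B C, h) is quasi-complete, since every symbol remaining
        # after the dot can derive the empty string. Joop Leo's right-recursion
        # optimization may treat such items as effectively complete.)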

        # def create_leo_transitives(origin, start):
        #    ...   # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420

        def scan(i, token, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
            Lark lexer to match tokens. The scan list is built by the
            Earley predictor, based on the previously completed tokens.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""
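            # Note: `match` is the pluggable term_matcher handed to this
            # Parser; for lark's basic lexer it is essentially
            # `lambda term, token: term.name == token.type`.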
            next_to_scan = self.Set()
            next_set = self.Set()
            columns.append(next_set)
            transitives.append({})
            node_cache = {}

            for item in self.Set(to_scan):
                if match(item.expect, token):
                    new_item = item.advance()
                    label = (new_item.s, new_item.start, i + 1)
                    # 'terminals' may not contain token.type when using %declare
                    # Additionally, token is not always a Token
                    # For example, it can be a Tree when using TreeMatcher
                    term = terminals.get(token.type) if isinstance(token, Token) else None
                    # Set the priority of the token node to 0 so that the
                    # terminal priorities do not affect the Tree chosen by
                    # ForestSumVisitor after the basic lexer has already
                    # "used up" the terminal priorities
                    token_node = TokenNode(token, term, priority=0)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)

                    if new_item.expect in self.TERMINALS:
                        # add (B ::= Aai+1.B, h, y) to Q'
                        next_to_scan.add(new_item)
                    else:
                        # add (B ::= Aai+1.B, h, y) to Ei+1
                        next_set.add(new_item)

            if not next_set and not next_to_scan:
                expect = {i.expect.name for i in to_scan}
                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan, node_cache


        # Define parser functions
        match = self.term_matcher

        terminals = self.lexer_conf.terminals_by_name

        # Cache for nodes & tokens created in a particular parse step.
        transitives = [{}]

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
        expects = {i.expect for i in to_scan}
        i = 0
        node_cache = {}
        for token in lexer.lex(expects):
            self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

            to_scan, node_cache = scan(i, token, to_scan)
            i += 1

            expects.clear()
            expects |= {i.expect for i in to_scan}

        self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan

    def parse(self, lexer, start):
        assert start, start
        start_symbol = NonTerminal(start)

        columns = [self.Set()]
        to_scan = self.Set()     # The scan buffer. 'Q' in E. Scott's paper.

        ## Predict for the start_symbol.
        # Add predicted items to the first Earley set (for the predictor) if they
        # result in a non-terminal, or the scanner if they result in a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, 0)
            if item.expect in self.TERMINALS:
                to_scan.add(item)
            else:
                columns[0].add(item)

        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

        # If the parse was successful, the start symbol should have been completed
        # in the last step of the Earley cycle, and will be in this column. Find
        # the item for the start_symbol, which is the root of the SPPF tree.
        solutions = dedup_list(n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0)
        if not solutions:
            expected_terminals = [t.expect.name for t in to_scan]
            raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))
        if len(solutions) > 1:
            raise RuntimeError('Earley should not generate multiple start symbol items! Please report this bug.')
        solution, = solutions

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
            try:
                debug_walker = ForestToPyDotVisitor()
            except ImportError:
                logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
            else:
                debug_walker.visit(solution, "sppf.png")

        if self.Tree is not None:
            # Perform our SPPF -> AST conversion
            # Disable the ForestToParseTree cache when ambiguity='resolve'
            # to prevent a tree construction bug. See issue #1283
            use_cache = not self.resolve_ambiguity
            transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity, use_cache)
            return transformer.transform(solution)

        # return the root of the SPPF
        return solution