1"""This module implements an Earley parser.
2
3The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
4 https://www.sciencedirect.com/science/article/pii/S1571066108001497
5
6That is probably the best reference for understanding the algorithm here.
7
8The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
9is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
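
For orientation, a minimal usage sketch (the grammar below is made up; this class
is normally constructed for you by the Lark front-end rather than used directly):

    from lark import Lark

    parser = Lark(r'''
        start: "a"+
    ''', parser='earley', ambiguity='resolve')
    tree = parser.parse("aaa")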
10"""
11
12from typing import TYPE_CHECKING, Callable, Optional, List, Any
13from collections import deque
14
15from ..lexer import Token
16from ..tree import Tree
17from ..exceptions import UnexpectedEOF, UnexpectedToken
18from ..utils import logger, OrderedSet, dedup_list
19from .grammar_analysis import GrammarAnalyzer
20from ..grammar import NonTerminal
21from .earley_common import Item
22from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree
23
24if TYPE_CHECKING:
25 from ..common import LexerConf, ParserConf
26
27class Parser:
28 lexer_conf: 'LexerConf'
29 parser_conf: 'ParserConf'
30 debug: bool
31
32 def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
33 resolve_ambiguity: bool=True, debug: bool=False,
34 tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
35 analysis = GrammarAnalyzer(parser_conf)
36 self.lexer_conf = lexer_conf
37 self.parser_conf = parser_conf
38 self.resolve_ambiguity = resolve_ambiguity
39 self.debug = debug
40 self.Tree = tree_class
41 self.Set = OrderedSet if ordered_sets else set
42 self.SymbolNode = StableSymbolNode if ordered_sets else SymbolNode
43
44 self.FIRST = analysis.FIRST
45 self.NULLABLE = analysis.NULLABLE
46 self.callbacks = parser_conf.callbacks
47 # TODO add typing info
48 self.predictions = {} # type: ignore[var-annotated]
49
50 ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
51 # the slow 'isupper' in is_terminal.
52 self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
53 self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }
54
55 self.forest_sum_visitor = None
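        # Pre-compute the prediction table: for each rule origin, the full list of
        # rules the predictor must introduce whenever that nonterminal is expected.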
        for rule in parser_conf.rules:
            if rule.origin not in self.predictions:
                self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

            ## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
            #  the priorities will be stripped from all rules/terminals before they reach us, allowing us to
            #  skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
            #  on any rules/terminals.
            if self.forest_sum_visitor is None and rule.options.priority is not None:
                self.forest_sum_visitor = ForestSumVisitor

        # Check terminals for priorities
        # Ignore terminal priorities if the basic lexer is used
        if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
            for term in self.lexer_conf.terminals:
                if term.priority:
                    self.forest_sum_visitor = ForestSumVisitor
                    break

        self.term_matcher = term_matcher


    def predict_and_complete(self, i, to_scan, columns, transitives):
79 """The core Earley Predictor and Completer.
80
81 At each stage of the input, we handling any completed items (things
82 that matched on the last cycle) and use those to predict what should
83 come next in the input stream. The completions and any predicted
84 non-terminals are recursively processed until we reach a set of,
85 which can be added to the scan list for the next scanner cycle."""
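        # node_cache maps SPPF labels (s, start, end) to nodes, so that equal
        # derivation spans share a single node -- the "shared" in Shared Packed
        # Parse Forest.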
        node_cache = {}
        # Held completions ('H' in E. Scott's paper).
        held_completions = {}

        column = columns[i]
        # R (items) = Ei (column.items)
        items = deque(column)
        while items:
            item = items.pop()    # remove an element, A say, from R

            ### The Earley completer
            if item.is_complete:   ### (item.s == string)
                if item.node is None:
                    label = (item.s, item.start, i)
                    item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    item.node.add_family(item.s, item.rule, item.start, None, None)

                # create_leo_transitives(item.rule.origin, item.start)

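                # Joop Leo's right-recursion optimization: completion chains through
                # right-recursive rules are collapsed into memoized "transitive" items,
                # keeping such chains linear instead of quadratic (Leo, 1991).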
                ###R Joop Leo right recursion Completer
                if item.rule.origin in transitives[item.start]:
                    transitive = transitives[item.start][item.s]
                    if transitive.previous in transitives[transitive.column]:
                        root_transitive = transitives[transitive.column][transitive.previous]
                    else:
                        root_transitive = transitive

                    new_item = Item(transitive.rule, transitive.ptr, transitive.start)
                    label = (root_transitive.s, root_transitive.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_path(root_transitive, item.node)
                    if new_item.expect in self.TERMINALS:
                        # Add (B ::= aC.B, h, y) to Q
                        to_scan.add(new_item)
                    elif new_item not in column:
                        # Add (B ::= aC.B, h, y) to Ei and R
                        column.add(new_item)
                        items.append(new_item)
                ###R Regular Earley completer
                else:
                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E. Scott's paper.
                    is_empty_item = item.start == i
                    if is_empty_item:
                        held_completions[item.rule.origin] = item.node

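                    # The classic Earley completion: advance every item in this item's
                    # origin column that was waiting on the symbol we just completed.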
                    originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                    for originator in originators:
                        new_item = originator.advance()
                        label = (new_item.s, originator.start, i)
                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                        new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
                        if new_item.expect in self.TERMINALS:
                            # Add (B ::= aC.B, h, y) to Q
                            to_scan.add(new_item)
                        elif new_item not in column:
                            # Add (B ::= aC.B, h, y) to Ei and R
                            column.add(new_item)
                            items.append(new_item)

            ### The Earley predictor
            elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                new_items = []
                for rule in self.predictions[item.expect]:
                    new_item = Item(rule, 0, i)
                    new_items.append(new_item)

                # Process any held completions (H).
                if item.expect in held_completions:
                    new_item = item.advance()
                    label = (new_item.s, item.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                    new_items.append(new_item)

                for new_item in new_items:
                    if new_item.expect in self.TERMINALS:
                        to_scan.add(new_item)
                    elif new_item not in column:
                        column.add(new_item)
                        items.append(new_item)

    def _parse(self, lexer, columns, to_scan, start_symbol=None):

        def is_quasi_complete(item):
            if item.is_complete:
                return True

            quasi = item.advance()
            while not quasi.is_complete:
                if quasi.expect not in self.NULLABLE:
                    return False
                if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
                    return False
                quasi = quasi.advance()
            return True
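
        # NOTE: is_quasi_complete has no callers while create_leo_transitives (below)
        # is disabled; it is kept for when that optimization is restored.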

        # def create_leo_transitives(origin, start):
        #    ...   # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420

        def scan(i, token, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
            Lark lexer to match tokens. The scan list is built by the
            Earley predictor, based on the previously completed items.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""
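            # next_to_scan is Q' and next_set is E_{i+1} in Scott's notation: an item
            # advanced over this token goes to Q' if it now expects a terminal, and
            # into the next Earley set otherwise.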
            next_to_scan = self.Set()
            next_set = self.Set()
            columns.append(next_set)
            transitives.append({})
            node_cache = {}

            for item in self.Set(to_scan):
                if match(item.expect, token):
                    new_item = item.advance()
                    label = (new_item.s, new_item.start, i)
                    # 'terminals' may not contain token.type when using %declare
                    # Additionally, token is not always a Token
                    # For example, it can be a Tree when using TreeMatcher
                    term = terminals.get(token.type) if isinstance(token, Token) else None
                    # Set the priority of the token node to 0 so that the
                    # terminal priorities do not affect the Tree chosen by
                    # ForestSumVisitor after the basic lexer has already
                    # "used up" the terminal priorities
                    token_node = TokenNode(token, term, priority=0)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)

                    if new_item.expect in self.TERMINALS:
                        # add (B ::= alpha a_{i+1} . beta, h, y) to Q'
                        next_to_scan.add(new_item)
                    else:
                        # add (B ::= alpha a_{i+1} . beta, h, y) to E_{i+1}
                        next_set.add(new_item)

            if not next_set and not next_to_scan:
                expect = {i.expect.name for i in to_scan}
                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan


        # Define parser functions
        match = self.term_matcher

        terminals = self.lexer_conf.terminals_by_name

        # Transitive-item cache for Joop Leo's right-recursion optimization,
        # one dict per Earley set/column.
        transitives = [{}]

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
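        # 'expects' exposes the set of currently-expected terminals to the lexer at
        # each step; this is what enables Earley's context-sensitive (dynamic) lexing.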
        expects = {i.expect for i in to_scan}
        i = 0
        for token in lexer.lex(expects):
            self.predict_and_complete(i, to_scan, columns, transitives)

            to_scan = scan(i, token, to_scan)
            i += 1

            expects.clear()
            expects |= {i.expect for i in to_scan}

        self.predict_and_complete(i, to_scan, columns, transitives)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan

    def parse(self, lexer, start):
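        """Parse the token stream provided by 'lexer', starting from rule 'start'.

        Returns the parse tree, or the root of the SPPF if 'tree_class' was None.
        With resolve_ambiguity=False, ambiguous input yields a '_ambig' tree node.
        """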
        assert start, start
        start_symbol = NonTerminal(start)

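        # columns[i] is the Earley set E_i in Scott's notation: one set of items
        # per input position, seeded below from the start symbol's predictions.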
        columns = [self.Set()]
        to_scan = self.Set()     # The scan buffer. 'Q' in E. Scott's paper.

        ## Predict for the start_symbol.
        # Add each predicted item to the first Earley set (for the predictor) if it
        # expects a non-terminal next, or to the scan buffer if it expects a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, 0)
            if item.expect in self.TERMINALS:
                to_scan.add(item)
            else:
                columns[0].add(item)

        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

        # If the parse was successful, the start symbol should have been completed
        # in the last step of the Earley cycle, and will be in this column. Find the
        # item for the start_symbol, which is the root of the SPPF tree.
        solutions = dedup_list(n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0)
        if not solutions:
            expected_terminals = [t.expect.name for t in to_scan]
            raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
            try:
                debug_walker = ForestToPyDotVisitor()
            except ImportError:
                logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
            else:
                for i, s in enumerate(solutions):
                    debug_walker.visit(s, f"sppf{i}.png")


        if self.Tree is not None:
            # Perform our SPPF -> AST conversion
            # Disable the ForestToParseTree cache when ambiguity='resolve'
            # to prevent a tree construction bug. See issue #1283
            use_cache = not self.resolve_ambiguity
            transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity, use_cache)
            solutions = [transformer.transform(s) for s in solutions]

            if len(solutions) > 1 and not self.resolve_ambiguity:
                t: Tree = self.Tree('_ambig', solutions)
                t.expand_kids_by_data('_ambig')     # solutions may themselves be _ambig nodes
                return t
            return solutions[0]

        # return the root of the SPPF
        # TODO return a list of solutions, or join them together somehow
        return solutions[0]