1"""This module implements an Earley parser.
3The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
4 https://www.sciencedirect.com/science/article/pii/S1571066108001497
6That is probably the best reference for understanding the algorithm here.
8The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
9is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
10"""
import typing

from collections import deque

from ..lexer import Token
from ..tree import Tree
from ..exceptions import UnexpectedEOF, UnexpectedToken
from ..utils import logger
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item
from .earley_forest import ForestSumVisitor, SymbolNode, TokenNode, ForestToParseTree

if typing.TYPE_CHECKING:
    from ..common import LexerConf, ParserConf


class Parser:
    lexer_conf: 'LexerConf'
    parser_conf: 'ParserConf'
    debug: bool

    def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher, resolve_ambiguity=True, debug=False, tree_class=Tree):
        analysis = GrammarAnalyzer(parser_conf)
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.debug = debug
        self.tree_class = tree_class

        self.FIRST = analysis.FIRST
        self.NULLABLE = analysis.NULLABLE
        self.callbacks = parser_conf.callbacks
        # TODO add typing info
        self.predictions = {}   # type: ignore[var-annotated]

        ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
        #  the slow 'isupper' in is_terminal.
        self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
        self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

        self.forest_sum_visitor = None
        for rule in parser_conf.rules:
            if rule.origin not in self.predictions:
                self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

            ## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
            #  the priorities will be stripped from all rules/terminals before they reach us, allowing us to
            #  skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
            #  on any rules/terminals.
            if self.forest_sum_visitor is None and rule.options.priority is not None:
                self.forest_sum_visitor = ForestSumVisitor

        # Check terminals for priorities
        # Ignore terminal priorities if the basic lexer is used
        if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
            for term in self.lexer_conf.terminals:
                if term.priority:
                    self.forest_sum_visitor = ForestSumVisitor
                    break

        self.term_matcher = term_matcher
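
    # Illustrative note (assumption: example grammar snippets, not from this module).
    # The rule/terminal priorities that enable ForestSumVisitor above come from
    # grammar declarations such as:
    #
    #     pair.2: "a" "b"
    #     WORD.3: /\w+/
    #
    # where the ".2" / ".3" suffixes are Lark's priority syntax. Without any such
    # declarations, forest_sum_visitor stays None and the extra forest walk is skipped.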

    def predict_and_complete(self, i, to_scan, columns, transitives):
        """The core Earley Predictor and Completer.

        At each stage of the input, we handle any completed items (things
        that matched on the last cycle) and use those to predict what should
        come next in the input stream. The completions and any predicted
        non-terminals are recursively processed until we reach a set of items
        that can be added to the scan list for the next scanner cycle."""
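        # Toy illustration (assumption: example grammar, not taken from this module).
        # For rules  start -> sum  and  sum -> NUMBER PLUS NUMBER  with input "1+2",
        # column 0 predicts  start -> . sum  and  sum -> . NUMBER PLUS NUMBER ; the
        # NUMBER item goes to the scan list. Once the last NUMBER is scanned, the
        # completer closes  sum -> NUMBER PLUS NUMBER .  and then  start -> sum . ,
        # recording a SymbolNode for each completed span in the SPPF.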
        # Held Completions (H in E. Scott's paper).
        node_cache = {}
        held_completions = {}

        column = columns[i]
        # R (items) = Ei (column.items)
        items = deque(column)
        while items:
            item = items.pop()    # remove an element, A say, from R

            ### The Earley completer
            if item.is_complete:   ### (item.s == string)
                if item.node is None:
                    label = (item.s, item.start, i)
                    item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    item.node.add_family(item.s, item.rule, item.start, None, None)

                # create_leo_transitives(item.rule.origin, item.start)

                ###R Joop Leo right recursion Completer
                if item.rule.origin in transitives[item.start]:
                    transitive = transitives[item.start][item.s]
                    if transitive.previous in transitives[transitive.column]:
                        root_transitive = transitives[transitive.column][transitive.previous]
                    else:
                        root_transitive = transitive

                    new_item = Item(transitive.rule, transitive.ptr, transitive.start)
                    label = (root_transitive.s, root_transitive.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    new_item.node.add_path(root_transitive, item.node)
                    if new_item.expect in self.TERMINALS:
                        # Add (B :: aC.B, h, y) to Q
                        to_scan.add(new_item)
                    elif new_item not in column:
                        # Add (B :: aC.B, h, y) to Ei and R
                        column.add(new_item)
                        items.append(new_item)
                ###R Regular Earley completer
                else:
                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E.Scott's paper.
                    is_empty_item = item.start == i
                    if is_empty_item:
                        held_completions[item.rule.origin] = item.node

                    originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                    for originator in originators:
                        new_item = originator.advance()
                        label = (new_item.s, originator.start, i)
                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                        new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
                        if new_item.expect in self.TERMINALS:
                            # Add (B :: aC.B, h, y) to Q
                            to_scan.add(new_item)
                        elif new_item not in column:
                            # Add (B :: aC.B, h, y) to Ei and R
                            column.add(new_item)
                            items.append(new_item)

            ### The Earley predictor
            elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                new_items = []
                for rule in self.predictions[item.expect]:
                    new_item = Item(rule, 0, i)
                    new_items.append(new_item)

                # Process any held completions (H).
                if item.expect in held_completions:
                    new_item = item.advance()
                    label = (new_item.s, item.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                    new_items.append(new_item)

                for new_item in new_items:
                    if new_item.expect in self.TERMINALS:
                        to_scan.add(new_item)
                    elif new_item not in column:
                        column.add(new_item)
                        items.append(new_item)

    def _parse(self, lexer, columns, to_scan, start_symbol=None):
        def is_quasi_complete(item):
            if item.is_complete:
                return True

            quasi = item.advance()
            while not quasi.is_complete:
                if quasi.expect not in self.NULLABLE:
                    return False
                if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
                    return False
                quasi = quasi.advance()
            return True
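
        # Descriptive note: an item is "quasi-complete" when every symbol left to
        # match is nullable, i.e. it could complete without consuming more input.
        # This predicate is part of the Joop Leo right-recursion machinery (see the
        # removed create_leo_transitives reference below).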

        # def create_leo_transitives(origin, start):
        #    ...   # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420

        def scan(i, token, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
            Lark lexer to match tokens. The scan list is built by the
            Earley predictor, based on the previously completed tokens.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""
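            # Orientation note (assumption about the callable's contract): `match`
            # below is the term_matcher passed to Parser.__init__; it answers
            # "does this token satisfy this terminal?", roughly equivalent to
            # item.expect.name == token.type for tokens from a standard lexer.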
            next_to_scan = set()
            next_set = set()
            columns.append(next_set)
            transitives.append({})
            node_cache = {}

            for item in set(to_scan):
                if match(item.expect, token):
                    new_item = item.advance()
                    label = (new_item.s, new_item.start, i)
                    # 'terminals' may not contain token.type when using %declare
                    # Additionally, token is not always a Token
                    # For example, it can be a Tree when using TreeMatcher
                    term = terminals.get(token.type) if isinstance(token, Token) else None
                    # Set the priority of the token node to 0 so that the
                    # terminal priorities do not affect the Tree chosen by
                    # ForestSumVisitor after the basic lexer has already
                    # "used up" the terminal priorities
                    token_node = TokenNode(token, term, priority=0)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)

                    if new_item.expect in self.TERMINALS:
                        # add (B ::= Aai+1.B, h, y) to Q'
                        next_to_scan.add(new_item)
                    else:
                        # add (B ::= Aa+1.B, h, y) to Ei+1
                        next_set.add(new_item)

            if not next_set and not next_to_scan:
                expect = {i.expect.name for i in to_scan}
                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan

        # Define parser functions
        match = self.term_matcher

        terminals = self.lexer_conf.terminals_by_name

        # Cache for nodes & tokens created in a particular parse step.
        transitives = [{}]

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
        expects = {i.expect for i in to_scan}
        i = 0
        for token in lexer.lex(expects):
            self.predict_and_complete(i, to_scan, columns, transitives)

            to_scan = scan(i, token, to_scan)
            i += 1

            expects.clear()
            expects |= {i.expect for i in to_scan}

        self.predict_and_complete(i, to_scan, columns, transitives)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan
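
    # Worked trace (illustrative, not from the original source): for a 3-token
    # input the loop above builds 4 Earley sets. columns[0] holds the start
    # predictions, columns[k] holds items advanced over token k-1, and the final
    # predict_and_complete call closes out the last column so that parse() below
    # can find a completed start item there.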

    def parse(self, lexer, start):
        assert start, start
        start_symbol = NonTerminal(start)

        columns = [set()]
        to_scan = set()     # The scan buffer. 'Q' in E.Scott's paper.

        ## Predict for the start_symbol.
        # Add predicted items to the first Earley set (for the predictor) if they
        # result in a non-terminal, or the scanner if they result in a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, 0)
            if item.expect in self.TERMINALS:
                to_scan.add(item)
            else:
                columns[0].add(item)

        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

        # If the parse was successful, the start symbol should have been completed
        # in the last step of the Earley cycle, and will be in this column.
        # Find the item for the start_symbol, which is the root of the SPPF tree.
        solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
        if not solutions:
            expected_terminals = [t.expect.name for t in to_scan]
            raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
            try:
                debug_walker = ForestToPyDotVisitor()
            except ImportError:
                logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
            else:
                debug_walker.visit(solutions[0], "sppf.png")

        if len(solutions) > 1:
            assert False, 'Earley should not generate multiple start symbol items!'

        if self.tree_class is not None:
            # Perform our SPPF -> AST conversion
            transformer = ForestToParseTree(self.tree_class, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity)
            return transformer.transform(solutions[0])

        # return the root of the SPPF
        return solutions[0]
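
# Sketch of how ambiguity handling surfaces to users (assumption: illustrative,
# not part of this module). With resolve_ambiguity=True, the default reached via
# Lark(..., ambiguity='resolve'), ForestToParseTree picks a single parse tree;
# with Lark(..., ambiguity='explicit') ambiguous regions are kept as '_ambig'
# nodes in the returned tree instead.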