1"""This module implements an Earley parser.
3The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
4 https://www.sciencedirect.com/science/article/pii/S1571066108001497
6That is probably the best reference for understanding the algorithm here.
8The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
9is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
10"""
import typing

from collections import deque

from ..lexer import Token
from ..tree import Tree
from ..exceptions import UnexpectedEOF, UnexpectedToken
from ..utils import logger
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item
from .earley_forest import ForestSumVisitor, SymbolNode, TokenNode, ForestToParseTree

if typing.TYPE_CHECKING:
    from ..common import LexerConf, ParserConf


class Parser:
    lexer_conf: 'LexerConf'
    parser_conf: 'ParserConf'
    debug: bool

    def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher, resolve_ambiguity=True, debug=False, tree_class=Tree):
        analysis = GrammarAnalyzer(parser_conf)
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.debug = debug
        self.tree_class = tree_class

        self.FIRST = analysis.FIRST
        self.NULLABLE = analysis.NULLABLE
        self.callbacks = parser_conf.callbacks
        # TODO add typing info
        self.predictions = {}   # type: ignore[var-annotated]

        ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
        #  the slow 'isupper' in is_terminal.
        self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
        self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

        self.forest_sum_visitor = None
        for rule in parser_conf.rules:
            if rule.origin not in self.predictions:
                self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

            ## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
            #  the priorities will be stripped from all rules/terminals before they reach us, allowing us to
            #  skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
            #  on any rules/terminals.
            if self.forest_sum_visitor is None and rule.options.priority is not None:
                self.forest_sum_visitor = ForestSumVisitor

        # Check terminals for priorities
        # Ignore terminal priorities if the basic lexer is used
        if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
            for term in self.lexer_conf.terminals:
                if term.priority:
                    self.forest_sum_visitor = ForestSumVisitor
                    break

        self.term_matcher = term_matcher
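
    # Illustrative note (assumption: example grammar snippets, not from this module).
    # The rule/terminal priorities that enable ForestSumVisitor above come from
    # grammar declarations such as:
    #
    #     pair.2: "a" "b"
    #     WORD.3: /\w+/
    #
    # where the ".2" / ".3" suffixes are Lark's priority syntax. Without any such
    # declarations, forest_sum_visitor stays None and the extra forest walk is skipped.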

    def predict_and_complete(self, i, to_scan, columns, transitives):
        """The core Earley Predictor and Completer.

        At each stage of the input, we handle any completed items (things
        that matched on the last cycle) and use those to predict what should
        come next in the input stream. The completions and any predicted
        non-terminals are recursively processed until we reach a set of items
        that can be added to the scan list for the next scanner cycle."""
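        # Toy illustration (assumption: example grammar, not taken from this module).
        # For rules  start -> sum  and  sum -> NUMBER PLUS NUMBER  with input "1+2",
        # column 0 predicts  start -> . sum  and  sum -> . NUMBER PLUS NUMBER ; the
        # NUMBER item goes to the scan list. Once the last NUMBER is scanned, the
        # completer closes  sum -> NUMBER PLUS NUMBER .  and then  start -> sum . ,
        # recording a SymbolNode for each completed span in the SPPF.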
        # Held Completions (H in E. Scott's paper).
        node_cache = {}
        held_completions = {}

        column = columns[i]
        # R (items) = Ei (column.items)
        items = deque(column)
        while items:
            item = items.pop()    # remove an element, A say, from R

            ### The Earley completer
            if item.is_complete:   ### (item.s == string)
                if item.node is None:
                    label = (item.s, item.start, i)
                    item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    item.node.add_family(item.s, item.rule, item.start, None, None)

                # create_leo_transitives(item.rule.origin, item.start)

                ###R Joop Leo right recursion Completer
                if item.rule.origin in transitives[item.start]:
                    transitive = transitives[item.start][item.s]
                    if transitive.previous in transitives[transitive.column]:
                        root_transitive = transitives[transitive.column][transitive.previous]
                    else:
                        root_transitive = transitive

                    new_item = Item(transitive.rule, transitive.ptr, transitive.start)
                    label = (root_transitive.s, root_transitive.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    new_item.node.add_path(root_transitive, item.node)
                    if new_item.expect in self.TERMINALS:
                        # Add (B :: aC.B, h, y) to Q
                        to_scan.add(new_item)
                    elif new_item not in column:
                        # Add (B :: aC.B, h, y) to Ei and R
                        column.add(new_item)
                        items.append(new_item)
                ###R Regular Earley completer
                else:
                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E.Scott's paper.
                    is_empty_item = item.start == i
                    if is_empty_item:
                        held_completions[item.rule.origin] = item.node

                    originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                    for originator in originators:
                        new_item = originator.advance()
                        label = (new_item.s, originator.start, i)
                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                        new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
                        if new_item.expect in self.TERMINALS:
                            # Add (B :: aC.B, h, y) to Q
                            to_scan.add(new_item)
                        elif new_item not in column:
                            # Add (B :: aC.B, h, y) to Ei and R
                            column.add(new_item)
                            items.append(new_item)

            ### The Earley predictor
            elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                new_items = []
                for rule in self.predictions[item.expect]:
                    new_item = Item(rule, 0, i)
                    new_items.append(new_item)

                # Process any held completions (H).
                if item.expect in held_completions:
                    new_item = item.advance()
                    label = (new_item.s, item.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                    new_items.append(new_item)

                for new_item in new_items:
                    if new_item.expect in self.TERMINALS:
                        to_scan.add(new_item)
                    elif new_item not in column:
                        column.add(new_item)
                        items.append(new_item)

    def _parse(self, lexer, columns, to_scan, start_symbol=None):
        def is_quasi_complete(item):
            if item.is_complete:
                return True

            quasi = item.advance()
            while not quasi.is_complete:
                if quasi.expect not in self.NULLABLE:
                    return False
                if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
                    return False
                quasi = quasi.advance()
            return True
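
        # Descriptive note: an item is "quasi-complete" when every symbol left to
        # match is nullable, i.e. it could complete without consuming more input.
        # This predicate is part of the Joop Leo right-recursion machinery (see the
        # removed create_leo_transitives reference below).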

        # def create_leo_transitives(origin, start):
        #    ...   # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420

        def scan(i, token, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
            Lark lexer to match tokens. The scan list is built by the
            Earley predictor, based on the previously completed tokens.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""
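            # Orientation note (assumption about the callable's contract): `match`
            # below is the term_matcher passed to Parser.__init__; it answers
            # "does this token satisfy this terminal?", roughly equivalent to
            # item.expect.name == token.type for tokens from a standard lexer.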
            next_to_scan = set()
            next_set = set()
            columns.append(next_set)
            transitives.append({})
            node_cache = {}

            for item in set(to_scan):
                if match(item.expect, token):
                    new_item = item.advance()
                    label = (new_item.s, new_item.start, i)
                    # 'terminals' may not contain token.type when using %declare
                    # Additionally, token is not always a Token
                    # For example, it can be a Tree when using TreeMatcher
                    term = terminals.get(token.type) if isinstance(token, Token) else None
                    # Set the priority of the token node to 0 so that the
                    # terminal priorities do not affect the Tree chosen by
                    # ForestSumVisitor after the basic lexer has already
                    # "used up" the terminal priorities
                    token_node = TokenNode(token, term, priority=0)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)

                    if new_item.expect in self.TERMINALS:
                        # add (B ::= Aai+1.B, h, y) to Q'
                        next_to_scan.add(new_item)
                    else:
                        # add (B ::= Aa+1.B, h, y) to Ei+1
                        next_set.add(new_item)

            if not next_set and not next_to_scan:
                expect = {i.expect.name for i in to_scan}
                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan

        # Define parser functions
        match = self.term_matcher

        terminals = self.lexer_conf.terminals_by_name

        # Cache for nodes & tokens created in a particular parse step.
        transitives = [{}]

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
        expects = {i.expect for i in to_scan}
        i = 0
        for token in lexer.lex(expects):
            self.predict_and_complete(i, to_scan, columns, transitives)

            to_scan = scan(i, token, to_scan)
            i += 1

            expects.clear()
            expects |= {i.expect for i in to_scan}

        self.predict_and_complete(i, to_scan, columns, transitives)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan
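
    # Worked trace (illustrative, not from the original source): for a 3-token
    # input the loop above builds 4 Earley sets. columns[0] holds the start
    # predictions, columns[k] holds items advanced over token k-1, and the final
    # predict_and_complete call closes out the last column so that parse() below
    # can find a completed start item there.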

    def parse(self, lexer, start):
        assert start, start
        start_symbol = NonTerminal(start)

        columns = [set()]
        to_scan = set()     # The scan buffer. 'Q' in E.Scott's paper.

        ## Predict for the start_symbol.
        # Add predicted items to the first Earley set (for the predictor) if they
        # result in a non-terminal, or the scanner if they result in a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, 0)
            if item.expect in self.TERMINALS:
                to_scan.add(item)
            else:
                columns[0].add(item)

        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

        # If the parse was successful, the start symbol should have been completed
        # in the last step of the Earley cycle, and will be in this column.
        # Find the item for the start_symbol, which is the root of the SPPF tree.
        solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
        if not solutions:
            expected_terminals = [t.expect.name for t in to_scan]
            raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
            try:
                debug_walker = ForestToPyDotVisitor()
            except ImportError:
                logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
            else:
                debug_walker.visit(solutions[0], "sppf.png")

        if len(solutions) > 1:
            assert False, 'Earley should not generate multiple start symbol items!'

        if self.tree_class is not None:
            # Perform our SPPF -> AST conversion
            transformer = ForestToParseTree(self.tree_class, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity)
            return transformer.transform(solutions[0])

        # return the root of the SPPF
        return solutions[0]
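
# Sketch of how ambiguity handling surfaces to users (assumption: illustrative,
# not part of this module). With resolve_ambiguity=True, the default reached via
# Lark(..., ambiguity='resolve'), ForestToParseTree picks a single parse tree;
# with Lark(..., ambiguity='explicit') ambiguous regions are kept as '_ambig'
# nodes in the returned tree instead.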