1"""This module implements an Earley parser. 

2 

3The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: 

4 https://www.sciencedirect.com/science/article/pii/S1571066108001497 

5 

6That is probably the best reference for understanding the algorithm here. 

7 

8The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format 

9is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html 

10""" 

11 
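# A minimal usage sketch (hedged: the grammar below is hypothetical, and this
# parser is normally reached through the public Lark API with parser="earley"
# rather than instantiated directly):
#
#     from lark import Lark
#     parser = Lark('start: "a" start | "a"', parser="earley")
#     tree = parser.parse("aaa")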

import typing

from collections import deque

from ..lexer import Token
from ..tree import Tree
from ..exceptions import UnexpectedEOF, UnexpectedToken
from ..utils import logger
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item
from .earley_forest import ForestSumVisitor, SymbolNode, TokenNode, ForestToParseTree

if typing.TYPE_CHECKING:
    from ..common import LexerConf, ParserConf

class Parser:
    lexer_conf: 'LexerConf'
    parser_conf: 'ParserConf'
    debug: bool

    def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher, resolve_ambiguity=True, debug=False, tree_class=Tree):
        analysis = GrammarAnalyzer(parser_conf)
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.debug = debug
        self.tree_class = tree_class

        self.FIRST = analysis.FIRST
        self.NULLABLE = analysis.NULLABLE
        self.callbacks = parser_conf.callbacks
        # TODO add typing info
        self.predictions = {}   # type: ignore[var-annotated]

        ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
        #  the slow 'isupper' in is_terminal.
        self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
        self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

        self.forest_sum_visitor = None
        for rule in parser_conf.rules:
            if rule.origin not in self.predictions:
                self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
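                # Illustrative note (hypothetical grammar): given
                #     start: expr
                #     expr: NUMBER | expr "+" NUMBER
                # expand_rule computes the prediction closure, so
                # self.predictions[NonTerminal('start')] holds the 'start' rule
                # plus both 'expr' alternatives, sparing the predictor a
                # recursive expansion at parse time.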

            ## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
            #  the priorities will be stripped from all rules/terminals before they reach us, allowing us to
            #  skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
            #  on any rules/terminals.
            if self.forest_sum_visitor is None and rule.options.priority is not None:
                self.forest_sum_visitor = ForestSumVisitor

        # Check terminals for priorities
        # Ignore terminal priorities if the basic lexer is used
        if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
            for term in self.lexer_conf.terminals:
                if term.priority:
                    self.forest_sum_visitor = ForestSumVisitor
                    break

        self.term_matcher = term_matcher


    def predict_and_complete(self, i, to_scan, columns, transitives):
        """The core Earley Predictor and Completer.

        At each stage of the input, we handle any completed items (things
        that matched on the last cycle) and use those to predict what should
        come next in the input stream. The completions and any predicted
        non-terminals are recursively processed until we reach a set of items
        that can be added to the scan list for the next scanner cycle."""
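        # Worked micro-example (hypothetical grammar  start: x,  x: "a"):
        # at column 0 the predictor expands 'start: . x' into 'x: . "a"',
        # which goes onto the scan list; once "a" is scanned, the completed
        # 'x: "a" .' item lets the completer advance 'start: . x' to
        # 'start: x .', and both derivation steps are recorded in the SPPF.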

        # Held Completions (H in E. Scott's paper).
        node_cache = {}
        held_completions = {}

        column = columns[i]
        # R (items) = Ei (column.items)
        items = deque(column)
        while items:
            item = items.pop()    # remove an element, A say, from R

            ### The Earley completer
            if item.is_complete:   ### (item.s == string)
                if item.node is None:
                    label = (item.s, item.start, i)
                    item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    item.node.add_family(item.s, item.rule, item.start, None, None)

                # create_leo_transitives(item.rule.origin, item.start)

                ###R Joop Leo right recursion Completer
                if item.rule.origin in transitives[item.start]:
                    transitive = transitives[item.start][item.s]
                    if transitive.previous in transitives[transitive.column]:
                        root_transitive = transitives[transitive.column][transitive.previous]
                    else:
                        root_transitive = transitive

                    new_item = Item(transitive.rule, transitive.ptr, transitive.start)
                    label = (root_transitive.s, root_transitive.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    new_item.node.add_path(root_transitive, item.node)
                    if new_item.expect in self.TERMINALS:
                        # Add (B :: aC.B, h, y) to Q
                        to_scan.add(new_item)
                    elif new_item not in column:
                        # Add (B :: aC.B, h, y) to Ei and R
                        column.add(new_item)
                        items.append(new_item)
                ###R Regular Earley completer
                else:
                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E. Scott's paper.
                    is_empty_item = item.start == i
                    if is_empty_item:
                        held_completions[item.rule.origin] = item.node

                    originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                    for originator in originators:
                        new_item = originator.advance()
                        label = (new_item.s, originator.start, i)
                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                        new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
                        if new_item.expect in self.TERMINALS:
                            # Add (B :: aC.B, h, y) to Q
                            to_scan.add(new_item)
                        elif new_item not in column:
                            # Add (B :: aC.B, h, y) to Ei and R
                            column.add(new_item)
                            items.append(new_item)

            ### The Earley predictor
            elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                new_items = []
                for rule in self.predictions[item.expect]:
                    new_item = Item(rule, 0, i)
                    new_items.append(new_item)

                # Process any held completions (H).
                if item.expect in held_completions:
                    new_item = item.advance()
                    label = (new_item.s, item.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                    new_items.append(new_item)

                for new_item in new_items:
                    if new_item.expect in self.TERMINALS:
                        to_scan.add(new_item)
                    elif new_item not in column:
                        column.add(new_item)
                        items.append(new_item)

    def _parse(self, lexer, columns, to_scan, start_symbol=None):
        def is_quasi_complete(item):
            if item.is_complete:
                return True

            quasi = item.advance()
            while not quasi.is_complete:
                if quasi.expect not in self.NULLABLE:
                    return False
                if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
                    return False
                quasi = quasi.advance()
            return True
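
        # Illustrative note (hypothetical rule  x: a b c  where b and c are
        # nullable): 'x: a . b c' is quasi-complete under the check above,
        # since every remaining symbol can derive the empty string.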

        # def create_leo_transitives(origin, start):
        #    ...   # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420

        def scan(i, token, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
            Lark lexer to match tokens. The scan list is built by the
            Earley predictor, based on the previously completed tokens.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""
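            # Hedged note: `match` is bound later in _parse to self.term_matcher;
            # it should return True when the expected terminal accepts the token
            # (the default Earley front-end roughly checks term.name == token.type).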

            next_to_scan = set()
            next_set = set()
            columns.append(next_set)
            transitives.append({})
            node_cache = {}

            for item in set(to_scan):
                if match(item.expect, token):
                    new_item = item.advance()
                    label = (new_item.s, new_item.start, i)
                    # 'terminals' may not contain token.type when using %declare
                    # Additionally, token is not always a Token
                    # For example, it can be a Tree when using TreeMatcher
                    term = terminals.get(token.type) if isinstance(token, Token) else None
                    # Set the priority of the token node to 0 so that the
                    # terminal priorities do not affect the Tree chosen by
                    # ForestSumVisitor after the basic lexer has already
                    # "used up" the terminal priorities
                    token_node = TokenNode(token, term, priority=0)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)

                    if new_item.expect in self.TERMINALS:
                        # add (B ::= Aai+1.B, h, y) to Q'
                        next_to_scan.add(new_item)
                    else:
                        # add (B ::= Aa+1.B, h, y) to Ei+1
                        next_set.add(new_item)

            if not next_set and not next_to_scan:
                expect = {i.expect.name for i in to_scan}
                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan


        # Define parser functions
        match = self.term_matcher

        terminals = self.lexer_conf.terminals_by_name

        # Cache for nodes & tokens created in a particular parse step.
        transitives = [{}]

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
        expects = {i.expect for i in to_scan}
        i = 0
        for token in lexer.lex(expects):
            self.predict_and_complete(i, to_scan, columns, transitives)

            to_scan = scan(i, token, to_scan)
            i += 1

            expects.clear()
            expects |= {i.expect for i in to_scan}

        self.predict_and_complete(i, to_scan, columns, transitives)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan

    def parse(self, lexer, start):
        assert start, start
        start_symbol = NonTerminal(start)

        columns = [set()]
        to_scan = set()     # The scan buffer. 'Q' in E. Scott's paper.

        ## Predict for the start_symbol.
        # Add predicted items to the first Earley set (for the predictor) if they
        # result in a non-terminal, or the scanner if they result in a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, 0)
            if item.expect in self.TERMINALS:
                to_scan.add(item)
            else:
                columns[0].add(item)
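
        # Sketch of the result (hypothetical grammar  start: "a" start | "a"):
        # the initial items 'start: . "a" start' and 'start: . "a"' both expect
        # a terminal, so both land in the scan buffer Q rather than in column 0.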

        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

        # If the parse was successful, the start
        # symbol should have been completed in the last step of the Earley cycle, and will be in
        # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
        solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
        if not solutions:
            expected_terminals = [t.expect.name for t in to_scan]
            raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
            try:
                debug_walker = ForestToPyDotVisitor()
            except ImportError:
                logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
            else:
                debug_walker.visit(solutions[0], "sppf.png")

        if len(solutions) > 1:
            assert False, 'Earley should not generate multiple start symbol items!'

        if self.tree_class is not None:
            # Perform our SPPF -> AST conversion
            transformer = ForestToParseTree(self.tree_class, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity)
            return transformer.transform(solutions[0])

        # return the root of the SPPF
        return solutions[0]
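
# A hedged end-to-end sketch (hypothetical, deliberately ambiguous grammar)
# showing how the SPPF's ambiguity surfaces through the public API:
#
#     from lark import Lark
#     parser = Lark('''
#         start: ab | a b
#         ab: "ab"
#         a: "a"
#         b: "b"
#     ''', parser="earley", ambiguity="explicit")
#     print(parser.parse("ab").pretty())   # root is an _ambig node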