Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/lark/parsers/earley.py: 60%

1"""This module implements an Earley parser. 

2 

3The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: 

4 https://www.sciencedirect.com/science/article/pii/S1571066108001497 

5 

6That is probably the best reference for understanding the algorithm here. 

7 

8The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format 

9is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html 
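
For orientation, a minimal usage sketch through the public API (this parser is
selected when constructing ``Lark(..., parser='earley')``; the grammar below is
illustrative, not part of this module):

    from lark import Lark

    parser = Lark('start: "a"+', parser='earley')
    tree = parser.parse('aaa')   # a Tree built from the resolved SPPF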

10""" 

from typing import TYPE_CHECKING, Callable, Optional, List, Any
from collections import deque

from ..lexer import Token
from ..tree import Tree
from ..exceptions import UnexpectedEOF, UnexpectedToken
from ..utils import logger, OrderedSet
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item
from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree

if TYPE_CHECKING:
    from ..common import LexerConf, ParserConf

class Parser:
    lexer_conf: 'LexerConf'
    parser_conf: 'ParserConf'
    debug: bool

    def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
                 resolve_ambiguity: bool=True, debug: bool=False,
                 tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
        analysis = GrammarAnalyzer(parser_conf)
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.debug = debug
        self.Tree = tree_class
        self.Set = OrderedSet if ordered_sets else set
        self.SymbolNode = StableSymbolNode if ordered_sets else SymbolNode

        self.FIRST = analysis.FIRST
        self.NULLABLE = analysis.NULLABLE
        self.callbacks = parser_conf.callbacks
        # TODO add typing info
        self.predictions = {}   # type: ignore[var-annotated]

        ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
        #  the slow 'isupper' in is_terminal.
        self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
        self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

        self.forest_sum_visitor = None
        for rule in parser_conf.rules:
            if rule.origin not in self.predictions:
                self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

            ## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
            #  the priorities will be stripped from all rules/terminals before they reach us, allowing us to
            #  skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
            #  on any rules/terminals.
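            # (Priorities come from the grammar, e.g. a rule priority is written
            #  "my_rule.2: ..." and a terminal priority "TERM.2: ..."; the names
            #  here are illustrative.)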

            if self.forest_sum_visitor is None and rule.options.priority is not None:
                self.forest_sum_visitor = ForestSumVisitor

        # Check terminals for priorities
        # Ignore terminal priorities if the basic lexer is used
        if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
            for term in self.lexer_conf.terminals:
                if term.priority:
                    self.forest_sum_visitor = ForestSumVisitor
                    break

        self.term_matcher = term_matcher


    def predict_and_complete(self, i, to_scan, columns, transitives):
        """The core Earley Predictor and Completer.

        At each stage of the input, we handle any completed items (things
        that matched on the last cycle) and use those to predict what should
        come next in the input stream. The completions and any predicted
        non-terminals are recursively processed until we reach a set of items
        expecting terminals, which can be added to the scan list for the next
        scanner cycle."""
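        # Illustrative walk-through on a hypothetical grammar (not part of this
        # module): given  start: "a" start | "a" , completing (start -> "a" ., k)
        # in column i advances every item in column k expecting `start`, e.g.
        # (start -> "a" . start, h) becomes (start -> "a" start ., h) and is
        # re-queued here; predicting `start` adds fresh items
        # (start -> . "a" start, i) and (start -> . "a", i), whose expected
        # terminal "a" routes them to the scan list for the next token.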

        # Held Completions (H in E. Scott's paper).
        node_cache = {}
        held_completions = {}

        column = columns[i]
        # R (items) = Ei (column.items)
        items = deque(column)
        while items:
            item = items.pop()    # remove an element, A say, from R

            ### The Earley completer
            if item.is_complete:   ### (item.s == string)
                if item.node is None:
                    label = (item.s, item.start, i)
                    item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
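                    # (SPPF labels are (symbol, start, end) triples; the
                    #  conditional-get above avoids constructing a SymbolNode
                    #  when the label is already cached. Same idiom throughout.)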

                    item.node.add_family(item.s, item.rule, item.start, None, None)

                # create_leo_transitives(item.rule.origin, item.start)

                ###R Joop Leo right recursion Completer
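                # (Leo's optimization: chains of right-recursive completions are
                #  collapsed through memoized "transitive" items, keeping right
                #  recursion linear rather than quadratic in the input length.)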

                if item.rule.origin in transitives[item.start]:
                    transitive = transitives[item.start][item.s]
                    if transitive.previous in transitives[transitive.column]:
                        root_transitive = transitives[transitive.column][transitive.previous]
                    else:
                        root_transitive = transitive

                    new_item = Item(transitive.rule, transitive.ptr, transitive.start)
                    label = (root_transitive.s, root_transitive.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_path(root_transitive, item.node)
                    if new_item.expect in self.TERMINALS:
                        # Add (B ::= aC.B, h, y) to Q
                        to_scan.add(new_item)
                    elif new_item not in column:
                        # Add (B ::= aC.B, h, y) to Ei and R
                        column.add(new_item)
                        items.append(new_item)

                ###R Regular Earley completer
                else:
                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E. Scott's paper.
                    is_empty_item = item.start == i
                    if is_empty_item:
                        held_completions[item.rule.origin] = item.node
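                        # (e.g. for a hypothetical empty rule  b: , completing
                        #  `b` at column i records H[b], so a later prediction of
                        #  `b` in this same column can complete immediately.)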

                    originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                    for originator in originators:
                        new_item = originator.advance()
                        label = (new_item.s, originator.start, i)
                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                        new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
                        if new_item.expect in self.TERMINALS:
                            # Add (B ::= aC.B, h, y) to Q
                            to_scan.add(new_item)
                        elif new_item not in column:
                            # Add (B ::= aC.B, h, y) to Ei and R
                            column.add(new_item)
                            items.append(new_item)

            ### The Earley predictor
            elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                new_items = []
                for rule in self.predictions[item.expect]:
                    new_item = Item(rule, 0, i)
                    new_items.append(new_item)

                # Process any held completions (H).
                if item.expect in held_completions:
                    new_item = item.advance()
                    label = (new_item.s, item.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                    new_items.append(new_item)

                for new_item in new_items:
                    if new_item.expect in self.TERMINALS:
                        to_scan.add(new_item)
                    elif new_item not in column:
                        column.add(new_item)
                        items.append(new_item)

    def _parse(self, lexer, columns, to_scan, start_symbol=None):
        def is_quasi_complete(item):
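            # An item is "quasi-complete" when every symbol left after the dot
            # is NULLABLE, i.e. it could complete without consuming more input.
            # (Part of the Leo right-recursion machinery; see the removed
            # create_leo_transitives below.)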

            if item.is_complete:
                return True

            quasi = item.advance()
            while not quasi.is_complete:
                if quasi.expect not in self.NULLABLE:
                    return False
                if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
                    return False
                quasi = quasi.advance()
            return True

        # def create_leo_transitives(origin, start):
        #     ...   # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420

        def scan(i, token, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
            Lark lexer to match tokens. The scan list is built by the
            Earley predictor, based on the previously completed tokens.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""
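            # Sketch of the flow below: for each item in Q whose expected
            # terminal matches `token`, advance the dot, attach a TokenNode to
            # the SPPF, then route the new item to Q' (next symbol is a
            # terminal) or to the next Earley set (next symbol is a
            # non-terminal).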

            next_to_scan = self.Set()
            next_set = self.Set()
            columns.append(next_set)
            transitives.append({})
            node_cache = {}

            for item in self.Set(to_scan):
                if match(item.expect, token):
                    new_item = item.advance()
                    label = (new_item.s, new_item.start, i)
                    # 'terminals' may not contain token.type when using %declare
                    # Additionally, token is not always a Token
                    # For example, it can be a Tree when using TreeMatcher
                    term = terminals.get(token.type) if isinstance(token, Token) else None
                    # Set the priority of the token node to 0 so that the
                    # terminal priorities do not affect the Tree chosen by
                    # ForestSumVisitor after the basic lexer has already
                    # "used up" the terminal priorities
                    token_node = TokenNode(token, term, priority=0)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)

                    if new_item.expect in self.TERMINALS:
                        # add (B ::= Aai+1.B, h, y) to Q'
                        next_to_scan.add(new_item)
                    else:
                        # add (B ::= Aai+1.B, h, y) to Ei+1
                        next_set.add(new_item)

            if not next_set and not next_to_scan:
                expect = {i.expect.name for i in to_scan}
                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

            return next_to_scan


        # Define parser functions
        match = self.term_matcher

        terminals = self.lexer_conf.terminals_by_name

        # Per-column cache of transitive items for Leo's right-recursion
        # optimization, created fresh for each parse step.
        transitives = [{}]

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.

        expects = {i.expect for i in to_scan}
        i = 0
        for token in lexer.lex(expects):
            self.predict_and_complete(i, to_scan, columns, transitives)

            to_scan = scan(i, token, to_scan)
            i += 1

            expects.clear()
            expects |= {i.expect for i in to_scan}

        self.predict_and_complete(i, to_scan, columns, transitives)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan

    def parse(self, lexer, start):
        assert start, start
        start_symbol = NonTerminal(start)

        columns = [self.Set()]
        to_scan = self.Set()     # The scan buffer. 'Q' in E. Scott's paper.

        ## Predict for the start_symbol.
        # Add predicted items to the first Earley set (for the predictor) if they
        # result in a non-terminal, or the scanner if they result in a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, 0)
            if item.expect in self.TERMINALS:
                to_scan.add(item)
            else:
                columns[0].add(item)

        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

        # If the parse was successful, the start symbol should have been completed
        # in the last step of the Earley cycle, and will be in this column.
        # Find the item for the start_symbol, which is the root of the SPPF tree.
        solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
        if not solutions:
            expected_terminals = [t.expect.name for t in to_scan]
            raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
            try:
                debug_walker = ForestToPyDotVisitor()
            except ImportError:
                logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
            else:
                debug_walker.visit(solutions[0], "sppf.png")


        if len(solutions) > 1:
            assert False, 'Earley should not generate multiple start symbol items!'

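        # With a tree_class set, the SPPF is converted to a parse tree below;
        # when resolve_ambiguity is False the result may contain _ambig nodes.
        # With tree_class=None (e.g. ambiguity='forest' in the public API) the
        # raw SPPF root is returned for callers to walk themselves.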

        if self.Tree is not None:
            # Perform our SPPF -> AST conversion
            transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity)
            return transformer.transform(solutions[0])

        # return the root of the SPPF
        return solutions[0]