1"Provides for superficial grammar analysis."
2
3from collections import Counter, defaultdict
4from typing import List, Dict, Iterator, FrozenSet, Set
5
6from ..utils import bfs, fzset, classify, OrderedSet
7from ..exceptions import GrammarError
8from ..grammar import Rule, Terminal, NonTerminal, Symbol
9from ..common import ParserConf
10
11
class RulePtr:
    """A dotted position inside a rule's expansion (an LR "item")."""

    __slots__ = ('rule', 'index')
    rule: Rule
    index: int

    def __init__(self, rule: Rule, index: int):
        assert isinstance(rule, Rule)
        assert index <= len(rule.expansion)
        self.rule = rule
        self.index = index

    def __repr__(self):
        # Render the expansion with a '*' marking the dot position.
        seen = ' '.join(sym.name for sym in self.rule.expansion[:self.index])
        rest = ' '.join(sym.name for sym in self.rule.expansion[self.index:])
        return '<%s : %s * %s>' % (self.rule.origin.name, seen, rest)

    @property
    def next(self) -> Symbol:
        # The symbol immediately after the dot.
        return self.rule.expansion[self.index]

    def advance(self, sym: Symbol) -> 'RulePtr':
        # Move the dot one step past *sym*; returns a fresh pointer.
        assert self.next == sym
        return RulePtr(self.rule, self.index + 1)

    @property
    def is_satisfied(self) -> bool:
        # True once the dot has reached the end of the expansion.
        return len(self.rule.expansion) == self.index

    def __eq__(self, other) -> bool:
        if not isinstance(other, RulePtr):
            return NotImplemented
        return (self.rule, self.index) == (other.rule, other.index)

    def __hash__(self) -> int:
        return hash((self.rule, self.index))
47
48
State = FrozenSet[RulePtr]

# state generation ensures no duplicate LR0ItemSets
class LR0ItemSet:
    """An LR0 item set: kernel items plus their closure, with outgoing
    transitions and per-symbol lookahead rules filled in later."""

    __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')

    kernel: State
    closure: State
    transitions: Dict[Symbol, 'LR0ItemSet']
    lookaheads: Dict[Symbol, Set[Rule]]

    def __init__(self, kernel, closure):
        self.kernel = fzset(kernel)
        self.closure = fzset(closure)
        self.transitions = {}
        self.lookaheads = defaultdict(set)

    def __repr__(self):
        kernel_part = ', '.join(repr(ptr) for ptr in self.kernel)
        closure_part = ', '.join(repr(ptr) for ptr in self.closure)
        return '{%s | %s}' % (kernel_part, closure_part)
68
69
def update_set(set1, set2):
    """Merge *set2* into *set1* in place.

    Returns True if *set1* gained new elements, False otherwise.
    """
    # A single subset test covers all the no-change cases (empty set2,
    # set2 already contained in set1, equal sets) without copying set1.
    if set2 <= set1:
        return False

    set1 |= set2
    return True
77
def calculate_sets(rules):
    """Compute the NULLABLE, FIRST and FOLLOW sets of a grammar.

    Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets

    Returns a tuple (FIRST, FOLLOW, NULLABLE): FIRST and FOLLOW map every
    grammar symbol to a set of symbols, NULLABLE is the set of symbols that
    can derive the empty string.

    Updates are performed as in-place set differences instead of going through
    ``update_set`` — this avoids copying the destination set on every
    potentially-changing update inside the fixpoint loops.
    """
    symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}

    # Reference pseudocode:
    # foreach grammar rule X ::= Y(1) ... Y(k)
    #     if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
    #         NULLABLE = NULLABLE union {X}
    #     for i = 1 to k
    #         if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
    #             FIRST(X) = FIRST(X) union FIRST(Y(i))
    #         for j = i+1 to k
    #             if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
    #                 FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
    #             if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
    #                 FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
    # until none of NULLABLE,FIRST,FOLLOW changed in last iteration

    NULLABLE = set()
    # FIRST of a terminal is the terminal itself; non-terminals start empty.
    FIRST = {sym: ({sym} if sym.is_term else set()) for sym in symbols}
    FOLLOW = {sym: set() for sym in symbols}

    # Fixpoint iteration: NULLABLE and FIRST.
    changed = True
    while changed:
        changed = False

        for rule in rules:
            # The origin is nullable when its whole expansion is nullable
            # (vacuously true for an empty expansion).
            if set(rule.expansion) <= NULLABLE and rule.origin not in NULLABLE:
                NULLABLE.add(rule.origin)
                changed = True

            for i, sym in enumerate(rule.expansion):
                if not set(rule.expansion[:i]) <= NULLABLE:
                    # A non-nullable prefix blocks every later symbol too.
                    break
                new = FIRST[sym] - FIRST[rule.origin]
                if new:
                    FIRST[rule.origin] |= new
                    changed = True

    # Fixpoint iteration: FOLLOW.
    changed = True
    while changed:
        changed = False

        for rule in rules:
            for i, sym in enumerate(rule.expansion):
                # Everything after sym can vanish -> FOLLOW(origin) flows into FOLLOW(sym).
                if i == len(rule.expansion) - 1 or set(rule.expansion[i+1:]) <= NULLABLE:
                    new = FOLLOW[rule.origin] - FOLLOW[sym]
                    if new:
                        FOLLOW[sym] |= new
                        changed = True

                for j in range(i + 1, len(rule.expansion)):
                    # Symbols between sym and Y(j) can vanish -> FIRST(Y(j)) flows into FOLLOW(sym).
                    if set(rule.expansion[i+1:j]) <= NULLABLE:
                        new = FIRST[rule.expansion[j]] - FOLLOW[sym]
                        if new:
                            FOLLOW[sym] |= new
                            changed = True

    return FIRST, FOLLOW, NULLABLE
138
139
class GrammarAnalyzer:
    """Validates a grammar and precomputes per-start-symbol item states plus
    the FIRST/FOLLOW/NULLABLE sets of the augmented rules."""

    def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
        """Build the analysis tables for the rules in *parser_conf*.

        Raises GrammarError when a rule is defined twice, or when an expansion
        references a non-terminal that has no definition.
        """
        self.debug = debug
        self.strict = strict

        # Augment the grammar: one synthetic '$root_<start>' rule per start
        # symbol, with an explicit '$END' terminal after the start symbol.
        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
                      for start in parser_conf.start}

        rules = parser_conf.rules + list(root_rules.values())
        self.rules_by_origin: Dict[NonTerminal, List[Rule]] = classify(rules, lambda r: r.origin)

        if len(rules) != len(set(rules)):
            duplicates = [item for item, count in Counter(rules).items() if count > 1]
            raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))

        # Every non-terminal used in an expansion must be defined somewhere.
        for r in rules:
            for sym in r.expansion:
                if not (sym.is_term or sym in self.rules_by_origin):
                    raise GrammarError("Using an undefined rule: %s" % sym)

        # Initial item state per start symbol: the closure of its root rule.
        self.start_states = {start: self.expand_rule(root_rule.origin)
                             for start, root_rule in root_rules.items()}

        # Accepting item per start symbol: the root rule fully advanced.
        self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
                           for start, root_rule in root_rules.items()}

        # LR0 variant of the root rules — same origins, but without '$END'.
        lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
                          for start in parser_conf.start}

        lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
        assert(len(lr0_rules) == len(set(lr0_rules)))

        self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)

        # cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
        self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
                                 for start, root_rule in lr0_root_rules.items()}

        # FIRST/FOLLOW/NULLABLE are computed over the augmented ($END) rules.
        self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)

    def expand_rule(self, source_rule: NonTerminal, rules_by_origin=None) -> OrderedSet[RulePtr]:
        """Returns all init_ptrs accessible by rule (recursive).

        I.e. the closure of *source_rule*: a RulePtr at index 0 for every rule
        reachable by repeatedly expanding the first symbol of an expansion when
        it is a non-terminal.  Uses ``self.rules_by_origin`` unless an
        alternative *rules_by_origin* mapping is supplied.
        """

        if rules_by_origin is None:
            rules_by_origin = self.rules_by_origin

        init_ptrs = OrderedSet[RulePtr]()
        def _expand_rule(rule: NonTerminal) -> Iterator[NonTerminal]:
            assert not rule.is_term, rule

            for r in rules_by_origin[rule]:
                init_ptr = RulePtr(r, 0)
                init_ptrs.add(init_ptr)

                if r.expansion: # if not empty rule
                    new_r = init_ptr.next
                    if not new_r.is_term:
                        assert isinstance(new_r, NonTerminal)
                        yield new_r

        # The BFS drives _expand_rule over every reachable non-terminal;
        # init_ptrs is filled as a side effect, the yielded values are unused.
        for _ in bfs([source_rule], _expand_rule):
            pass

        return init_ptrs