# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
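"""Parser generator used by blib2to3.

Reads a grammar file (one EBNF-style rule per line), builds an NFA per
rule, converts each NFA to a DFA, computes FIRST sets, and packs the
result into the table-driven Grammar object consumed by the pgen2
parse driver.
"""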

import os
from collections.abc import Iterator, Sequence
from typing import IO, Any, NoReturn, Optional, Union

from blib2to3.pgen2 import grammar, token, tokenize
from blib2to3.pgen2.tokenize import TokenInfo

Path = Union[str, "os.PathLike[str]"]


class PgenGrammar(grammar.Grammar):
    pass


class ParserGenerator:
    filename: Path
    stream: IO[str]
    generator: Iterator[TokenInfo]
    first: dict[str, Optional[dict[str, int]]]

    def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
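        """Tokenize the grammar from *stream* (or open *filename*), parse
        it into one DFA per rule, and precompute all FIRST sets."""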
        close_stream = None
        if stream is None:
            stream = open(filename, encoding="utf-8")
            close_stream = stream.close
        self.filename = filename
        self.generator = tokenize.tokenize(stream.read())
        self.gettoken()  # Initialize lookahead
        self.dfas, self.startsymbol = self.parse()
        if close_stream is not None:
            close_stream()
        self.first = {}  # map from symbol name to set of tokens
        self.addfirstsets()

    def make_grammar(self) -> PgenGrammar:
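        """Pack the parsed DFAs into a PgenGrammar.

        Nonterminals are numbered upward from 256 (token numbers stay
        below 256), with the start symbol first. Each DFA state becomes
        a list of (label, next_state_index) arcs; an extra (0, own_index)
        arc marks the state as accepting.
        """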
        c = PgenGrammar()
        names = list(self.dfas.keys())
        names.sort()
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[name] = i
            c.number2symbol[i] = name
        for name in names:
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    arcs.append((self.make_label(c, label), dfa.index(next)))
                if state.isfinal:
                    arcs.append((0, dfa.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
        c.start = c.symbol2number[self.startsymbol]
        return c

    def make_first(self, c: PgenGrammar, name: str) -> dict[int, int]:
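        """Return the FIRST set of *name* as a dict {ilabel: 1}."""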
        rawfirst = self.first[name]
        assert rawfirst is not None
        first = {}
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            # assert ilabel not in first  # XXX failed on <> ... !=
            first[ilabel] = 1
        return first

    def make_label(self, c: PgenGrammar, label: str) -> int:
        # XXX Maybe this should be a method on a subclass of converter?
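        # A grammar label is one of four things, distinguished below: a
        # nonterminal (symbol name), a named token type (e.g. NAME), a
        # quoted keyword ('if', or "match" for a soft keyword), or a
        # quoted operator ('+', '==', ...). Each kind gets one slot in
        # c.labels; a repeated label reuses its existing slot.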
        ilabel = len(c.labels)
        if label[0].isalpha():
            # Either a symbol name or a named token
            if label in c.symbol2number:
                # A symbol name (a non-terminal)
                if label in c.symbol2label:
                    return c.symbol2label[label]
                else:
                    c.labels.append((c.symbol2number[label], None))
                    c.symbol2label[label] = ilabel
                    return ilabel
            else:
                # A named token (NAME, NUMBER, STRING)
                itoken = getattr(token, label, None)
                assert isinstance(itoken, int), label
                assert itoken in token.tok_name, label
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
        else:
            # Either a keyword or an operator
            assert label[0] in ('"', "'"), label
            value = eval(label)  # strip the quotes from the label
            if value[0].isalpha():
                if label[0] == '"':
                    keywords = c.soft_keywords
                else:
                    keywords = c.keywords

                # A keyword
                if value in keywords:
                    return keywords[value]
                else:
                    c.labels.append((token.NAME, value))
                    keywords[value] = ilabel
                    return ilabel
            else:
                # An operator (any non-numeric token)
                itoken = grammar.opmap[value]  # Fails if unknown token
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel

    def addfirstsets(self) -> None:
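        """Compute the FIRST set of every rule, in sorted name order."""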
        names = list(self.dfas.keys())
        names.sort()
        for name in names:
            if name not in self.first:
                self.calcfirst(name)
            # print(name, self.first[name].keys())

    def calcfirst(self, name: str) -> None:
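        """Compute the FIRST set of rule *name*, recursing as needed.

        Terminal labels are added directly; a nonterminal contributes
        its own FIRST set. The None placeholder stored up front detects
        left recursion, and the inverse map below rejects rules whose
        alternatives have overlapping FIRST sets: the parser must be
        able to commit to an alternative with one token of lookahead.
        """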
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        state = dfa[0]
        totalset: dict[str, int] = {}
        overlapcheck = {}
        for label in state.arcs:
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError(f"recursion for rule {name!r}")
                else:
                    self.calcfirst(label)
                    fset = self.first[label]
                    assert fset is not None
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset[label] = 1
                overlapcheck[label] = {label: 1}
        inverse: dict[str, str] = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError(
                        f"rule {name} is ambiguous; {symbol} is in the first sets of"
                        f" {label} as well as {inverse[symbol]}"
                    )
                inverse[symbol] = label
        self.first[name] = totalset

    def parse(self) -> tuple[dict[str, list["DFAState"]], str]:
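        """Parse the metagrammar and return ({rule_name: dfa_states}, start).

        Each rule is converted to an NFA, determinized, and simplified
        as it is read; the start symbol is the first rule in the file.
        """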
        dfas = {}
        startsymbol: Optional[str] = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != token.ENDMARKER:
            while self.type == token.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(token.NAME)
            self.expect(token.OP, ":")
            a, z = self.parse_rhs()
            self.expect(token.NEWLINE)
            # self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            # self.dump_dfa(name, dfa)
            # oldlen = len(dfa)
            self.simplify_dfa(dfa)
            # newlen = len(dfa)
            dfas[name] = dfa
            # print(name, oldlen, newlen)
            if startsymbol is None:
                startsymbol = name
        assert startsymbol is not None
        return dfas, startsymbol

    def make_dfa(self, start: "NFAState", finish: "NFAState") -> list["DFAState"]:
        # To turn an NFA into a DFA, we let each DFA state correspond to
        # a *set* of NFA states (the classic subset construction), then
        # do some state reduction afterwards. A set of NFA states is
        # represented as a dict whose values are all 1.
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)

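        # closure() computes the epsilon-closure of an NFA state: the
        # state itself plus everything reachable through unlabeled
        # (epsilon) arcs. The loop below then groups each closure's
        # outgoing labeled arcs to form the DFA's transitions.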
        def closure(state: NFAState) -> dict[NFAState, int]:
            base: dict[NFAState, int] = {}
            addclosure(state, base)
            return base

        def addclosure(state: NFAState, base: dict[NFAState, int]) -> None:
            assert isinstance(state, NFAState)
            if state in base:
                return
            base[state] = 1
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)

        states = [DFAState(closure(start), finish)]
        for state in states:  # NB states grows while we're iterating
            arcs: dict[str, dict[NFAState, int]] = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, {}))
            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states  # List of DFAState instances; first one is start

    def dump_nfa(self, name: str, start: "NFAState", finish: "NFAState") -> None:
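        """Print a human-readable dump of the NFA for rule *name* (debug aid)."""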
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print("  State", i, "(final)" if state is finish else "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print(f"    -> {j}")
                else:
                    print(f"    {label} -> {j}")

    def dump_dfa(self, name: str, dfa: Sequence["DFAState"]) -> None:
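        """Print a human-readable dump of the DFA for rule *name* (debug aid)."""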
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, "(final)" if state.isfinal else "")
            for label, next in sorted(state.arcs.items()):
                print(f"    {label} -> {dfa.index(next)}")

    def simplify_dfa(self, dfa: list["DFAState"]) -> None:
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.

        # dfa is a list of DFAState instances
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i + 1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        # print("unify", i, j)
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

    def parse_rhs(self) -> tuple["NFAState", "NFAState"]:
        # RHS: ALT ('|' ALT)*
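        # Multiple alternatives are joined through fresh start and end
        # states with epsilon arcs, so each alternative's sub-NFA stays
        # independent: aa --eps--> ALT --eps--> zz.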
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz

    def parse_alt(self) -> tuple["NFAState", "NFAState"]:
        # ALT: ITEM+
        a, b = self.parse_item()
        while self.value in ("(", "[") or self.type in (token.NAME, token.STRING):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self) -> tuple["NFAState", "NFAState"]:
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
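        # '[...]' adds an epsilon arc skipping the body (zero or one).
        # 'x+' adds a back arc z -> a (one or more); 'x*' also returns
        # (a, a) so the loop may be skipped entirely (zero or more).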
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a

    def parse_atom(self) -> tuple["NFAState", "NFAState"]:
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, ")")
            return a, z
        elif self.type in (token.NAME, token.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error(
                f"expected (...) or NAME or STRING, got {self.type}/{self.value}"
            )

    def expect(self, type: int, value: Optional[Any] = None) -> str:
        if self.type != type or (value is not None and self.value != value):
            self.raise_error(f"expected {type}/{value}, got {self.type}/{self.value}")
        value = self.value
        self.gettoken()
        return value

    def gettoken(self) -> None:
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup
        # print(token.tok_name[self.type], repr(self.value))

    def raise_error(self, msg: str) -> NoReturn:
        raise SyntaxError(
            msg, (str(self.filename), self.end[0], self.end[1], self.line)
        )


class NFAState:
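    """A state of a nondeterministic finite automaton.

    Arcs labeled None are epsilon transitions; string labels name a
    grammar symbol, token type, or quoted terminal.
    """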
    arcs: list[tuple[Optional[str], "NFAState"]]

    def __init__(self) -> None:
        self.arcs = []  # list of (label, NFAState) pairs

    def addarc(self, next: "NFAState", label: Optional[str] = None) -> None:
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))


class DFAState:
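    """A DFA state: a set of NFA states plus deterministic labeled arcs.

    The state is final iff the set contains the NFA's final state.
    """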
    nfaset: dict[NFAState, Any]
    isfinal: bool
    arcs: dict[str, "DFAState"]

    def __init__(self, nfaset: dict[NFAState, Any], final: NFAState) -> None:
        assert isinstance(nfaset, dict)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {}  # map from label to DFAState

    def addarc(self, next: "DFAState", label: str) -> None:
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old: "DFAState", new: "DFAState") -> None:
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other: Any) -> bool:
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

    __hash__: Any = None  # For Py3 compatibility.


def generate_grammar(filename: Path = "Grammar.txt") -> PgenGrammar:
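    """Build a PgenGrammar from the grammar file at *filename*.

    A minimal usage sketch (assuming a grammar file exists at the given
    path; "Grammar.txt" is just the conventional default):

        g = generate_grammar("Grammar.txt")
        start_name = g.number2symbol[g.start]  # name of the start rule
    """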
    p = ParserGenerator(filename)
    return p.make_grammar()