Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/blib2to3/pgen2/pgen.py: 91%

317 statements

# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

# Pgen imports
from . import grammar, token, tokenize

from typing import (
    Any,
    Dict,
    IO,
    Iterator,
    List,
    Optional,
    Text,
    Tuple,
    Union,
    Sequence,
    NoReturn,
)
from blib2to3.pgen2 import grammar
from blib2to3.pgen2.tokenize import GoodTokenInfo
import os


Path = Union[str, "os.PathLike[str]"]


class PgenGrammar(grammar.Grammar):
    pass


class ParserGenerator(object):
    filename: Path
    stream: IO[Text]
    generator: Iterator[GoodTokenInfo]
    first: Dict[Text, Optional[Dict[Text, int]]]

    def __init__(self, filename: Path, stream: Optional[IO[Text]] = None) -> None:
        close_stream = None
        if stream is None:
            stream = open(filename, encoding="utf-8")
            close_stream = stream.close
        self.filename = filename
        self.stream = stream
        self.generator = tokenize.generate_tokens(stream.readline)
        self.gettoken()  # Initialize lookahead
        self.dfas, self.startsymbol = self.parse()
        if close_stream is not None:
            close_stream()
        self.first = {}  # map from symbol name to set of tokens
        self.addfirstsets()
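
    # Pipeline overview (added note, not in the original source): __init__
    # tokenizes the grammar file, parse() builds an NFA per rule, make_dfa()
    # converts each NFA into a DFA that simplify_dfa() then prunes, and
    # addfirstsets() computes the FIRST sets that make_grammar() needs when
    # emitting the parser tables.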

    def make_grammar(self) -> PgenGrammar:
        c = PgenGrammar()
        names = list(self.dfas.keys())
        names.sort()
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[name] = i
            c.number2symbol[i] = name
        for name in names:
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    arcs.append((self.make_label(c, label), dfa.index(next)))
                if state.isfinal:
                    arcs.append((0, dfa.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
        c.start = c.symbol2number[self.startsymbol]
        return c
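
    # Shape of the result (added note, not in the original source): for a
    # one-rule grammar such as "start: NAME NEWLINE", make_grammar() numbers
    # the start symbol 256 (symbol numbers begin at 256 so they cannot
    # collide with token numbers), and every accepting DFA state receives an
    # extra (0, itself) arc, which is how the parser recognizes completion.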

    def make_first(self, c: PgenGrammar, name: Text) -> Dict[int, int]:
        rawfirst = self.first[name]
        assert rawfirst is not None
        first = {}
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first[ilabel] = 1
        return first

    def make_label(self, c: PgenGrammar, label: Text) -> int:
        # XXX Maybe this should be a method on a subclass of converter?
        ilabel = len(c.labels)
        if label[0].isalpha():
            # Either a symbol name or a named token
            if label in c.symbol2number:
                # A symbol name (a non-terminal)
                if label in c.symbol2label:
                    return c.symbol2label[label]
                else:
                    c.labels.append((c.symbol2number[label], None))
                    c.symbol2label[label] = ilabel
                    return ilabel
            else:
                # A named token (NAME, NUMBER, STRING)
                itoken = getattr(token, label, None)
                assert isinstance(itoken, int), label
                assert itoken in token.tok_name, label
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
        else:
            # Either a keyword or an operator
            assert label[0] in ('"', "'"), label
            value = eval(label)
            if value[0].isalpha():
                if label[0] == '"':
                    keywords = c.soft_keywords
                else:
                    keywords = c.keywords

                # A keyword
                if value in keywords:
                    return keywords[value]
                else:
                    c.labels.append((token.NAME, value))
                    keywords[value] = ilabel
                    return ilabel
            else:
                # An operator (any non-numeric token)
                itoken = grammar.opmap[value]  # Fails if unknown token
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
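
    # Label taxonomy (added note, not in the original source): make_label()
    # distinguishes four cases -- non-terminal names, named tokens such as
    # NAME or NUMBER, quoted keywords (a double-quoted label is registered as
    # a soft keyword, a single-quoted one as a hard keyword), and quoted
    # operators resolved through grammar.opmap.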

    def addfirstsets(self) -> None:
        names = list(self.dfas.keys())
        names.sort()
        for name in names:
            if name not in self.first:
                self.calcfirst(name)
        # print name, self.first[name].keys()

    def calcfirst(self, name: Text) -> None:
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        state = dfa[0]
        totalset: Dict[str, int] = {}
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calcfirst(label)
                    fset = self.first[label]
                    assert fset is not None
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset[label] = 1
                overlapcheck[label] = {label: 1}
        inverse: Dict[str, str] = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError(
                        "rule %s is ambiguous; %s is in the first sets of %s as well"
                        " as %s" % (name, symbol, label, inverse[symbol])
                    )
                inverse[symbol] = label
        self.first[name] = totalset
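
    # Worked example (added note, not in the original source): given
    #     item: NAME | NUMBER
    #     stmt: item ';'
    # calcfirst("stmt") recurses into "item", so self.first["stmt"] becomes
    # {"NAME": 1, "NUMBER": 1}.  A left-recursive rule such as
    # "expr: expr '+' NAME" finds the None placeholder for its own name and
    # raises ValueError("recursion for rule 'expr'").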

    def parse(self) -> Tuple[Dict[Text, List["DFAState"]], Text]:
        dfas = {}
        startsymbol: Optional[str] = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != token.ENDMARKER:
            while self.type == token.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(token.NAME)
            self.expect(token.OP, ":")
            a, z = self.parse_rhs()
            self.expect(token.NEWLINE)
            # self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            # self.dump_dfa(name, dfa)
            oldlen = len(dfa)
            self.simplify_dfa(dfa)
            newlen = len(dfa)
            dfas[name] = dfa
            # print name, oldlen, newlen
            if startsymbol is None:
                startsymbol = name
        assert startsymbol is not None
        return dfas, startsymbol

    def make_dfa(self, start: "NFAState", finish: "NFAState") -> List["DFAState"]:
        # To turn an NFA into a DFA, we define the states of the DFA
        # to correspond to *sets* of states of the NFA.  Then do some
        # state reduction.  Let's represent sets as dicts with 1 for
        # values.
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)

        def closure(state: NFAState) -> Dict[NFAState, int]:
            base: Dict[NFAState, int] = {}
            addclosure(state, base)
            return base

        def addclosure(state: NFAState, base: Dict[NFAState, int]) -> None:
            assert isinstance(state, NFAState)
            if state in base:
                return
            base[state] = 1
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)

        states = [DFAState(closure(start), finish)]
        for state in states:  # NB states grows while we're iterating
            arcs: Dict[str, Dict[NFAState, int]] = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, {}))
            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states  # List of DFAState instances; first one is start
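
    # Subset-construction example (added note, not in the original source):
    # for "a: NAME*" the epsilon closure of the NFA start already contains
    # the final state, so the first DFAState is accepting; after
    # simplify_dfa() a single accepting state remains whose only arc is a
    # "NAME" loop back to itself.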

    def dump_nfa(self, name: Text, start: "NFAState", finish: "NFAState") -> None:
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print("  State", i, state is finish and "(final)" or "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print("    -> %d" % j)
                else:
                    print("    %s -> %d" % (label, j))

    def dump_dfa(self, name: Text, dfa: Sequence["DFAState"]) -> None:
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, state.isfinal and "(final)" or "")
            for label, next in sorted(state.arcs.items()):
                print("    %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa: List["DFAState"]) -> None:
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.

        # dfa is a list of DFAState instances
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i + 1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        # print "  unify", i, j
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break
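
    # Example (added note, not in the original source): two states compare
    # equal under DFAState.__eq__ when they agree on isfinal and have
    # identical arcs label-for-label; the duplicate is deleted and
    # unifystate() retargets every arc that pointed at it.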

    def parse_rhs(self) -> Tuple["NFAState", "NFAState"]:
        # RHS: ALT ('|' ALT)*
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz

    def parse_alt(self) -> Tuple["NFAState", "NFAState"]:
        # ALT: ITEM+
        a, b = self.parse_item()
        while self.value in ("(", "[") or self.type in (token.NAME, token.STRING):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self) -> Tuple["NFAState", "NFAState"]:
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a

    def parse_atom(self) -> Tuple["NFAState", "NFAState"]:
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, ")")
            return a, z
        elif self.type in (token.NAME, token.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error(
                "expected (...) or NAME or STRING, got %s/%s", self.type, self.value
            )
            assert False
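
    # NFA fragments built by the parse_* methods above (added note, not in
    # the original source):
    #     X | Y  -- fresh start/end states with epsilon arcs around each branch
    #     [X]    -- X plus an epsilon arc from its start to its end (optional)
    #     X+     -- X plus an epsilon arc from its end back to its start
    #     X*     -- like X+, but returning (a, a) so zero repetitions accept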

    def expect(self, type: int, value: Optional[Any] = None) -> Text:
        if self.type != type or (value is not None and self.value != value):
            self.raise_error(
                "expected %s/%s, got %s/%s", type, value, self.type, self.value
            )
        value = self.value
        self.gettoken()
        return value

    def gettoken(self) -> None:
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup
        # print token.tok_name[self.type], repr(self.value)

    def raise_error(self, msg: str, *args: Any) -> NoReturn:
        if args:
            try:
                msg = msg % args
            except:
                msg = " ".join([msg] + list(map(str, args)))
        raise SyntaxError(msg, (self.filename, self.end[0], self.end[1], self.line))


class NFAState(object):
    arcs: List[Tuple[Optional[Text], "NFAState"]]

    def __init__(self) -> None:
        self.arcs = []  # list of (label, NFAState) pairs

    def addarc(self, next: "NFAState", label: Optional[Text] = None) -> None:
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))


class DFAState(object):
    nfaset: Dict[NFAState, Any]
    isfinal: bool
    arcs: Dict[Text, "DFAState"]

    def __init__(self, nfaset: Dict[NFAState, Any], final: NFAState) -> None:
        assert isinstance(nfaset, dict)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {}  # map from label to DFAState

    def addarc(self, next: "DFAState", label: Text) -> None:
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old: "DFAState", new: "DFAState") -> None:
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other: Any) -> bool:
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

    __hash__: Any = None  # For Py3 compatibility.


def generate_grammar(filename: Path = "Grammar.txt") -> PgenGrammar:
    p = ParserGenerator(filename)
    return p.make_grammar()
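

# Example usage (illustrative sketch, not part of the original module; it
# assumes a pgen-style grammar file exists at the given path):
#
#     g = generate_grammar("Grammar.txt")
#     print(g.start)            # numeric symbol number of the start rule
#     g.dump("Grammar.pickle")  # Grammar.dump() is defined in
#                               # blib2to3.pgen2.grammar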