1"""A convenience which constructs expression trees from an easy-to-read syntax 

2 

3Use this unless you have a compelling reason not to; it performs some 

4optimizations that would be tedious to do when constructing an expression tree 

5by hand. 

6 

7""" 

8from collections import OrderedDict 

9from textwrap import dedent 

10 

11from parsimonious.exceptions import BadGrammar, UndefinedLabel 

12from parsimonious.expressions import (Literal, Regex, Sequence, OneOf, 

13 Lookahead, Quantifier, Optional, ZeroOrMore, OneOrMore, Not, TokenMatcher, 

14 expression, is_callable) 

15from parsimonious.nodes import NodeVisitor 

16from parsimonious.utils import evaluate_string 

17 


class Grammar(OrderedDict):
    """A collection of rules that describe a language

    You can start parsing from the default rule by calling ``parse()``
    directly on the ``Grammar`` object::

        g = Grammar('''
                    polite_greeting = greeting ", my good " title
                    greeting = "Hi" / "Hello"
                    title = "madam" / "sir"
                    ''')
        g.parse('Hello, my good sir')

    Or start parsing from any of the other rules; you can pull them out of the
    grammar as if it were a dictionary::

        g['title'].parse('sir')

    You could also just construct a bunch of ``Expression`` objects yourself
    and stitch them together into a language, but using a ``Grammar`` has some
    important advantages:

    * Languages are much easier to define in the nice syntax it provides.
    * Circular references aren't a pain.
    * It does all kinds of whizzy space- and time-saving optimizations, like
      factoring up repeated subexpressions into a single object, which should
      increase cache hit ratio. [Is this implemented yet?]

    """
    def __init__(self, rules='', **more_rules):
        """Construct a grammar.

        :arg rules: A string of production rules, one per line. The first
            rule becomes the :term:`default rule`, the one invoked when you
            call :meth:`parse()` or :meth:`match()` on the grammar; the
            default rule falls back to None if there are no string-based
            rules in this grammar.
        :arg more_rules: Additional kwargs whose names are rule names and
            values are Expressions or custom-coded callables which accomplish
            things the built-in rule syntax cannot. These take precedence over
            ``rules`` in case of naming conflicts.

        """

        decorated_custom_rules = {
            k: (expression(v, k, self) if is_callable(v) else v)
            for k, v in more_rules.items()}

        exprs, first = self._expressions_from_rules(rules, decorated_custom_rules)
        super().__init__(exprs.items())
        self.default_rule = first  # may be None
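
    # A usage sketch (illustrative, not part of the shipped code): custom
    # rules passed as keyword arguments override same-named string rules, per
    # the docstring above. `greeting` is just an example name.
    #
    #     from parsimonious.expressions import Literal
    #     g = Grammar('greeting = "Hi"', greeting=Literal('Hello'))
    #     g.parse('Hello')  # the kwarg-supplied rule wins the name conflict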

    def default(self, rule_name):
        """Return a new Grammar whose :term:`default rule` is ``rule_name``."""
        new = self._copy()
        new.default_rule = new[rule_name]
        return new
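
    # For example (a sketch, reusing the grammar from the class docstring):
    #
    #     g.default('title').parse('sir')  # parse from `title` instead of
    #                                      # the first rule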

    def _copy(self):
        """Return a shallow copy of myself.

        Deep is unnecessary, since Expression trees are immutable. Subgrammars
        recreate all the Expressions from scratch, and AbstractGrammars have
        no Expressions.

        """
        new = Grammar.__new__(Grammar)
        super(Grammar, new).__init__(self.items())
        new.default_rule = self.default_rule
        return new

    def _expressions_from_rules(self, rules, custom_rules):
        """Return a 2-tuple: a dict of rule names pointing to their
        expressions, and then the first rule.

        It's a web of expressions, all referencing each other. Typically,
        there's a single root to the web of references, and that root is the
        starting symbol for parsing, but there's nothing saying you can't have
        multiple roots.

        :arg custom_rules: A map of rule names to custom-coded rules:
            Expressions

        """
        tree = rule_grammar.parse(rules)
        return RuleVisitor(custom_rules).visit(tree)

    def parse(self, text, pos=0):
        """Parse some text with the :term:`default rule`.

        :arg pos: The index at which to start parsing

        """
        self._check_default_rule()
        return self.default_rule.parse(text, pos=pos)

    def match(self, text, pos=0):
        """Parse some text with the :term:`default rule` but not necessarily
        all the way to the end.

        :arg pos: The index at which to start parsing

        """
        self._check_default_rule()
        return self.default_rule.match(text, pos=pos)
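
    # The difference, as a sketch (not shipped code):
    #
    #     g = Grammar('word = ~"[a-z]+"')
    #     g.parse('hello')         # OK: consumes the entire string
    #     g.match('hello, world')  # OK: matches just the leading "hello"
    #     g.parse('hello, world')  # raises an error, since parse() must
    #                              # consume all of the text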

    def _check_default_rule(self):
        """Raise RuntimeError if there is no default rule defined."""
        if not self.default_rule:
            raise RuntimeError("Can't call parse() on a Grammar that has no "
                               "default rule. Choose a specific rule instead, "
                               "like some_grammar['some_rule'].parse(...).")

    def __str__(self):
        """Return a rule string that, when passed to the constructor, would
        reconstitute the grammar."""
        exprs = [self.default_rule] if self.default_rule else []
        exprs.extend(expr for expr in self.values() if
                     expr is not self.default_rule)
        return '\n'.join(expr.as_rule() for expr in exprs)

    def __repr__(self):
        """Return an expression that will reconstitute the grammar."""
        return "Grammar({!r})".format(str(self))


class TokenGrammar(Grammar):
    """A Grammar which takes a list of pre-lexed tokens instead of text

    This is useful if you want to do the lexing yourself, as a separate pass:
    for example, to implement indentation-based languages.

    """
    def _expressions_from_rules(self, rules, custom_rules):
        tree = rule_grammar.parse(rules)
        return TokenRuleVisitor(custom_rules).visit(tree)
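
# A usage sketch (illustrative, not shipped code), assuming
# ``parsimonious.utils.Token`` as the token type: literals in a TokenGrammar
# match tokens by their ``type`` attribute rather than by text.
#
#     from parsimonious.utils import Token
#     tg = TokenGrammar('stmt = "NAME" "EQUALS" "NAME"')
#     tg.parse([Token('NAME'), Token('EQUALS'), Token('NAME')])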


class BootstrappingGrammar(Grammar):
    """The grammar used to recognize the textual rules that describe other
    grammars

    This grammar gets its start from some hard-coded Expressions and claws its
    way from there to an expression tree that describes how to parse the
    grammar description syntax.

    """
    def _expressions_from_rules(self, rule_syntax, custom_rules):
        """Return the rules for parsing the grammar definition syntax.

        Return a 2-tuple: a dict of rule names pointing to their expressions,
        and then the top-level expression for the first rule.

        """
        # Hard-code enough of the rules to parse the grammar that describes
        # the grammar description language, to bootstrap:
        comment = Regex(r'#[^\r\n]*', name='comment')
        meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness')
        _ = ZeroOrMore(meaninglessness, name='_')
        equals = Sequence(Literal('='), _, name='equals')
        label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label')
        reference = Sequence(label, Not(equals), name='reference')
        quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier')
        # This pattern supports empty literals. TODO: A problem?
        spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"',
                                  ignore_case=True,
                                  dot_all=True,
                                  name='spaceless_literal')
        literal = Sequence(spaceless_literal, _, name='literal')
        regex = Sequence(Literal('~'),
                         literal,
                         Regex('[ilmsuxa]*', ignore_case=True),
                         _,
                         name='regex')
        atom = OneOf(reference, literal, regex, name='atom')
        quantified = Sequence(atom, quantifier, name='quantified')

        term = OneOf(quantified, atom, name='term')
        not_term = Sequence(Literal('!'), term, _, name='not_term')
        term.members = (not_term,) + term.members

        sequence = Sequence(term, OneOrMore(term), name='sequence')
        or_term = Sequence(Literal('/'), _, term, name='or_term')
        ored = Sequence(term, OneOrMore(or_term), name='ored')
        expression = OneOf(ored, sequence, term, name='expression')
        rule = Sequence(label, equals, expression, name='rule')
        rules = Sequence(_, OneOrMore(rule), name='rules')

        # Use those hard-coded rules to parse the (more extensive) rule
        # syntax. (For example, unless I start using parentheses in the rule
        # language definition itself, I should never have to hard-code
        # expressions for those above.)
        rule_tree = rules.parse(rule_syntax)

        # Turn the parse tree into a map of expressions:
        return RuleVisitor().visit(rule_tree)


# The grammar for parsing PEG grammar definitions:
# This is a nice, simple grammar. We may someday add to it, but it's a safe
# bet that the future will always be a superset of this.
rule_syntax = (r'''
    # Ignored things (represented by _) are typically hung off the end of the
    # leafmost kinds of nodes. Literals like "/" count as leaves.

    rules = _ rule*
    rule = label equals expression
    equals = "=" _
    literal = spaceless_literal _

    # So you can't spell a regex like `~"..." ilm`:
    spaceless_literal = ~"u?r?b?\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is /
                        ~"u?r?b?'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'"is

    expression = ored / sequence / term
    or_term = "/" _ term
    ored = term or_term+
    sequence = term term+
    not_term = "!" term _
    lookahead_term = "&" term _
    term = not_term / lookahead_term / quantified / atom
    quantified = atom quantifier
    atom = reference / literal / regex / parenthesized
    regex = "~" spaceless_literal ~"[ilmsuxa]*"i _
    parenthesized = "(" _ expression ")" _
    quantifier = ~r"[*+?]|\{\d*,\d+\}|\{\d+,\d*\}|\{\d+\}" _
    reference = label !equals

    # A subsequent equal sign is the only thing that distinguishes a label
    # (which begins a new rule) from a reference (which is just a pointer to
    # a rule defined somewhere else):
    label = ~"[a-zA-Z_][a-zA-Z_0-9]*(?![\"'])" _

    # _ = ~r"\s*(?:#[^\r\n]*)?\s*"
    _ = meaninglessness*
    meaninglessness = ~r"\s+" / comment
    comment = ~r"#[^\r\n]*"
    ''')
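
# A sketch of the richer syntax this full grammar supports beyond the
# bootstrap grammar above, such as parenthesized groups and bounded {min,max}
# quantifiers (the rule names here are made up for illustration):
#
#     g = Grammar('''
#         numbers = number (", " number)*
#         number  = digit{1,3}
#         digit   = ~"[0-9]"
#     ''')
#     g.parse('1, 22, 333')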


class LazyReference(str):
    """A lazy reference to a rule, which we resolve after grokking all the
    rules"""

    name = ''

    def resolve_refs(self, rule_map):
        """
        Traverse the rule map following top-level lazy references,
        until we reach a cycle (raise an error) or a concrete expression.

        For example, the following is a circular reference:
            foo = bar
            bar = foo2
            foo2 = foo

        Note that every RHS of a grammar rule _must_ be either a
        LazyReference or a concrete expression, so the reference chain will
        eventually either terminate or find a cycle.
        """
        seen = set()
        cur = self
        while True:
            if cur in seen:
                raise BadGrammar(f"Circular Reference resolving {self.name}={self}.")
            else:
                seen.add(cur)
            try:
                cur = rule_map[str(cur)]
            except KeyError:
                raise UndefinedLabel(cur)
            if not isinstance(cur, LazyReference):
                return cur

    # Just for debugging:
    def _as_rhs(self):
        return '<LazyReference to %s>' % self
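
# Concretely (a sketch, not shipped code): a grammar whose references form a
# cycle raises BadGrammar, while a reference to a rule that is never defined
# raises UndefinedLabel.
#
#     Grammar('foo = bar\nbar = foo')  # raises BadGrammar (circular)
#     Grammar('foo = bar')             # raises UndefinedLabel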


class RuleVisitor(NodeVisitor):
    """Turns a parse tree of a grammar definition into a map of ``Expression``
    objects

    This is the magic piece that breathes life into a parsed bunch of parse
    rules, allowing them to go forth and parse other things.

    """
    quantifier_classes = {'?': Optional, '*': ZeroOrMore, '+': OneOrMore}

    visit_expression = visit_term = visit_atom = NodeVisitor.lift_child

    def __init__(self, custom_rules=None):
        """Construct.

        :arg custom_rules: A dict of {rule name: expression} holding custom
            rules which will take precedence over the others

        """
        self.custom_rules = custom_rules or {}
        self._last_literal_node_and_type = None

    def visit_parenthesized(self, node, parenthesized):
        """Treat a parenthesized subexpression as just its contents.

        Its position in the tree suffices to maintain its grouping semantics.

        """
        left_paren, _, expression, right_paren, _ = parenthesized
        return expression

    def visit_quantifier(self, node, quantifier):
        """Turn a quantifier into just its symbol-matching node."""
        symbol, _ = quantifier
        return symbol

    def visit_quantified(self, node, quantified):
        atom, quantifier = quantified
        try:
            return self.quantifier_classes[quantifier.text](atom)
        except KeyError:
            # This should pass: assert re.fullmatch(r"\{(\d*)(,(\d*))?\}", quantifier.text)
            quantifier = quantifier.text[1:-1].split(",")
            if len(quantifier) == 1:
                min_match = max_match = int(quantifier[0])
            else:
                min_match = int(quantifier[0]) if quantifier[0] else 0
                max_match = int(quantifier[1]) if quantifier[1] else float('inf')
            return Quantifier(atom, min=min_match, max=max_match)
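
    # How the branches above map syntax to Quantifier bounds (a sketch, not
    # shipped code):
    #
    #     "x"{3}    ->  min=3, max=3
    #     "x"{2,4}  ->  min=2, max=4
    #     "x"{,4}   ->  min=0, max=4
    #     "x"{2,}   ->  min=2, max=inf
    #
    #     Grammar('xs = "x"{2,4}').parse('xxx')  # matches within the bounds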

    def visit_lookahead_term(self, node, lookahead_term):
        ampersand, term, _ = lookahead_term
        return Lookahead(term)

    def visit_not_term(self, node, not_term):
        exclamation, term, _ = not_term
        return Not(term)

    def visit_rule(self, node, rule):
        """Assign a name to the Expression and return it."""
        label, equals, expression = rule
        expression.name = label  # Assign a name to the expr.
        return expression

    def visit_sequence(self, node, sequence):
        """A parsed Sequence looks like [term node, OneOrMore node of
        ``another_term``s]. Flatten it out."""
        term, other_terms = sequence
        return Sequence(term, *other_terms)

    def visit_ored(self, node, ored):
        first_term, other_terms = ored
        return OneOf(first_term, *other_terms)

    def visit_or_term(self, node, or_term):
        """Return just the term from an ``or_term``.

        We already know it's going to be ored, from the containing ``ored``.

        """
        slash, _, term = or_term
        return term

    def visit_label(self, node, label):
        """Turn a label into a unicode string."""
        name, _ = label
        return name.text

    def visit_reference(self, node, reference):
        """Stick a :class:`LazyReference` in the tree as a placeholder.

        We resolve them all later.

        """
        label, not_equals = reference
        return LazyReference(label)

    def visit_regex(self, node, regex):
        """Return a ``Regex`` expression."""
        tilde, literal, flags, _ = regex
        flags = flags.text.upper()
        pattern = literal.literal  # Pull the string back out of the Literal
                                   # object.
        return Regex(pattern, ignore_case='I' in flags,
                     locale='L' in flags,
                     multiline='M' in flags,
                     dot_all='S' in flags,
                     unicode='U' in flags,
                     verbose='X' in flags,
                     ascii='A' in flags)
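
    # For example (a sketch, not shipped code), trailing flags on a regex
    # become the corresponding ``re`` flags:
    #
    #     g = Grammar('word = ~"[a-z]+"i')  # "i" -> ignore_case=True
    #     g.parse('HeLLo')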

    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``Literal`` that recognizes it."""
        literal_value = evaluate_string(spaceless_literal.text)
        if self._last_literal_node_and_type:
            last_node, last_type = self._last_literal_node_and_type
            if last_type != type(literal_value):
                raise BadGrammar(dedent(f"""\
                    Found {last_node.text} ({last_type}) and {spaceless_literal.text} ({type(literal_value)}) string literals.
                    All strings in a single grammar must be of the same type.
                    """)
                )

        self._last_literal_node_and_type = spaceless_literal, type(literal_value)

        return Literal(literal_value)

    def visit_literal(self, node, literal):
        """Pick just the literal out of a literal-and-junk combo."""
        spaceless_literal, _ = literal
        return spaceless_literal

    def generic_visit(self, node, visited_children):
        """Replace childbearing nodes with a list of their children; keep
        others untouched.

        For our case, if a node has children, only the children are important.
        Otherwise, keep the node around for (for example) the flags of the
        regex rule. Most of these kept-around nodes are subsequently thrown
        away by the other visitor methods.

        We can't simply hang the visited children off the original node; that
        would be disastrous if the node occurred in more than one place in the
        tree.

        """
        return visited_children or node  # should semantically be a tuple

    def visit_rules(self, node, rules_list):
        """Collate all the rules into a map. Return (map, default rule).

        The default rule is the first one. Or, if you have more than one rule
        of that name, it's the last-occurring rule of that name. (This lets
        you override the default rule when you extend a grammar.) If there
        are no string-based rules, the default rule is None, because the
        custom rules, due to being kwarg-based, are unordered.

        """
        _, rules = rules_list

        # Map each rule's name to its Expression. Later rules of the same
        # name override earlier ones. This lets us define rules multiple
        # times and have the last declaration win, so you can extend grammars
        # by concatenation.
        rule_map = OrderedDict((expr.name, expr) for expr in rules)

        # And custom rules override string-based rules. This is the least
        # surprising choice when you compare the dict constructor:
        # dict({'x': 5}, x=6).
        rule_map.update(self.custom_rules)

        # Resolve references. This tolerates forward references.
        for name, rule in list(rule_map.items()):
            if hasattr(rule, 'resolve_refs'):
                # Some custom rules may not define a resolve_refs method,
                # though anything that inherits from Expression will have it.
                rule_map[name] = rule.resolve_refs(rule_map)

        # isinstance() is a temporary hack around the fact that * rules don't
        # always get transformed into lists by NodeVisitor. We should fix
        # that; it's surprising and requires writing lame branches like this.
        return rule_map, (rule_map[rules[0].name]
                          if isinstance(rules, list) and rules else None)
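
    # Extension by concatenation, concretely (a sketch, not shipped code):
    # a later definition of the same name wins, so appending rules to an
    # existing grammar string overrides them, default rule included.
    #
    #     base = 'greeting = "Hi"'
    #     Grammar(base + '\ngreeting = "Hello"').parse('Hello')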


class TokenRuleVisitor(RuleVisitor):
    """A visitor which builds expression trees meant to work on sequences of
    pre-lexed tokens rather than strings"""

    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``TokenMatcher`` that matches
        ``Token`` objects by their ``type`` attributes."""
        return TokenMatcher(evaluate_string(spaceless_literal.text))

    def visit_regex(self, node, regex):
        tilde, literal, flags, _ = regex
        raise BadGrammar('Regexes do not make sense in TokenGrammars, since '
                         'TokenGrammars operate on pre-lexed tokens rather '
                         'than characters.')


# Bootstrap to level 1...
rule_grammar = BootstrappingGrammar(rule_syntax)
# ...and then to level 2. This establishes that the node tree of our rule
# syntax is built by the same machinery that will build trees of our users'
# grammars. And the correctness of that tree is tested, indirectly, in
# test_grammar.
rule_grammar = Grammar(rule_syntax)


# TODO: Teach Expression trees how to spit out Python representations of
# themselves. Then we can just paste that in above, and we won't have to
# bootstrap on import. Though it'll be a little less DRY. [Ah, but this is not
# so clean, because it would have to output multiple statements to get
# multiple refs to a single expression hooked up.]