
1"""A convenience which constructs expression trees from an easy-to-read syntax 

2 

3Use this unless you have a compelling reason not to; it performs some 

4optimizations that would be tedious to do when constructing an expression tree 

5by hand. 

6 

7""" 

8from collections import OrderedDict 

9from textwrap import dedent 

10 

11from parsimonious.exceptions import BadGrammar, UndefinedLabel 

12from parsimonious.expressions import (Literal, Regex, Sequence, OneOf, 

13 Lookahead, Quantifier, Optional, ZeroOrMore, OneOrMore, Not, TokenMatcher, 

14 expression, is_callable) 

15from parsimonious.nodes import NodeVisitor 

16from parsimonious.utils import evaluate_string 


class Grammar(OrderedDict):
    """A collection of rules that describe a language

    You can start parsing from the default rule by calling ``parse()``
    directly on the ``Grammar`` object::

        g = Grammar('''
            polite_greeting = greeting ", my good " title
            greeting = "Hi" / "Hello"
            title = "madam" / "sir"
            ''')
        g.parse('Hello, my good sir')

    Or start parsing from any of the other rules; you can pull them out of the
    grammar as if it were a dictionary::

        g['title'].parse('sir')

    You could also just construct a bunch of ``Expression`` objects yourself
    and stitch them together into a language, but using a ``Grammar`` has some
    important advantages:

    * Languages are much easier to define in the nice syntax it provides.
    * Circular references aren't a pain.
    * It does all kinds of whizzy space- and time-saving optimizations, like
      factoring up repeated subexpressions into a single object, which should
      increase cache hit ratio. [Is this implemented yet?]

    """

    def __init__(self, rules='', **more_rules):
        """Construct a grammar.

        :arg rules: A string of production rules, one per line. The first
            rule becomes the :term:`default rule` invoked when you call
            :meth:`parse()` or :meth:`match()` on the grammar; it falls back
            to None if there are no string-based rules in this grammar. Use
            :meth:`default()` to choose a different default rule.
        :arg more_rules: Additional kwargs whose names are rule names and
            values are Expressions or custom-coded callables which accomplish
            things the built-in rule syntax cannot. These take precedence over
            ``rules`` in case of naming conflicts.

        """


        decorated_custom_rules = {
            k: (expression(v, k, self) if is_callable(v) else v)
            for k, v in more_rules.items()}

        exprs, first = self._expressions_from_rules(rules, decorated_custom_rules)
        super().__init__(exprs.items())
        self.default_rule = first  # may be None
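
    # Illustrative sketch (not part of the library): passing a custom rule as
    # a kwarg. This assumes the lazy-callable protocol supported by
    # ``expression()``, where a ``(text, pos)`` callable returns the position
    # after a match or None for no match::
    #
    #     grammar = Grammar('''
    #         bracketed_digit = "[" digit "]"
    #         ''',
    #         digit=lambda text, pos: pos + 1 if text[pos].isdigit() else None)
    #     grammar.parse('[4]')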


    def default(self, rule_name):
        """Return a new Grammar whose :term:`default rule` is ``rule_name``."""
        new = self._copy()
        new.default_rule = new[rule_name]
        return new
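
    # For example (a sketch, reusing ``g`` from the class docstring)::
    #
    #     g.default('title').parse('sir')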


    def _copy(self):
        """Return a shallow copy of myself.

        Deep is unnecessary, since Expression trees are immutable. Subgrammars
        recreate all the Expressions from scratch, and AbstractGrammars have
        no Expressions.

        """
        new = Grammar.__new__(Grammar)
        super(Grammar, new).__init__(self.items())
        new.default_rule = self.default_rule
        return new


    def _expressions_from_rules(self, rules, custom_rules):
        """Return a 2-tuple: a dict of rule names pointing to their
        expressions, and then the first rule.

        It's a web of expressions, all referencing each other. Typically,
        there's a single root to the web of references, and that root is the
        starting symbol for parsing, but there's nothing saying you can't have
        multiple roots.

        :arg custom_rules: A map of rule names to custom-coded rules:
            Expressions

        """
        tree = rule_grammar.parse(rules)
        return RuleVisitor(custom_rules).visit(tree)


    def parse(self, text, pos=0):
        """Parse some text with the :term:`default rule`.

        :arg pos: The index at which to start parsing

        """
        self._check_default_rule()
        return self.default_rule.parse(text, pos=pos)

    def match(self, text, pos=0):
        """Parse some text with the :term:`default rule` but not necessarily
        all the way to the end.

        :arg pos: The index at which to start parsing

        """
        self._check_default_rule()
        return self.default_rule.match(text, pos=pos)
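
    # A sketch of the difference: ``parse()`` must consume the whole string,
    # while ``match()`` may stop early::
    #
    #     g = Grammar('greeting = "Hi"')
    #     g.match('Hi there')  # succeeds, matching just the leading "Hi"
    #     g.parse('Hi there')  # raises IncompleteParseError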


    def _check_default_rule(self):
        """Raise RuntimeError if there is no default rule defined."""
        if not self.default_rule:
            raise RuntimeError("Can't call parse() on a Grammar that has no "
                               "default rule. Choose a specific rule instead, "
                               "like some_grammar['some_rule'].parse(...).")


    def __str__(self):
        """Return a rule string that, when passed to the constructor, would
        reconstitute the grammar."""
        exprs = [self.default_rule] if self.default_rule else []
        exprs.extend(expr for expr in self.values() if
                     expr is not self.default_rule)
        return '\n'.join(expr.as_rule() for expr in exprs)

    def __repr__(self):
        """Return an expression that will reconstitute the grammar."""
        return "Grammar({!r})".format(str(self))



class TokenGrammar(Grammar):
    """A Grammar which takes a list of pre-lexed tokens instead of text

    This is useful if you want to do the lexing yourself, as a separate pass:
    for example, to implement indentation-based languages.

    """
    def _expressions_from_rules(self, rules, custom_rules):
        tree = rule_grammar.parse(rules)
        return TokenRuleVisitor(custom_rules).visit(tree)
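
    # Usage sketch: token rules match ``Token`` objects (from
    # ``parsimonious.utils``) by their ``type`` attribute rather than by
    # characters::
    #
    #     from parsimonious.utils import Token
    #     grammar = TokenGrammar('''
    #         outer = "token1" "token2"
    #         ''')
    #     grammar.parse([Token('token1'), Token('token2')])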



class BootstrappingGrammar(Grammar):
    """The grammar used to recognize the textual rules that describe other
    grammars

    This grammar gets its start from some hard-coded Expressions and claws its
    way from there to an expression tree that describes how to parse the
    grammar description syntax.

    """
    def _expressions_from_rules(self, rule_syntax, custom_rules):
        """Return the rules for parsing the grammar definition syntax.

        Return a 2-tuple: a dict of rule names pointing to their expressions,
        and then the top-level expression for the first rule.

        """
        # Hard-code enough of the rules to parse the grammar that describes the
        # grammar description language, to bootstrap:
        comment = Regex(r'#[^\r\n]*', name='comment')
        meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness')
        _ = ZeroOrMore(meaninglessness, name='_')
        equals = Sequence(Literal('='), _, name='equals')
        label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label')
        reference = Sequence(label, Not(equals), name='reference')
        quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier')
        # This pattern supports empty literals. TODO: A problem?
        spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"',
                                  ignore_case=True,
                                  dot_all=True,
                                  name='spaceless_literal')
        literal = Sequence(spaceless_literal, _, name='literal')
        regex = Sequence(Literal('~'),
                         literal,
                         Regex('[ilmsuxa]*', ignore_case=True),
                         _,
                         name='regex')
        atom = OneOf(reference, literal, regex, name='atom')
        quantified = Sequence(atom, quantifier, name='quantified')

        term = OneOf(quantified, atom, name='term')
        not_term = Sequence(Literal('!'), term, _, name='not_term')
        term.members = (not_term,) + term.members

        sequence = Sequence(term, OneOrMore(term), name='sequence')
        or_term = Sequence(Literal('/'), _, OneOrMore(term), name='or_term')
        ored = Sequence(OneOrMore(term), OneOrMore(or_term), name='ored')
        expression = OneOf(ored, sequence, term, name='expression')
        rule = Sequence(label, equals, expression, name='rule')
        rules = Sequence(_, OneOrMore(rule), name='rules')

        # Use those hard-coded rules to parse the (more extensive) rule syntax.
        # (For example, unless I start using parentheses in the rule language
        # definition itself, I should never have to hard-code expressions for
        # those above.)
        rule_tree = rules.parse(rule_syntax)

        # Turn the parse tree into a map of expressions:
        return RuleVisitor().visit(rule_tree)



# The grammar for parsing PEG grammar definitions:
# This is a nice, simple grammar. We may someday add to it, but it's a safe bet
# that the future will always be a superset of this.
rule_syntax = (r'''
    # Ignored things (represented by _) are typically hung off the end of the
    # leafmost kinds of nodes. Literals like "/" count as leaves.

    rules = _ rule*
    rule = label equals expression
    equals = "=" _
    literal = spaceless_literal _

    # So you can't spell a regex like `~"..." ilm`:
    spaceless_literal = ~"u?r?b?\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is /
                        ~"u?r?b?'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'"is

    expression = ored / sequence / term
    or_term = "/" _ term+
    ored = term+ or_term+
    sequence = term term+
    not_term = "!" term _
    lookahead_term = "&" term _
    term = not_term / lookahead_term / quantified / atom
    quantified = atom quantifier
    atom = reference / literal / regex / parenthesized
    regex = "~" spaceless_literal ~"[ilmsuxa]*"i _
    parenthesized = "(" _ expression ")" _
    quantifier = ~r"[*+?]|\{\d*,\d+\}|\{\d+,\d*\}|\{\d+\}" _
    reference = label !equals

    # A subsequent equal sign is the only thing that distinguishes a label
    # (which begins a new rule) from a reference (which is just a pointer to a
    # rule defined somewhere else):
    label = ~"[a-zA-Z_][a-zA-Z_0-9]*(?![\"'])" _

    # _ = ~r"\s*(?:#[^\r\n]*)?\s*"
    _ = meaninglessness*
    meaninglessness = ~r"\s+" / comment
    comment = ~r"#[^\r\n]*"
    ''')
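
# An illustrative grammar (not part of the library) exercising the syntax
# defined above: ordered repetition, negation, lookahead, and brace
# quantifiers::
#
#     demo = Grammar(r'''
#         line = word (" " word)*
#         word = !"#" &letter letter{1,10}
#         letter = ~r"[a-z]"
#     ''')
#     demo.parse('hello world')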



class LazyReference(str):
    """A lazy reference to a rule, which we resolve after grokking all the
    rules"""

    name = ''

    def resolve_refs(self, rule_map):
        """
        Traverse the rule map following top-level lazy references,
        until we reach a cycle (raise an error) or a concrete expression.

        For example, the following rules form a circular reference::

            foo = bar
            bar = foo2
            foo2 = foo

        Note that every RHS of a grammar rule _must_ be either a
        LazyReference or a concrete expression, so the reference chain will
        eventually either terminate or find a cycle.
        """

        seen = set()
        cur = self
        while True:
            if cur in seen:
                raise BadGrammar(f"Circular Reference resolving {self.name}={self}.")
            else:
                seen.add(cur)
            try:
                cur = rule_map[str(cur)]
            except KeyError:
                raise UndefinedLabel(cur)
            if not isinstance(cur, LazyReference):
                return cur
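
    # A sketch of the failure mode described in the docstring: a grammar
    # whose references form a cycle raises BadGrammar during resolution::
    #
    #     Grammar('''
    #         foo = bar
    #         bar = foo2
    #         foo2 = foo
    #     ''')  # raises BadGrammar: Circular Reference resolving foo=bar.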


    # Just for debugging:
    def _as_rhs(self):
        return '<LazyReference to %s>' % self



class RuleVisitor(NodeVisitor):
    """Turns a parse tree of a grammar definition into a map of ``Expression``
    objects

    This is the magic piece that breathes life into a parsed bunch of parse
    rules, allowing them to go forth and parse other things.

    """
    quantifier_classes = {'?': Optional, '*': ZeroOrMore, '+': OneOrMore}

    visit_expression = visit_term = visit_atom = NodeVisitor.lift_child

    def __init__(self, custom_rules=None):
        """Construct.

        :arg custom_rules: A dict of {rule name: expression} holding custom
            rules which will take precedence over the others

        """
        self.custom_rules = custom_rules or {}
        self._last_literal_node_and_type = None


    def visit_parenthesized(self, node, parenthesized):
        """Treat a parenthesized subexpression as just its contents.

        Its position in the tree suffices to maintain its grouping semantics.

        """
        left_paren, _, expression, right_paren, _ = parenthesized
        return expression

    def visit_quantifier(self, node, quantifier):
        """Turn a quantifier into just its symbol-matching node."""
        symbol, _ = quantifier
        return symbol


    def visit_quantified(self, node, quantified):
        atom, quantifier = quantified
        try:
            return self.quantifier_classes[quantifier.text](atom)
        except KeyError:
            # This should pass: assert re.fullmatch(r"\{(\d*)(,(\d*))?\}", quantifier.text)
            quantifier = quantifier.text[1:-1].split(",")
            if len(quantifier) == 1:
                min_match = max_match = int(quantifier[0])
            else:
                min_match = int(quantifier[0]) if quantifier[0] else 0
                max_match = int(quantifier[1]) if quantifier[1] else float('inf')
            return Quantifier(atom, min=min_match, max=max_match)
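
    # How the branches above map quantifiers (a sketch):
    #
    #     "a?" / "a*" / "a+"  ->  Optional / ZeroOrMore / OneOrMore
    #     "a{3}"              ->  Quantifier(min=3, max=3)    # exact count
    #     "a{2,4}"            ->  Quantifier(min=2, max=4)    # bounded range
    #     "a{,4}"             ->  Quantifier(min=0, max=4)    # open lower bound
    #     "a{2,}"             ->  Quantifier(min=2, max=inf)  # open upper bound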


    def visit_lookahead_term(self, node, lookahead_term):
        ampersand, term, _ = lookahead_term
        return Lookahead(term)

    def visit_not_term(self, node, not_term):
        exclamation, term, _ = not_term
        return Not(term)

    def visit_rule(self, node, rule):
        """Assign a name to the Expression and return it."""
        label, equals, expression = rule
        expression.name = label  # Assign a name to the expr.
        return expression


    def visit_sequence(self, node, sequence):
        """A parsed Sequence looks like [term node, OneOrMore node of
        ``another_term``s]. Flatten it out."""
        term, other_terms = sequence
        return Sequence(term, *other_terms)

    def visit_ored(self, node, ored):
        first_term, other_terms = ored
        if len(first_term) == 1:
            first_term = first_term[0]
        else:
            first_term = Sequence(*first_term)
        return OneOf(first_term, *other_terms)

    def visit_or_term(self, node, or_term):
        """Return just the term from an ``or_term``.

        We already know it's going to be ored, from the containing ``ored``.

        """
        slash, _, terms = or_term
        if len(terms) == 1:
            return terms[0]
        else:
            return Sequence(*terms)


    def visit_label(self, node, label):
        """Turn a label into a unicode string."""
        name, _ = label
        return name.text

    def visit_reference(self, node, reference):
        """Stick a :class:`LazyReference` in the tree as a placeholder.

        We resolve them all later.

        """
        label, not_equals = reference
        return LazyReference(label)

    def visit_regex(self, node, regex):
        """Return a ``Regex`` expression."""
        tilde, literal, flags, _ = regex
        flags = flags.text.upper()
        pattern = literal.literal  # Pull the string back out of the Literal
                                   # object.
        return Regex(pattern, ignore_case='I' in flags,
                     locale='L' in flags,
                     multiline='M' in flags,
                     dot_all='S' in flags,
                     unicode='U' in flags,
                     verbose='X' in flags,
                     ascii='A' in flags)


    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``Literal`` that recognizes it."""
        literal_value = evaluate_string(spaceless_literal.text)
        if self._last_literal_node_and_type:
            last_node, last_type = self._last_literal_node_and_type
            if last_type != type(literal_value):
                raise BadGrammar(dedent(f"""\
                    Found {last_node.text} ({last_type}) and {spaceless_literal.text} ({type(literal_value)}) string literals.
                    All strings in a single grammar must be of the same type.
                    """))

        self._last_literal_node_and_type = spaceless_literal, type(literal_value)

        return Literal(literal_value)


    def visit_literal(self, node, literal):
        """Pick just the literal out of a literal-and-junk combo."""
        spaceless_literal, _ = literal
        return spaceless_literal


    def generic_visit(self, node, visited_children):
        """Replace childbearing nodes with a list of their children; keep
        others untouched.

        For our case, if a node has children, only the children are important.
        Otherwise, keep the node around for (for example) the flags of the
        regex rule. Most of these kept-around nodes are subsequently thrown
        away by the other visitor methods.

        We can't simply hang the visited children off the original node; that
        would be disastrous if the node occurred in more than one place in the
        tree.

        """
        return visited_children or node  # should semantically be a tuple


    def visit_rules(self, node, rules_list):
        """Collate all the rules into a map. Return (map, default rule).

        The default rule is the first one. Or, if you have more than one rule
        of that name, it's the last-occurring rule of that name. (This lets
        you override the default rule when you extend a grammar.) If there are
        no string-based rules, the default rule is None, because the custom
        rules, due to being kwarg-based, are unordered.

        """
        _, rules = rules_list

        # Map each rule's name to its Expression. Later rules of the same name
        # override earlier ones. This lets us define rules multiple times and
        # have the last declaration win, so you can extend grammars by
        # concatenation.
        rule_map = OrderedDict((expr.name, expr) for expr in rules)

        # And custom rules override string-based rules. This is the least
        # surprising choice when you compare the dict constructor:
        # dict({'x': 5}, x=6).
        rule_map.update(self.custom_rules)

        # Resolve references. This tolerates forward references.
        for name, rule in list(rule_map.items()):
            if hasattr(rule, 'resolve_refs'):
                # Some custom rules may not define a resolve_refs method,
                # though anything that inherits from Expression will have it.
                rule_map[name] = rule.resolve_refs(rule_map)

        # isinstance() is a temporary hack around the fact that * rules don't
        # always get transformed into lists by NodeVisitor. We should fix
        # that; it's surprising and requires writing lame branches like this.
        return rule_map, (rule_map[rules[0].name]
                          if isinstance(rules, list) and rules else None)
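
    # A sketch of the grammar-extension behavior noted above: later rules of
    # the same name override earlier ones, so concatenating rule strings
    # overrides rules::
    #
    #     g = Grammar('greeting = "Hi"')
    #     g2 = Grammar(str(g) + '\n' + 'greeting = "Hello"')
    #     g2.parse('Hello')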



class TokenRuleVisitor(RuleVisitor):
    """A visitor which builds expression trees meant to work on sequences of
    pre-lexed tokens rather than strings"""

    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``TokenMatcher`` that matches
        ``Token`` objects by their ``type`` attributes."""
        return TokenMatcher(evaluate_string(spaceless_literal.text))

    def visit_regex(self, node, regex):
        tilde, literal, flags, _ = regex
        raise BadGrammar('Regexes do not make sense in TokenGrammars, since '
                         'TokenGrammars operate on pre-lexed tokens rather '
                         'than characters.')



# Bootstrap to level 1...
rule_grammar = BootstrappingGrammar(rule_syntax)
# ...and then to level 2. This establishes that the node tree of our rule
# syntax is built by the same machinery that will build trees of our users'
# grammars. And the correctness of that tree is tested, indirectly, in
# test_grammar.
rule_grammar = Grammar(rule_syntax)
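
# A sketch of what the two-level bootstrap establishes: the level-2
# ``rule_grammar`` can re-parse its own definition with the same machinery
# that parses user grammars::
#
#     tree = rule_grammar.parse(rule_syntax)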



# TODO: Teach Expression trees how to spit out Python representations of
# themselves. Then we can just paste that in above, and we won't have to
# bootstrap on import. Though it'll be a little less DRY. [Ah, but this is not
# so clean, because it would have to output multiple statements to get
# multiple refs to a single expression hooked up.]