1"""A convenience which constructs expression trees from an easy-to-read syntax
2
3Use this unless you have a compelling reason not to; it performs some
4optimizations that would be tedious to do when constructing an expression tree
5by hand.
6
7"""
8from collections import OrderedDict
9from textwrap import dedent
10
11from parsimonious.exceptions import BadGrammar, UndefinedLabel
12from parsimonious.expressions import (Literal, Regex, Sequence, OneOf,
13 Lookahead, Quantifier, Optional, ZeroOrMore, OneOrMore, Not, TokenMatcher,
14 expression, is_callable)
15from parsimonious.nodes import NodeVisitor
16from parsimonious.utils import evaluate_string
17
18class Grammar(OrderedDict):
19 """A collection of rules that describe a language
20
21 You can start parsing from the default rule by calling ``parse()``
22 directly on the ``Grammar`` object::
23
24 g = Grammar('''
25 polite_greeting = greeting ", my good " title
26 greeting = "Hi" / "Hello"
27 title = "madam" / "sir"
28 ''')
29 g.parse('Hello, my good sir')
30
31 Or start parsing from any of the other rules; you can pull them out of the
32 grammar as if it were a dictionary::
33
34 g['title'].parse('sir')
35
36 You could also just construct a bunch of ``Expression`` objects yourself
37 and stitch them together into a language, but using a ``Grammar`` has some
38 important advantages:
39
40 * Languages are much easier to define in the nice syntax it provides.
41 * Circular references aren't a pain.
42 * It does all kinds of whizzy space- and time-saving optimizations, like
43 factoring up repeated subexpressions into a single object, which should
44 increase cache hit ratio. [Is this implemented yet?]
45
46 """
47 def __init__(self, rules='', **more_rules):
48 """Construct a grammar.
49
50 :arg rules: A string of production rules, one per line.
51 :arg default_rule: The name of the rule invoked when you call
52 :meth:`parse()` or :meth:`match()` on the grammar. Defaults to the
53 first rule. Falls back to None if there are no string-based rules
54 in this grammar.
55 :arg more_rules: Additional kwargs whose names are rule names and
56 values are Expressions or custom-coded callables which accomplish
57 things the built-in rule syntax cannot. These take precedence over
58 ``rules`` in case of naming conflicts.
59
60 """
61
62 decorated_custom_rules = {
63 k: (expression(v, k, self) if is_callable(v) else v)
64 for k, v in more_rules.items()}
65
66 exprs, first = self._expressions_from_rules(rules, decorated_custom_rules)
67 super().__init__(exprs.items())
68 self.default_rule = first # may be None
69
70 def default(self, rule_name):
71 """Return a new Grammar whose :term:`default rule` is ``rule_name``."""
72 new = self._copy()
73 new.default_rule = new[rule_name]
74 return new
75
76 def _copy(self):
77 """Return a shallow copy of myself.
78
79 Deep is unnecessary, since Expression trees are immutable. Subgrammars
80 recreate all the Expressions from scratch, and AbstractGrammars have
81 no Expressions.
82
83 """
84 new = Grammar.__new__(Grammar)
85 super(Grammar, new).__init__(self.items())
86 new.default_rule = self.default_rule
87 return new
88
89 def _expressions_from_rules(self, rules, custom_rules):
90 """Return a 2-tuple: a dict of rule names pointing to their
91 expressions, and then the first rule.
92
93 It's a web of expressions, all referencing each other. Typically,
94 there's a single root to the web of references, and that root is the
95 starting symbol for parsing, but there's nothing saying you can't have
96 multiple roots.
97
98 :arg custom_rules: A map of rule names to custom-coded rules:
99 Expressions
100
101 """
102 tree = rule_grammar.parse(rules)
103 return RuleVisitor(custom_rules).visit(tree)
104
105 def parse(self, text, pos=0):
106 """Parse some text with the :term:`default rule`.
107
108 :arg pos: The index at which to start parsing
109
110 """
111 self._check_default_rule()
112 return self.default_rule.parse(text, pos=pos)
113
114 def match(self, text, pos=0):
115 """Parse some text with the :term:`default rule` but not necessarily
116 all the way to the end.
117
118 :arg pos: The index at which to start parsing
119
120 """
121 self._check_default_rule()
122 return self.default_rule.match(text, pos=pos)
123
124 def _check_default_rule(self):
125 """Raise RuntimeError if there is no default rule defined."""
126 if not self.default_rule:
127 raise RuntimeError("Can't call parse() on a Grammar that has no "
128 "default rule. Choose a specific rule instead, "
129 "like some_grammar['some_rule'].parse(...).")
130
131 def __str__(self):
132 """Return a rule string that, when passed to the constructor, would
133 reconstitute the grammar."""
134 exprs = [self.default_rule] if self.default_rule else []
135 exprs.extend(expr for expr in self.values() if
136 expr is not self.default_rule)
137 return '\n'.join(expr.as_rule() for expr in exprs)
138
139 def __repr__(self):
140 """Return an expression that will reconstitute the grammar."""
141 return "Grammar({!r})".format(str(self))
142
143
144class TokenGrammar(Grammar):
145 """A Grammar which takes a list of pre-lexed tokens instead of text
146
147 This is useful if you want to do the lexing yourself, as a separate pass:
148 for example, to implement indentation-based languages.
149
150 """
151 def _expressions_from_rules(self, rules, custom_rules):
152 tree = rule_grammar.parse(rules)
153 return TokenRuleVisitor(custom_rules).visit(tree)
154
155
156class BootstrappingGrammar(Grammar):
157 """The grammar used to recognize the textual rules that describe other
158 grammars
159
160 This grammar gets its start from some hard-coded Expressions and claws its
161 way from there to an expression tree that describes how to parse the
162 grammar description syntax.
163
164 """
165 def _expressions_from_rules(self, rule_syntax, custom_rules):
166 """Return the rules for parsing the grammar definition syntax.
167
168 Return a 2-tuple: a dict of rule names pointing to their expressions,
169 and then the top-level expression for the first rule.
170
171 """
172 # Hard-code enough of the rules to parse the grammar that describes the
173 # grammar description language, to bootstrap:
174 comment = Regex(r'#[^\r\n]*', name='comment')
175 meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness')
176 _ = ZeroOrMore(meaninglessness, name='_')
177 equals = Sequence(Literal('='), _, name='equals')
178 label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label')
179 reference = Sequence(label, Not(equals), name='reference')
180 quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier')
181 # This pattern supports empty literals. TODO: A problem?
182 spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"',
183 ignore_case=True,
184 dot_all=True,
185 name='spaceless_literal')
186 literal = Sequence(spaceless_literal, _, name='literal')
187 regex = Sequence(Literal('~'),
188 literal,
189 Regex('[ilmsuxa]*', ignore_case=True),
190 _,
191 name='regex')
192 atom = OneOf(reference, literal, regex, name='atom')
193 quantified = Sequence(atom, quantifier, name='quantified')
194
195 term = OneOf(quantified, atom, name='term')
196 not_term = Sequence(Literal('!'), term, _, name='not_term')
197 term.members = (not_term,) + term.members
198
199 sequence = Sequence(term, OneOrMore(term), name='sequence')
200 or_term = Sequence(Literal('/'), _, OneOrMore(term), name='or_term')
201 ored = Sequence(OneOrMore(term), OneOrMore(or_term), name='ored')
202 expression = OneOf(ored, sequence, term, name='expression')
203 rule = Sequence(label, equals, expression, name='rule')
204 rules = Sequence(_, OneOrMore(rule), name='rules')
205
206 # Use those hard-coded rules to parse the (more extensive) rule syntax.
207 # (For example, unless I start using parentheses in the rule language
208 # definition itself, I should never have to hard-code expressions for
209 # those above.)
210
211 rule_tree = rules.parse(rule_syntax)
212
213 # Turn the parse tree into a map of expressions:
214 return RuleVisitor().visit(rule_tree)
215
216
217# The grammar for parsing PEG grammar definitions:
218# This is a nice, simple grammar. We may someday add to it, but it's a safe bet
219# that the future will always be a superset of this.
220rule_syntax = (r'''
221 # Ignored things (represented by _) are typically hung off the end of the
222 # leafmost kinds of nodes. Literals like "/" count as leaves.
223
224 rules = _ rule*
225 rule = label equals expression
226 equals = "=" _
227 literal = spaceless_literal _
228
229 # So you can't spell a regex like `~"..." ilm`:
230 spaceless_literal = ~"u?r?b?\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is /
231 ~"u?r?b?'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'"is
232
233 expression = ored / sequence / term
234 or_term = "/" _ term+
235 ored = term+ or_term+
236 sequence = term term+
237 not_term = "!" term _
238 lookahead_term = "&" term _
239 term = not_term / lookahead_term / quantified / atom
240 quantified = atom quantifier
241 atom = reference / literal / regex / parenthesized
242 regex = "~" spaceless_literal ~"[ilmsuxa]*"i _
243 parenthesized = "(" _ expression ")" _
244 quantifier = ~r"[*+?]|\{\d*,\d+\}|\{\d+,\d*\}|\{\d+\}" _
245 reference = label !equals
246
247 # A subsequent equal sign is the only thing that distinguishes a label
248 # (which begins a new rule) from a reference (which is just a pointer to a
249 # rule defined somewhere else):
250 label = ~"[a-zA-Z_][a-zA-Z_0-9]*(?![\"'])" _
251
252 # _ = ~r"\s*(?:#[^\r\n]*)?\s*"
253 _ = meaninglessness*
254 meaninglessness = ~r"\s+" / comment
255 comment = ~r"#[^\r\n]*"
256 ''')
257
258
259class LazyReference(str):
260 """A lazy reference to a rule, which we resolve after grokking all the
261 rules"""
262
263 name = ''
264
265 def resolve_refs(self, rule_map):
266 """
267 Traverse the rule map following top-level lazy references,
268 until we reach a cycle (raise an error) or a concrete expression.
269
270 For example, the following is a circular reference:
271 foo = bar
272 baz = foo2
273 foo2 = foo
274
275 Note that every RHS of a grammar rule _must_ be either a
276 LazyReference or a concrete expression, so the reference chain will
277 eventually either terminate or find a cycle.
278 """
279 seen = set()
280 cur = self
281 while True:
282 if cur in seen:
283 raise BadGrammar(f"Circular Reference resolving {self.name}={self}.")
284 else:
285 seen.add(cur)
286 try:
287 cur = rule_map[str(cur)]
288 except KeyError:
289 raise UndefinedLabel(cur)
290 if not isinstance(cur, LazyReference):
291 return cur
292
293 # Just for debugging:
294 def _as_rhs(self):
295 return '<LazyReference to %s>' % self
296
297
298class RuleVisitor(NodeVisitor):
299 """Turns a parse tree of a grammar definition into a map of ``Expression``
300 objects
301
302 This is the magic piece that breathes life into a parsed bunch of parse
303 rules, allowing them to go forth and parse other things.
304
305 """
306 quantifier_classes = {'?': Optional, '*': ZeroOrMore, '+': OneOrMore}
307
308 visit_expression = visit_term = visit_atom = NodeVisitor.lift_child
309
310 def __init__(self, custom_rules=None):
311 """Construct.
312
313 :arg custom_rules: A dict of {rule name: expression} holding custom
314 rules which will take precedence over the others
315
316 """
317 self.custom_rules = custom_rules or {}
318 self._last_literal_node_and_type = None
319
320 def visit_parenthesized(self, node, parenthesized):
321 """Treat a parenthesized subexpression as just its contents.
322
323 Its position in the tree suffices to maintain its grouping semantics.
324
325 """
326 left_paren, _, expression, right_paren, _ = parenthesized
327 return expression
328
329 def visit_quantifier(self, node, quantifier):
330 """Turn a quantifier into just its symbol-matching node."""
331 symbol, _ = quantifier
332 return symbol
333
334 def visit_quantified(self, node, quantified):
335 atom, quantifier = quantified
336 try:
337 return self.quantifier_classes[quantifier.text](atom)
338 except KeyError:
339 # This should pass: assert re.full_match("\{(\d*)(,(\d*))?\}", quantifier)
340 quantifier = quantifier.text[1:-1].split(",")
341 if len(quantifier) == 1:
342 min_match = max_match = int(quantifier[0])
343 else:
344 min_match = int(quantifier[0]) if quantifier[0] else 0
345 max_match = int(quantifier[1]) if quantifier[1] else float('inf')
346 return Quantifier(atom, min=min_match, max=max_match)
347
348 def visit_lookahead_term(self, node, lookahead_term):
349 ampersand, term, _ = lookahead_term
350 return Lookahead(term)
351
352 def visit_not_term(self, node, not_term):
353 exclamation, term, _ = not_term
354 return Not(term)
355
356 def visit_rule(self, node, rule):
357 """Assign a name to the Expression and return it."""
358 label, equals, expression = rule
359 expression.name = label # Assign a name to the expr.
360 return expression
361
362 def visit_sequence(self, node, sequence):
363 """A parsed Sequence looks like [term node, OneOrMore node of
364 ``another_term``s]. Flatten it out."""
365 term, other_terms = sequence
366 return Sequence(term, *other_terms)
367
368 def visit_ored(self, node, ored):
369 first_term, other_terms = ored
370 if len(first_term) == 1:
371 first_term = first_term[0]
372 else:
373 first_term = Sequence(*first_term)
374 return OneOf(first_term, *other_terms)
375
376 def visit_or_term(self, node, or_term):
377 """Return just the term from an ``or_term``.
378
379 We already know it's going to be ored, from the containing ``ored``.
380
381 """
382 slash, _, terms = or_term
383 if len(terms) == 1:
384 return terms[0]
385 else:
386 return Sequence(*terms)
387
388 def visit_label(self, node, label):
389 """Turn a label into a unicode string."""
390 name, _ = label
391 return name.text
392
393 def visit_reference(self, node, reference):
394 """Stick a :class:`LazyReference` in the tree as a placeholder.
395
396 We resolve them all later.
397
398 """
399 label, not_equals = reference
400 return LazyReference(label)
401
402 def visit_regex(self, node, regex):
403 """Return a ``Regex`` expression."""
404 tilde, literal, flags, _ = regex
405 flags = flags.text.upper()
406 pattern = literal.literal # Pull the string back out of the Literal
407 # object.
408 return Regex(pattern, ignore_case='I' in flags,
409 locale='L' in flags,
410 multiline='M' in flags,
411 dot_all='S' in flags,
412 unicode='U' in flags,
413 verbose='X' in flags,
414 ascii='A' in flags)
415
416 def visit_spaceless_literal(self, spaceless_literal, visited_children):
417 """Turn a string literal into a ``Literal`` that recognizes it."""
418 literal_value = evaluate_string(spaceless_literal.text)
419 if self._last_literal_node_and_type:
420 last_node, last_type = self._last_literal_node_and_type
421 if last_type != type(literal_value):
422 raise BadGrammar(dedent(f"""\
423 Found {last_node.text} ({last_type}) and {spaceless_literal.text} ({type(literal_value)}) string literals.
424 All strings in a single grammar must be of the same type.
425 """)
426 )
427
428 self._last_literal_node_and_type = spaceless_literal, type(literal_value)
429
430 return Literal(literal_value)
431
432 def visit_literal(self, node, literal):
433 """Pick just the literal out of a literal-and-junk combo."""
434 spaceless_literal, _ = literal
435 return spaceless_literal
436
437 def generic_visit(self, node, visited_children):
438 """Replace childbearing nodes with a list of their children; keep
439 others untouched.
440
441 For our case, if a node has children, only the children are important.
442 Otherwise, keep the node around for (for example) the flags of the
443 regex rule. Most of these kept-around nodes are subsequently thrown
444 away by the other visitor methods.
445
446 We can't simply hang the visited children off the original node; that
447 would be disastrous if the node occurred in more than one place in the
448 tree.
449
450 """
451 return visited_children or node # should semantically be a tuple
452
453 def visit_rules(self, node, rules_list):
454 """Collate all the rules into a map. Return (map, default rule).
455
456 The default rule is the first one. Or, if you have more than one rule
457 of that name, it's the last-occurring rule of that name. (This lets you
458 override the default rule when you extend a grammar.) If there are no
459 string-based rules, the default rule is None, because the custom rules,
460 due to being kwarg-based, are unordered.
461
462 """
463 _, rules = rules_list
464
465 # Map each rule's name to its Expression. Later rules of the same name
466 # override earlier ones. This lets us define rules multiple times and
467 # have the last declaration win, so you can extend grammars by
468 # concatenation.
469 rule_map = OrderedDict((expr.name, expr) for expr in rules)
470
471 # And custom rules override string-based rules. This is the least
472 # surprising choice when you compare the dict constructor:
473 # dict({'x': 5}, x=6).
474 rule_map.update(self.custom_rules)
475
476 # Resolve references. This tolerates forward references.
477 for name, rule in list(rule_map.items()):
478 if hasattr(rule, 'resolve_refs'):
479 # Some custom rules may not define a resolve_refs method,
480 # though anything that inherits from Expression will have it.
481 rule_map[name] = rule.resolve_refs(rule_map)
482
483 # isinstance() is a temporary hack around the fact that * rules don't
484 # always get transformed into lists by NodeVisitor. We should fix that;
485 # it's surprising and requires writing lame branches like this.
486 return rule_map, (rule_map[rules[0].name]
487 if isinstance(rules, list) and rules else None)
488
489
490class TokenRuleVisitor(RuleVisitor):
491 """A visitor which builds expression trees meant to work on sequences of
492 pre-lexed tokens rather than strings"""
493
494 def visit_spaceless_literal(self, spaceless_literal, visited_children):
495 """Turn a string literal into a ``TokenMatcher`` that matches
496 ``Token`` objects by their ``type`` attributes."""
497 return TokenMatcher(evaluate_string(spaceless_literal.text))
498
499 def visit_regex(self, node, regex):
500 tilde, literal, flags, _ = regex
501 raise BadGrammar('Regexes do not make sense in TokenGrammars, since '
502 'TokenGrammars operate on pre-lexed tokens rather '
503 'than characters.')
504
505
506# Bootstrap to level 1...
507rule_grammar = BootstrappingGrammar(rule_syntax)
508# ...and then to level 2. This establishes that the node tree of our rule
509# syntax is built by the same machinery that will build trees of our users'
510# grammars. And the correctness of that tree is tested, indirectly, in
511# test_grammar.
512rule_grammar = Grammar(rule_syntax)
513
514
515# TODO: Teach Expression trees how to spit out Python representations of
516# themselves. Then we can just paste that in above, and we won't have to
517# bootstrap on import. Though it'll be a little less DRY. [Ah, but this is not
518# so clean, because it would have to output multiple statements to get multiple
519# refs to a single expression hooked up.]