1"""A convenience which constructs expression trees from an easy-to-read syntax
3Use this unless you have a compelling reason not to; it performs some
4optimizations that would be tedious to do when constructing an expression tree
5by hand.
7"""
8from collections import OrderedDict
9from textwrap import dedent
11from parsimonious.exceptions import BadGrammar, UndefinedLabel
12from parsimonious.expressions import (Literal, Regex, Sequence, OneOf,
13 Lookahead, Quantifier, Optional, ZeroOrMore, OneOrMore, Not, TokenMatcher,
14 expression, is_callable)
15from parsimonious.nodes import NodeVisitor
16from parsimonious.utils import evaluate_string
class Grammar(OrderedDict):
    """A collection of rules that describe a language

    You can start parsing from the default rule by calling ``parse()``
    directly on the ``Grammar`` object::

        g = Grammar('''
            polite_greeting = greeting ", my good " title
            greeting = "Hi" / "Hello"
            title = "madam" / "sir"
            ''')
        g.parse('Hello, my good sir')

    Or start parsing from any of the other rules; you can pull them out of the
    grammar as if it were a dictionary::

        g['title'].parse('sir')

    You could also just construct a bunch of ``Expression`` objects yourself
    and stitch them together into a language, but using a ``Grammar`` has some
    important advantages:

    * Languages are much easier to define in the nice syntax it provides.
    * Circular references aren't a pain.
    * It does all kinds of whizzy space- and time-saving optimizations, like
      factoring up repeated subexpressions into a single object, which should
      increase cache hit ratio. [Is this implemented yet?]

    """
    def __init__(self, rules='', **more_rules):
        """Construct a grammar.

        :arg rules: A string of production rules, one per line.
        :arg default_rule: The name of the rule invoked when you call
            :meth:`parse()` or :meth:`match()` on the grammar. Defaults to the
            first rule. Falls back to None if there are no string-based rules
            in this grammar.
        :arg more_rules: Additional kwargs whose names are rule names and
            values are Expressions or custom-coded callables which accomplish
            things the built-in rule syntax cannot. These take precedence over
            ``rules`` in case of naming conflicts.

        """
        decorated_custom_rules = {
            k: (expression(v, k, self) if is_callable(v) else v)
            for k, v in more_rules.items()}

        exprs, first = self._expressions_from_rules(rules, decorated_custom_rules)
        super().__init__(exprs.items())
        self.default_rule = first  # may be None

    def default(self, rule_name):
        """Return a new Grammar whose :term:`default rule` is ``rule_name``."""
        new = self._copy()
        new.default_rule = new[rule_name]
        return new
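
    # A minimal usage sketch for ``default()`` (the grammar here is
    # illustrative, not part of the library):
    #
    #     g = Grammar('''
    #         greeting = "Hi" / "Hello"
    #         title = "madam" / "sir"
    #         ''')
    #     g.parse('Hi')                    # starts from ``greeting``
    #     g.default('title').parse('sir')  # starts from ``title``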

    def _copy(self):
        """Return a shallow copy of myself.

        Deep is unnecessary, since Expression trees are immutable. Subgrammars
        recreate all the Expressions from scratch, and AbstractGrammars have
        no Expressions.

        """
        new = Grammar.__new__(Grammar)
        super(Grammar, new).__init__(self.items())
        new.default_rule = self.default_rule
        return new

    def _expressions_from_rules(self, rules, custom_rules):
        """Return a 2-tuple: a dict of rule names pointing to their
        expressions, and then the first rule.

        It's a web of expressions, all referencing each other. Typically,
        there's a single root to the web of references, and that root is the
        starting symbol for parsing, but there's nothing saying you can't have
        multiple roots.

        :arg custom_rules: A map of rule names to custom-coded rules:
            Expressions

        """
        tree = rule_grammar.parse(rules)
        return RuleVisitor(custom_rules).visit(tree)

    def parse(self, text, pos=0):
        """Parse some text with the :term:`default rule`.

        :arg pos: The index at which to start parsing

        """
        self._check_default_rule()
        return self.default_rule.parse(text, pos=pos)

    def match(self, text, pos=0):
        """Parse some text with the :term:`default rule` but not necessarily
        all the way to the end.

        :arg pos: The index at which to start parsing

        """
        self._check_default_rule()
        return self.default_rule.match(text, pos=pos)
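
    # To illustrate the difference (a sketch, reusing the example grammar
    # from the class docstring): ``parse()`` must consume the whole input,
    # while ``match()`` may stop early:
    #
    #     g['greeting'].parse('Hi')        # OK: consumes all of the text
    #     g['greeting'].match('Hi there')  # OK: matches the leading "Hi"
    #     g['greeting'].parse('Hi there')  # raises IncompleteParseError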

    def _check_default_rule(self):
        """Raise RuntimeError if there is no default rule defined."""
        if not self.default_rule:
            raise RuntimeError("Can't call parse() on a Grammar that has no "
                               "default rule. Choose a specific rule instead, "
                               "like some_grammar['some_rule'].parse(...).")

    def __str__(self):
        """Return a rule string that, when passed to the constructor, would
        reconstitute the grammar."""
        exprs = [self.default_rule] if self.default_rule else []
        exprs.extend(expr for expr in self.values() if
                     expr is not self.default_rule)
        return '\n'.join(expr.as_rule() for expr in exprs)

    def __repr__(self):
        """Return an expression that will reconstitute the grammar."""
        return "Grammar({!r})".format(str(self))


class TokenGrammar(Grammar):
    """A Grammar which takes a list of pre-lexed tokens instead of text

    This is useful if you want to do the lexing yourself, as a separate pass:
    for example, to implement indentation-based languages.

    """
    def _expressions_from_rules(self, rules, custom_rules):
        tree = rule_grammar.parse(rules)
        return TokenRuleVisitor(custom_rules).visit(tree)
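
# A rough usage sketch for TokenGrammar (assuming ``parsimonious.utils.Token``,
# whose ``type`` attribute is what literals are matched against):
#
#     from parsimonious.utils import Token
#     tg = TokenGrammar('tokens = "if" "("*')
#     tg.parse([Token('if'), Token('('), Token('(')])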


class BootstrappingGrammar(Grammar):
    """The grammar used to recognize the textual rules that describe other
    grammars

    This grammar gets its start from some hard-coded Expressions and claws its
    way from there to an expression tree that describes how to parse the
    grammar description syntax.

    """
    def _expressions_from_rules(self, rule_syntax, custom_rules):
        """Return the rules for parsing the grammar definition syntax.

        Return a 2-tuple: a dict of rule names pointing to their expressions,
        and then the top-level expression for the first rule.

        """
        # Hard-code enough of the rules to parse the grammar that describes the
        # grammar description language, to bootstrap:
        comment = Regex(r'#[^\r\n]*', name='comment')
        meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness')
        _ = ZeroOrMore(meaninglessness, name='_')
        equals = Sequence(Literal('='), _, name='equals')
        label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label')
        reference = Sequence(label, Not(equals), name='reference')
        quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier')
        # This pattern supports empty literals. TODO: A problem?
        spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"',
                                  ignore_case=True,
                                  dot_all=True,
                                  name='spaceless_literal')
        literal = Sequence(spaceless_literal, _, name='literal')
        regex = Sequence(Literal('~'),
                         literal,
                         Regex('[ilmsuxa]*', ignore_case=True),
                         _,
                         name='regex')
        atom = OneOf(reference, literal, regex, name='atom')
        quantified = Sequence(atom, quantifier, name='quantified')

        term = OneOf(quantified, atom, name='term')
        not_term = Sequence(Literal('!'), term, _, name='not_term')
        term.members = (not_term,) + term.members

        sequence = Sequence(term, OneOrMore(term), name='sequence')
        or_term = Sequence(Literal('/'), _, term, name='or_term')
        ored = Sequence(term, OneOrMore(or_term), name='ored')
        expression = OneOf(ored, sequence, term, name='expression')
        rule = Sequence(label, equals, expression, name='rule')
        rules = Sequence(_, OneOrMore(rule), name='rules')

        # Use those hard-coded rules to parse the (more extensive) rule syntax.
        # (For example, unless I start using parentheses in the rule language
        # definition itself, I should never have to hard-code expressions for
        # those above.)
        rule_tree = rules.parse(rule_syntax)

        # Turn the parse tree into a map of expressions:
        return RuleVisitor().visit(rule_tree)


# The grammar for parsing PEG grammar definitions:
# This is a nice, simple grammar. We may someday add to it, but it's a safe bet
# that the future will always be a superset of this.
rule_syntax = (r'''
    # Ignored things (represented by _) are typically hung off the end of the
    # leafmost kinds of nodes. Literals like "/" count as leaves.

    rules = _ rule*
    rule = label equals expression
    equals = "=" _
    literal = spaceless_literal _

    # So you can't spell a regex like `~"..." ilm`:
    spaceless_literal = ~"u?r?b?\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is /
                        ~"u?r?b?'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'"is

    expression = ored / sequence / term
    or_term = "/" _ term
    ored = term or_term+
    sequence = term term+
    not_term = "!" term _
    lookahead_term = "&" term _
    term = not_term / lookahead_term / quantified / atom
    quantified = atom quantifier
    atom = reference / literal / regex / parenthesized
    regex = "~" spaceless_literal ~"[ilmsuxa]*"i _
    parenthesized = "(" _ expression ")" _
    quantifier = ~r"[*+?]|\{\d*,\d+\}|\{\d+,\d*\}|\{\d+\}" _
    reference = label !equals

    # A subsequent equal sign is the only thing that distinguishes a label
    # (which begins a new rule) from a reference (which is just a pointer to a
    # rule defined somewhere else):
    label = ~"[a-zA-Z_][a-zA-Z_0-9]*(?![\"'])" _

    # _ = ~r"\s*(?:#[^\r\n]*)?\s*"
    _ = meaninglessness*
    meaninglessness = ~r"\s+" / comment
    comment = ~r"#[^\r\n]*"
    ''')
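
# A few rules the syntax above accepts, for illustration (these are examples,
# not rules used anywhere in this module):
#
#     digits = ~r"[0-9]"+              # regex atom with a + quantifier
#     pair = "(" digits ", " digits ")"
#     maybe_sign = ("+" / "-")?        # parenthesized OneOf, made optional
#     short_word = ~r"[a-z]"{1,3}      # bounded repetition via {min,max}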


class LazyReference(str):
    """A lazy reference to a rule, which we resolve after grokking all the
    rules"""

    name = ''

    def resolve_refs(self, rule_map):
        """
        Traverse the rule map following top-level lazy references,
        until we reach a cycle (raise an error) or a concrete expression.

        For example, the following is a circular reference:
            foo = bar
            bar = foo2
            foo2 = foo

        Note that every RHS of a grammar rule _must_ be either a
        LazyReference or a concrete expression, so the reference chain will
        eventually either terminate or find a cycle.
        """
        seen = set()
        cur = self
        while True:
            if cur in seen:
                raise BadGrammar(f"Circular Reference resolving {self.name}={self}.")
            else:
                seen.add(cur)
            try:
                cur = rule_map[str(cur)]
            except KeyError:
                raise UndefinedLabel(cur)
            if not isinstance(cur, LazyReference):
                return cur

    # Just for debugging:
    def _as_rhs(self):
        return '<LazyReference to %s>' % self


class RuleVisitor(NodeVisitor):
    """Turns a parse tree of a grammar definition into a map of ``Expression``
    objects

    This is the magic piece that breathes life into a parsed bunch of parse
    rules, allowing them to go forth and parse other things.

    """
    quantifier_classes = {'?': Optional, '*': ZeroOrMore, '+': OneOrMore}

    visit_expression = visit_term = visit_atom = NodeVisitor.lift_child

    def __init__(self, custom_rules=None):
        """Construct.

        :arg custom_rules: A dict of {rule name: expression} holding custom
            rules which will take precedence over the others

        """
        self.custom_rules = custom_rules or {}
        self._last_literal_node_and_type = None

    def visit_parenthesized(self, node, parenthesized):
        """Treat a parenthesized subexpression as just its contents.

        Its position in the tree suffices to maintain its grouping semantics.

        """
        left_paren, _, expression, right_paren, _ = parenthesized
        return expression

    def visit_quantifier(self, node, quantifier):
        """Turn a quantifier into just its symbol-matching node."""
        symbol, _ = quantifier
        return symbol

    def visit_quantified(self, node, quantified):
        atom, quantifier = quantified
        try:
            return self.quantifier_classes[quantifier.text](atom)
        except KeyError:
            # This should pass:
            # assert re.fullmatch(r"\{(\d*)(,(\d*))?\}", quantifier.text)
            quantifier = quantifier.text[1:-1].split(",")
            if len(quantifier) == 1:
                min_match = max_match = int(quantifier[0])
            else:
                min_match = int(quantifier[0]) if quantifier[0] else 0
                max_match = int(quantifier[1]) if quantifier[1] else float('inf')
            return Quantifier(atom, min=min_match, max=max_match)
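
    # A sketch of the mapping above, for a hypothetical atom ``x``:
    #
    #     x?      -> Optional(x)
    #     x*      -> ZeroOrMore(x)
    #     x+      -> OneOrMore(x)
    #     x{3}    -> Quantifier(x, min=3, max=3)
    #     x{2,}   -> Quantifier(x, min=2, max=float('inf'))
    #     x{,4}   -> Quantifier(x, min=0, max=4)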

    def visit_lookahead_term(self, node, lookahead_term):
        ampersand, term, _ = lookahead_term
        return Lookahead(term)

    def visit_not_term(self, node, not_term):
        exclamation, term, _ = not_term
        return Not(term)

    def visit_rule(self, node, rule):
        """Assign a name to the Expression and return it."""
        label, equals, expression = rule
        expression.name = label  # Assign a name to the expr.
        return expression

    def visit_sequence(self, node, sequence):
        """A parsed Sequence looks like [term node, OneOrMore node of
        ``another_term``s]. Flatten it out."""
        term, other_terms = sequence
        return Sequence(term, *other_terms)

    def visit_ored(self, node, ored):
        first_term, other_terms = ored
        return OneOf(first_term, *other_terms)

    def visit_or_term(self, node, or_term):
        """Return just the term from an ``or_term``.

        We already know it's going to be ored, from the containing ``ored``.

        """
        slash, _, term = or_term
        return term

    def visit_label(self, node, label):
        """Turn a label into a unicode string."""
        name, _ = label
        return name.text

    def visit_reference(self, node, reference):
        """Stick a :class:`LazyReference` in the tree as a placeholder.

        We resolve them all later.

        """
        label, not_equals = reference
        return LazyReference(label)

    def visit_regex(self, node, regex):
        """Return a ``Regex`` expression."""
        tilde, literal, flags, _ = regex
        flags = flags.text.upper()
        pattern = literal.literal  # Pull the string back out of the Literal
                                   # object.
        return Regex(pattern, ignore_case='I' in flags,
                     locale='L' in flags,
                     multiline='M' in flags,
                     dot_all='S' in flags,
                     unicode='U' in flags,
                     verbose='X' in flags,
                     ascii='A' in flags)

    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``Literal`` that recognizes it."""
        literal_value = evaluate_string(spaceless_literal.text)
        if self._last_literal_node_and_type:
            last_node, last_type = self._last_literal_node_and_type
            if last_type != type(literal_value):
                raise BadGrammar(dedent(f"""\
                    Found {last_node.text} ({last_type}) and {spaceless_literal.text} ({type(literal_value)}) string literals.
                    All strings in a single grammar must be of the same type.
                    """)
                )

        self._last_literal_node_and_type = spaceless_literal, type(literal_value)

        return Literal(literal_value)
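
    # For instance, the type check above is what rejects mixed text/bytes
    # grammars (a sketch):
    #
    #     Grammar('''
    #         a = "x"
    #         b = b"y"
    #     ''')  # raises BadGrammar: str and bytes literals in one grammar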

    def visit_literal(self, node, literal):
        """Pick just the literal out of a literal-and-junk combo."""
        spaceless_literal, _ = literal
        return spaceless_literal

    def generic_visit(self, node, visited_children):
        """Replace childbearing nodes with a list of their children; keep
        others untouched.

        For our case, if a node has children, only the children are important.
        Otherwise, keep the node around for (for example) the flags of the
        regex rule. Most of these kept-around nodes are subsequently thrown
        away by the other visitor methods.

        We can't simply hang the visited children off the original node; that
        would be disastrous if the node occurred in more than one place in the
        tree.

        """
        return visited_children or node  # should semantically be a tuple

    def visit_rules(self, node, rules_list):
        """Collate all the rules into a map. Return (map, default rule).

        The default rule is the first one. Or, if you have more than one rule
        of that name, it's the last-occurring rule of that name. (This lets you
        override the default rule when you extend a grammar.) If there are no
        string-based rules, the default rule is None, because the custom rules,
        due to being kwarg-based, are unordered.

        """
        _, rules = rules_list

        # Map each rule's name to its Expression. Later rules of the same name
        # override earlier ones. This lets us define rules multiple times and
        # have the last declaration win, so you can extend grammars by
        # concatenation.
        rule_map = OrderedDict((expr.name, expr) for expr in rules)

        # And custom rules override string-based rules. This is the least
        # surprising choice when you compare the dict constructor:
        # dict({'x': 5}, x=6).
        rule_map.update(self.custom_rules)

        # Resolve references. This tolerates forward references.
        for name, rule in list(rule_map.items()):
            if hasattr(rule, 'resolve_refs'):
                # Some custom rules may not define a resolve_refs method,
                # though anything that inherits from Expression will have it.
                rule_map[name] = rule.resolve_refs(rule_map)

        # isinstance() is a temporary hack around the fact that * rules don't
        # always get transformed into lists by NodeVisitor. We should fix that;
        # it's surprising and requires writing lame branches like this.
        return rule_map, (rule_map[rules[0].name]
                          if isinstance(rules, list) and rules else None)


class TokenRuleVisitor(RuleVisitor):
    """A visitor which builds expression trees meant to work on sequences of
    pre-lexed tokens rather than strings"""

    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``TokenMatcher`` that matches
        ``Token`` objects by their ``type`` attributes."""
        return TokenMatcher(evaluate_string(spaceless_literal.text))

    def visit_regex(self, node, regex):
        tilde, literal, flags, _ = regex
        raise BadGrammar('Regexes do not make sense in TokenGrammars, since '
                         'TokenGrammars operate on pre-lexed tokens rather '
                         'than characters.')


# Bootstrap to level 1...
rule_grammar = BootstrappingGrammar(rule_syntax)
# ...and then to level 2. This establishes that the node tree of our rule
# syntax is built by the same machinery that will build trees of our users'
# grammars. And the correctness of that tree is tested, indirectly, in
# test_grammar.
rule_grammar = Grammar(rule_syntax)


# TODO: Teach Expression trees how to spit out Python representations of
# themselves. Then we can just paste that in above, and we won't have to
# bootstrap on import. Though it'll be a little less DRY. [Ah, but this is not
# so clean, because it would have to output multiple statements to get multiple
# refs to a single expression hooked up.]