1"""Subexpressions that make up a parsed grammar
2
3These do the parsing.
4
5"""
6# TODO: Make sure all symbol refs are local--not class lookups or
7# anything--for speed. And kill all the dots.
8
9from collections import defaultdict
10from inspect import getfullargspec, isfunction, ismethod, ismethoddescriptor
11try:
12 import regex as re
13except ImportError:
14 import re # Fallback as per https://github.com/erikrose/parsimonious/issues/231
15
16from parsimonious.exceptions import ParseError, IncompleteParseError, LeftRecursionError
17from parsimonious.nodes import Node, RegexNode
18from parsimonious.utils import StrAndRepr
19
20
21def is_callable(value):
22 criteria = [isfunction, ismethod, ismethoddescriptor]
23 return any([criterion(value) for criterion in criteria])
24
25
26def expression(callable, rule_name, grammar):
27 """Turn a plain callable into an Expression.
28
29 The callable can be of this simple form::
30
31 def foo(text, pos):
32 '''If this custom expression matches starting at text[pos], return
33 the index where it stops matching. Otherwise, return None.'''
34 if the expression matched:
35 return end_pos
36
37 If there child nodes to return, return a tuple::
38
39 return end_pos, children
40
41 If the expression doesn't match at the given ``pos`` at all... ::
42
43 return None
44
45 If your callable needs to make sub-calls to other rules in the grammar or
46 do error reporting, it can take this form, gaining additional arguments::
47
48 def foo(text, pos, cache, error, grammar):
49 # Call out to other rules:
50 node = grammar['another_rule'].match_core(text, pos, cache, error)
51 ...
52 # Return values as above.
53
54 The return value of the callable, if an int or a tuple, will be
55 automatically transmuted into a :class:`~parsimonious.Node`. If it returns
56 a Node-like class directly, it will be passed through unchanged.
57
58 :arg rule_name: The rule name to attach to the resulting
59 :class:`~parsimonious.Expression`
60 :arg grammar: The :class:`~parsimonious.Grammar` this expression will be a
61 part of, to make delegating to other rules possible
62
63 """
64
65 # Resolve unbound methods; allows grammars to use @staticmethod custom rules
66 # https://stackoverflow.com/questions/41921255/staticmethod-object-is-not-callable
67 if ismethoddescriptor(callable) and hasattr(callable, '__func__'):
68 callable = callable.__func__
69
70 num_args = len(getfullargspec(callable).args)
71 if ismethod(callable):
72 # do not count the first argument (typically 'self') for methods
73 num_args -= 1
74 if num_args == 2:
75 is_simple = True
76 elif num_args == 5:
77 is_simple = False
78 else:
79 raise RuntimeError("Custom rule functions must take either 2 or 5 "
80 "arguments, not %s." % num_args)
81
82 class AdHocExpression(Expression):
83 def _uncached_match(self, text, pos, cache, error):
84 result = (callable(text, pos) if is_simple else
85 callable(text, pos, cache, error, grammar))
86
87 if isinstance(result, int):
88 end, children = result, None
89 elif isinstance(result, tuple):
90 end, children = result
91 else:
92 # Node or None
93 return result
94 return Node(self, text, pos, end, children=children)
95
96 def _as_rhs(self):
97 return '{custom function "%s"}' % callable.__name__
98
99 return AdHocExpression(name=rule_name)
100
101
102IN_PROGRESS = object()
103
104
105class Expression(StrAndRepr):
106 """A thing that can be matched against a piece of text"""
107
108 # Slots are about twice as fast as __dict__-based attributes:
109 # http://stackoverflow.com/questions/1336791/dictionary-vs-object-which-is-more-efficient-and-why
110
111 # Top-level expressions--rules--have names. Subexpressions are named ''.
112 __slots__ = ['name', 'identity_tuple']
113
114 def __init__(self, name=''):
115 self.name = name
116 self.identity_tuple = (self.name, )
117
118 def __hash__(self):
119 return hash(self.identity_tuple)
120
121 def __eq__(self, other):
122 return self._eq_check_cycles(other, set())
123
124 def __ne__(self, other):
125 return not (self == other)
126
127 def _eq_check_cycles(self, other, checked):
128 # keep a set of all pairs that are already checked, so we won't fall into infinite recursions.
129 checked.add((id(self), id(other)))
130 return other.__class__ is self.__class__ and self.identity_tuple == other.identity_tuple
131
132 def resolve_refs(self, rule_map):
133 # Nothing to do on the base expression.
134 return self
135
136 def parse(self, text, pos=0):
137 """Return a parse tree of ``text``.
138
139 Raise ``ParseError`` if the expression wasn't satisfied. Raise
140 ``IncompleteParseError`` if the expression was satisfied but didn't
141 consume the full string.
142
143 """
144 node = self.match(text, pos=pos)
145 if node.end < len(text):
146 raise IncompleteParseError(text, node.end, self)
147 return node
148
149 def match(self, text, pos=0):
150 """Return the parse tree matching this expression at the given
151 position, not necessarily extending all the way to the end of ``text``.
152
153 Raise ``ParseError`` if there is no match there.
154
155 :arg pos: The index at which to start matching
156
157 """
158 error = ParseError(text)
159 node = self.match_core(text, pos, defaultdict(dict), error)
160 if node is None:
161 raise error
162 return node
163
164 def match_core(self, text, pos, cache, error):
165 """Internal guts of ``match()``
166
167 This is appropriate to call only from custom rules or Expression
168 subclasses.
169
170 :arg cache: The packrat cache::
171
172 {(oid, pos): Node tree matched by object `oid` at index `pos` ...}
173
174 :arg error: A ParseError instance with ``text`` already filled in but
175 otherwise blank. We update the error reporting info on this object
176 as we go. (Sticking references on an existing instance is faster
177 than allocating a new one for each expression that fails.) We
178 return None rather than raising and catching ParseErrors because
179 catching is slow.
180
181 """
182 # TODO: Optimize. Probably a hot spot.
183 #
184 # Is there a faster way of looking up cached stuff?
185 #
186 # If this is slow, think about the array module. It might (or might
187 # not!) use more RAM, but it'll likely be faster than hashing things
188 # all the time. Also, can we move all the allocs up front?
189 #
190 # To save space, we have lots of choices: (0) Quit caching whole Node
191 # objects. Cache just what you need to reconstitute them. (1) Cache
192 # only the results of entire rules, not subexpressions (probably a
193 # horrible idea for rules that need to backtrack internally a lot). (2)
194 # Age stuff out of the cache somehow. LRU? (3) Cuts.
195 expr_cache = cache[id(self)]
196 if pos in expr_cache:
197 node = expr_cache[pos]
198 else:
199 # TODO: Set default value to prevent infinite recursion in left-recursive rules.
200 expr_cache[pos] = IN_PROGRESS # Mark as in progress
201 node = expr_cache[pos] = self._uncached_match(text, pos, cache, error)
202 if node is IN_PROGRESS:
203 raise LeftRecursionError(text, pos=-1, expr=self)
204
205 # Record progress for error reporting:
206 if node is None and pos >= error.pos and (
207 self.name or getattr(error.expr, 'name', None) is None):
208 # Don't bother reporting on unnamed expressions (unless that's all
209 # we've seen so far), as they're hard to track down for a human.
210 # Perhaps we could include the unnamed subexpressions later as
211 # auxiliary info.
212 error.expr = self
213 error.pos = pos
214
215 return node
216
217 def __str__(self):
218 return '<%s %s>' % (
219 self.__class__.__name__,
220 self.as_rule())
221
222 def as_rule(self):
223 """Return the left- and right-hand sides of a rule that represents me.
224
225 Return unicode. If I have no ``name``, omit the left-hand side.
226
227 """
228 rhs = self._as_rhs().strip()
229 if rhs.startswith('(') and rhs.endswith(')'):
230 rhs = rhs[1:-1]
231
232 return ('%s = %s' % (self.name, rhs)) if self.name else rhs
233
234 def _unicode_members(self):
235 """Return an iterable of my unicode-represented children, stopping
236 descent when we hit a named node so the returned value resembles the
237 input rule."""
238 return [(m.name or m._as_rhs()) for m in self.members]
239
240 def _as_rhs(self):
241 """Return the right-hand side of a rule that represents me.
242
243 Implemented by subclasses.
244
245 """
246 raise NotImplementedError
247
248
249class Literal(Expression):
250 """A string literal
251
252 Use these if you can; they're the fastest.
253
254 """
255 __slots__ = ['literal']
256
257 def __init__(self, literal, name=''):
258 super().__init__(name)
259 self.literal = literal
260 self.identity_tuple = (name, literal)
261
262 def _uncached_match(self, text, pos, cache, error):
263 if text.startswith(self.literal, pos):
264 return Node(self, text, pos, pos + len(self.literal))
265
266 def _as_rhs(self):
267 return repr(self.literal)
268
269
270class TokenMatcher(Literal):
271 """An expression matching a single token of a given type
272
273 This is for use only with TokenGrammars.
274
275 """
276 def _uncached_match(self, token_list, pos, cache, error):
277 if token_list[pos].type == self.literal:
278 return Node(self, token_list, pos, pos + 1)
279
280
281class Regex(Expression):
282 """An expression that matches what a regex does.
283
284 Use these as much as you can and jam as much into each one as you can;
285 they're fast.
286
287 """
288 __slots__ = ['re']
289
290 def __init__(self, pattern, name='', ignore_case=False, locale=False,
291 multiline=False, dot_all=False, unicode=False, verbose=False, ascii=False):
292 super().__init__(name)
293 self.re = re.compile(pattern, (ignore_case and re.I) |
294 (locale and re.L) |
295 (multiline and re.M) |
296 (dot_all and re.S) |
297 (unicode and re.U) |
298 (verbose and re.X) |
299 (ascii and re.A))
300 self.identity_tuple = (self.name, self.re)
301
302 def _uncached_match(self, text, pos, cache, error):
303 """Return length of match, ``None`` if no match."""
304 m = self.re.match(text, pos)
305 if m is not None:
306 span = m.span()
307 node = RegexNode(self, text, pos, pos + span[1] - span[0])
308 node.match = m # TODO: A terrible idea for cache size?
309 return node
310
311 def _regex_flags_from_bits(self, bits):
312 """Return the textual equivalent of numerically encoded regex flags."""
313 flags = 'ilmsuxa'
314 return ''.join(flags[i - 1] if (1 << i) & bits else '' for i in range(1, len(flags) + 1))
315
316 def _as_rhs(self):
317 return '~{!r}{}'.format(self.re.pattern,
318 self._regex_flags_from_bits(self.re.flags))
319
320
321class Compound(Expression):
322 """An abstract expression which contains other expressions"""
323
324 __slots__ = ['members']
325
326 def __init__(self, *members, **kwargs):
327 """``members`` is a sequence of expressions."""
328 super().__init__(kwargs.get('name', ''))
329 self.members = members
330
331 def resolve_refs(self, rule_map):
332 self.members = tuple(m.resolve_refs(rule_map) for m in self.members)
333 return self
334
335 def _eq_check_cycles(self, other, checked):
336 return (
337 super()._eq_check_cycles(other, checked) and
338 len(self.members) == len(other.members) and
339 all(m._eq_check_cycles(mo, checked) for m, mo in zip(self.members, other.members) if (id(m), id(mo)) not in checked)
340 )
341
342 def __hash__(self):
343 # Note we leave members out of the hash computation, since compounds can get added to
344 # sets, then have their members mutated. See RuleVisitor._resolve_refs.
345 # Equality should still work, but we want the rules to go into the correct hash bucket.
346 return hash((self.__class__, self.name))
347
348
349class Sequence(Compound):
350 """A series of expressions that must match contiguous, ordered pieces of
351 the text
352
353 In other words, it's a concatenation operator: each piece has to match, one
354 after another.
355
356 """
357 def _uncached_match(self, text, pos, cache, error):
358 new_pos = pos
359 children = []
360 for m in self.members:
361 node = m.match_core(text, new_pos, cache, error)
362 if node is None:
363 return None
364 children.append(node)
365 length = node.end - node.start
366 new_pos += length
367 # Hooray! We got through all the members!
368 return Node(self, text, pos, new_pos, children)
369
370 def _as_rhs(self):
371 return '({0})'.format(' '.join(self._unicode_members()))
372
373
374class OneOf(Compound):
375 """A series of expressions, one of which must match
376
377 Expressions are tested in order from first to last. The first to succeed
378 wins.
379
380 """
381 def _uncached_match(self, text, pos, cache, error):
382 for m in self.members:
383 node = m.match_core(text, pos, cache, error)
384 if node is not None:
385 # Wrap the succeeding child in a node representing the OneOf:
386 return Node(self, text, pos, node.end, children=[node])
387
388 def _as_rhs(self):
389 return '({0})'.format(' / '.join(self._unicode_members()))
390
391
392class Lookahead(Compound):
393 """An expression which consumes nothing, even if its contained expression
394 succeeds"""
395
396 __slots__ = ['negativity']
397
398 def __init__(self, member, *, negative=False, **kwargs):
399 super().__init__(member, **kwargs)
400 self.negativity = bool(negative)
401
402 def _uncached_match(self, text, pos, cache, error):
403 node = self.members[0].match_core(text, pos, cache, error)
404 if (node is None) == self.negativity: # negative lookahead == match only if not found
405 return Node(self, text, pos, pos)
406
407 def _as_rhs(self):
408 return '%s%s' % ('!' if self.negativity else '&', self._unicode_members()[0])
409
410 def _eq_check_cycles(self, other, checked):
411 return (
412 super()._eq_check_cycles(other, checked) and
413 self.negativity == other.negativity
414 )
415
416def Not(term):
417 return Lookahead(term, negative=True)
418
419# Quantifiers. None of these is strictly necessary, but they're darn handy.
420
421class Quantifier(Compound):
422 """An expression wrapper like the */+/?/{n,m} quantifier in regexes."""
423
424 __slots__ = ['min', 'max']
425
426 def __init__(self, member, *, min=0, max=float('inf'), name='', **kwargs):
427 super().__init__(member, name=name, **kwargs)
428 self.min = min
429 self.max = max
430
431 def _uncached_match(self, text, pos, cache, error):
432 new_pos = pos
433 children = []
434 size = len(text)
435 while new_pos < size and len(children) < self.max:
436 node = self.members[0].match_core(text, new_pos, cache, error)
437 if node is None:
438 break # no more matches
439 children.append(node)
440 length = node.end - node.start
441 if len(children) >= self.min and length == 0: # Don't loop infinitely
442 break
443 new_pos += length
444 if len(children) >= self.min:
445 return Node(self, text, pos, new_pos, children)
446
447 def _as_rhs(self):
448 if self.min == 0 and self.max == 1:
449 qualifier = '?'
450 elif self.min == 0 and self.max == float('inf'):
451 qualifier = '*'
452 elif self.min == 1 and self.max == float('inf'):
453 qualifier = '+'
454 elif self.max == float('inf'):
455 qualifier = '{%d,}' % self.min
456 elif self.min == 0:
457 qualifier = '{,%d}' % self.max
458 else:
459 qualifier = '{%d,%d}' % (self.min, self.max)
460 return '%s%s' % (self._unicode_members()[0], qualifier)
461
462 def _eq_check_cycles(self, other, checked):
463 return (
464 super()._eq_check_cycles(other, checked) and
465 self.min == other.min and
466 self.max == other.max
467 )
468
469def ZeroOrMore(member, name=''):
470 return Quantifier(member, name=name, min=0, max=float('inf'))
471
472def OneOrMore(member, name='', min=1):
473 return Quantifier(member, name=name, min=min, max=float('inf'))
474
475def Optional(member, name=''):
476 return Quantifier(member, name=name, min=0, max=1)