Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/prompt_toolkit/contrib/regular_languages/compiler.py: 67%
205 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-11 06:55 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-11 06:55 +0000
1r"""
2Compiler for a regular grammar.
4Example usage::
6 # Create and compile grammar.
7 p = compile('add \s+ (?P<var1>[^\s]+) \s+ (?P<var2>[^\s]+)')
9 # Match input string.
10 m = p.match('add 23 432')
12 # Get variables.
13 m.variables().get('var1') # Returns "23"
14 m.variables().get('var2') # Returns "432"
17Partial matches are possible::
19 # Create and compile grammar.
20 p = compile('''
21 # Operators with two arguments.
22 ((?P<operator1>[^\s]+) \s+ (?P<var1>[^\s]+) \s+ (?P<var2>[^\s]+)) |
24 # Operators with only one arguments.
25 ((?P<operator2>[^\s]+) \s+ (?P<var1>[^\s]+))
26 ''')
28 # Match partial input string.
29 m = p.match_prefix('add 23')
31 # Get variables. (Notice that both operator1 and operator2 contain the
32 # value "add".) This is because our input is incomplete, and we don't know
33 # yet in which rule of the regex we we'll end up. It could also be that
34 # `operator1` and `operator2` have a different autocompleter and we want to
35 # call all possible autocompleters that would result in valid input.)
36 m.variables().get('var1') # Returns "23"
37 m.variables().get('operator1') # Returns "add"
38 m.variables().get('operator2') # Returns "add"
40"""
41from __future__ import annotations
43import re
44from typing import Callable, Dict, Iterable, Iterator, Pattern
45from typing import Match as RegexMatch
47from .regex_parser import (
48 AnyNode,
49 Lookahead,
50 Node,
51 NodeSequence,
52 Regex,
53 Repeat,
54 Variable,
55 parse_regex,
56 tokenize_regex,
57)
59__all__ = [
60 "compile",
61]
64# Name of the named group in the regex, matching trailing input.
65# (Trailing input is when the input contains characters after the end of the
66# expression has been matched.)
67_INVALID_TRAILING_INPUT = "invalid_trailing"
69EscapeFuncDict = Dict[str, Callable[[str], str]]
72class _CompiledGrammar:
73 """
74 Compiles a grammar. This will take the parse tree of a regular expression
75 and compile the grammar.
77 :param root_node: :class~`.regex_parser.Node` instance.
78 :param escape_funcs: `dict` mapping variable names to escape callables.
79 :param unescape_funcs: `dict` mapping variable names to unescape callables.
80 """
82 def __init__(
83 self,
84 root_node: Node,
85 escape_funcs: EscapeFuncDict | None = None,
86 unescape_funcs: EscapeFuncDict | None = None,
87 ) -> None:
88 self.root_node = root_node
89 self.escape_funcs = escape_funcs or {}
90 self.unescape_funcs = unescape_funcs or {}
92 #: Dictionary that will map the regex names to Node instances.
93 self._group_names_to_nodes: dict[
94 str, str
95 ] = {} # Maps regex group names to varnames.
96 counter = [0]
98 def create_group_func(node: Variable) -> str:
99 name = "n%s" % counter[0]
100 self._group_names_to_nodes[name] = node.varname
101 counter[0] += 1
102 return name
104 # Compile regex strings.
105 self._re_pattern = "^%s$" % self._transform(root_node, create_group_func)
106 self._re_prefix_patterns = list(
107 self._transform_prefix(root_node, create_group_func)
108 )
110 # Compile the regex itself.
111 flags = re.DOTALL # Note that we don't need re.MULTILINE! (^ and $
112 # still represent the start and end of input text.)
113 self._re = re.compile(self._re_pattern, flags)
114 self._re_prefix = [re.compile(t, flags) for t in self._re_prefix_patterns]
116 # We compile one more set of regexes, similar to `_re_prefix`, but accept any trailing
117 # input. This will ensure that we can still highlight the input correctly, even when the
118 # input contains some additional characters at the end that don't match the grammar.)
119 self._re_prefix_with_trailing_input = [
120 re.compile(
121 r"(?:{})(?P<{}>.*?)$".format(t.rstrip("$"), _INVALID_TRAILING_INPUT),
122 flags,
123 )
124 for t in self._re_prefix_patterns
125 ]
127 def escape(self, varname: str, value: str) -> str:
128 """
129 Escape `value` to fit in the place of this variable into the grammar.
130 """
131 f = self.escape_funcs.get(varname)
132 return f(value) if f else value
134 def unescape(self, varname: str, value: str) -> str:
135 """
136 Unescape `value`.
137 """
138 f = self.unescape_funcs.get(varname)
139 return f(value) if f else value
141 @classmethod
142 def _transform(
143 cls, root_node: Node, create_group_func: Callable[[Variable], str]
144 ) -> str:
145 """
146 Turn a :class:`Node` object into a regular expression.
148 :param root_node: The :class:`Node` instance for which we generate the grammar.
149 :param create_group_func: A callable which takes a `Node` and returns the next
150 free name for this node.
151 """
153 def transform(node: Node) -> str:
154 # Turn `AnyNode` into an OR.
155 if isinstance(node, AnyNode):
156 return "(?:%s)" % "|".join(transform(c) for c in node.children)
158 # Concatenate a `NodeSequence`
159 elif isinstance(node, NodeSequence):
160 return "".join(transform(c) for c in node.children)
162 # For Regex and Lookahead nodes, just insert them literally.
163 elif isinstance(node, Regex):
164 return node.regex
166 elif isinstance(node, Lookahead):
167 before = "(?!" if node.negative else "(="
168 return before + transform(node.childnode) + ")"
170 # A `Variable` wraps the children into a named group.
171 elif isinstance(node, Variable):
172 return f"(?P<{create_group_func(node)}>{transform(node.childnode)})"
174 # `Repeat`.
175 elif isinstance(node, Repeat):
176 if node.max_repeat is None:
177 if node.min_repeat == 0:
178 repeat_sign = "*"
179 elif node.min_repeat == 1:
180 repeat_sign = "+"
181 else:
182 repeat_sign = "{%i,%s}" % (
183 node.min_repeat,
184 ("" if node.max_repeat is None else str(node.max_repeat)),
185 )
187 return "(?:{}){}{}".format(
188 transform(node.childnode),
189 repeat_sign,
190 ("" if node.greedy else "?"),
191 )
192 else:
193 raise TypeError(f"Got {node!r}")
195 return transform(root_node)
197 @classmethod
198 def _transform_prefix(
199 cls, root_node: Node, create_group_func: Callable[[Variable], str]
200 ) -> Iterable[str]:
201 """
202 Yield all the regular expressions matching a prefix of the grammar
203 defined by the `Node` instance.
205 For each `Variable`, one regex pattern will be generated, with this
206 named group at the end. This is required because a regex engine will
207 terminate once a match is found. For autocompletion however, we need
208 the matches for all possible paths, so that we can provide completions
209 for each `Variable`.
211 - So, in the case of an `Any` (`A|B|C)', we generate a pattern for each
212 clause. This is one for `A`, one for `B` and one for `C`. Unless some
213 groups don't contain a `Variable`, then these can be merged together.
214 - In the case of a `NodeSequence` (`ABC`), we generate a pattern for
215 each prefix that ends with a variable, and one pattern for the whole
216 sequence. So, that's one for `A`, one for `AB` and one for `ABC`.
218 :param root_node: The :class:`Node` instance for which we generate the grammar.
219 :param create_group_func: A callable which takes a `Node` and returns the next
220 free name for this node.
221 """
223 def contains_variable(node: Node) -> bool:
224 if isinstance(node, Regex):
225 return False
226 elif isinstance(node, Variable):
227 return True
228 elif isinstance(node, (Lookahead, Repeat)):
229 return contains_variable(node.childnode)
230 elif isinstance(node, (NodeSequence, AnyNode)):
231 return any(contains_variable(child) for child in node.children)
233 return False
235 def transform(node: Node) -> Iterable[str]:
236 # Generate separate pattern for all terms that contain variables
237 # within this OR. Terms that don't contain a variable can be merged
238 # together in one pattern.
239 if isinstance(node, AnyNode):
240 # If we have a definition like:
241 # (?P<name> .*) | (?P<city> .*)
242 # Then we want to be able to generate completions for both the
243 # name as well as the city. We do this by yielding two
244 # different regular expressions, because the engine won't
245 # follow multiple paths, if multiple are possible.
246 children_with_variable = []
247 children_without_variable = []
248 for c in node.children:
249 if contains_variable(c):
250 children_with_variable.append(c)
251 else:
252 children_without_variable.append(c)
254 for c in children_with_variable:
255 yield from transform(c)
257 # Merge options without variable together.
258 if children_without_variable:
259 yield "|".join(
260 r for c in children_without_variable for r in transform(c)
261 )
263 # For a sequence, generate a pattern for each prefix that ends with
264 # a variable + one pattern of the complete sequence.
265 # (This is because, for autocompletion, we match the text before
266 # the cursor, and completions are given for the variable that we
267 # match right before the cursor.)
268 elif isinstance(node, NodeSequence):
269 # For all components in the sequence, compute prefix patterns,
270 # as well as full patterns.
271 complete = [cls._transform(c, create_group_func) for c in node.children]
272 prefixes = [list(transform(c)) for c in node.children]
273 variable_nodes = [contains_variable(c) for c in node.children]
275 # If any child is contains a variable, we should yield a
276 # pattern up to that point, so that we are sure this will be
277 # matched.
278 for i in range(len(node.children)):
279 if variable_nodes[i]:
280 for c_str in prefixes[i]:
281 yield "".join(complete[:i]) + c_str
283 # If there are non-variable nodes, merge all the prefixes into
284 # one pattern. If the input is: "[part1] [part2] [part3]", then
285 # this gets compiled into:
286 # (complete1 + (complete2 + (complete3 | partial3) | partial2) | partial1 )
287 # For nodes that contain a variable, we skip the "|partial"
288 # part here, because thees are matched with the previous
289 # patterns.
290 if not all(variable_nodes):
291 result = []
293 # Start with complete patterns.
294 for i in range(len(node.children)):
295 result.append("(?:")
296 result.append(complete[i])
298 # Add prefix patterns.
299 for i in range(len(node.children) - 1, -1, -1):
300 if variable_nodes[i]:
301 # No need to yield a prefix for this one, we did
302 # the variable prefixes earlier.
303 result.append(")")
304 else:
305 result.append("|(?:")
306 # If this yields multiple, we should yield all combinations.
307 assert len(prefixes[i]) == 1
308 result.append(prefixes[i][0])
309 result.append("))")
311 yield "".join(result)
313 elif isinstance(node, Regex):
314 yield "(?:%s)?" % node.regex
316 elif isinstance(node, Lookahead):
317 if node.negative:
318 yield "(?!%s)" % cls._transform(node.childnode, create_group_func)
319 else:
320 # Not sure what the correct semantics are in this case.
321 # (Probably it's not worth implementing this.)
322 raise Exception("Positive lookahead not yet supported.")
324 elif isinstance(node, Variable):
325 # (Note that we should not append a '?' here. the 'transform'
326 # method will already recursively do that.)
327 for c_str in transform(node.childnode):
328 yield f"(?P<{create_group_func(node)}>{c_str})"
330 elif isinstance(node, Repeat):
331 # If we have a repetition of 8 times. That would mean that the
332 # current input could have for instance 7 times a complete
333 # match, followed by a partial match.
334 prefix = cls._transform(node.childnode, create_group_func)
336 if node.max_repeat == 1:
337 yield from transform(node.childnode)
338 else:
339 for c_str in transform(node.childnode):
340 if node.max_repeat:
341 repeat_sign = "{,%i}" % (node.max_repeat - 1)
342 else:
343 repeat_sign = "*"
344 yield "(?:{}){}{}{}".format(
345 prefix,
346 repeat_sign,
347 ("" if node.greedy else "?"),
348 c_str,
349 )
351 else:
352 raise TypeError("Got %r" % node)
354 for r in transform(root_node):
355 yield "^(?:%s)$" % r
357 def match(self, string: str) -> Match | None:
358 """
359 Match the string with the grammar.
360 Returns a :class:`Match` instance or `None` when the input doesn't match the grammar.
362 :param string: The input string.
363 """
364 m = self._re.match(string)
366 if m:
367 return Match(
368 string, [(self._re, m)], self._group_names_to_nodes, self.unescape_funcs
369 )
370 return None
372 def match_prefix(self, string: str) -> Match | None:
373 """
374 Do a partial match of the string with the grammar. The returned
375 :class:`Match` instance can contain multiple representations of the
376 match. This will never return `None`. If it doesn't match at all, the "trailing input"
377 part will capture all of the input.
379 :param string: The input string.
380 """
381 # First try to match using `_re_prefix`. If nothing is found, use the patterns that
382 # also accept trailing characters.
383 for patterns in [self._re_prefix, self._re_prefix_with_trailing_input]:
384 matches = [(r, r.match(string)) for r in patterns]
385 matches2 = [(r, m) for r, m in matches if m]
387 if matches2 != []:
388 return Match(
389 string, matches2, self._group_names_to_nodes, self.unescape_funcs
390 )
392 return None
395class Match:
396 """
397 :param string: The input string.
398 :param re_matches: List of (compiled_re_pattern, re_match) tuples.
399 :param group_names_to_nodes: Dictionary mapping all the re group names to the matching Node instances.
400 """
402 def __init__(
403 self,
404 string: str,
405 re_matches: list[tuple[Pattern[str], RegexMatch[str]]],
406 group_names_to_nodes: dict[str, str],
407 unescape_funcs: dict[str, Callable[[str], str]],
408 ):
409 self.string = string
410 self._re_matches = re_matches
411 self._group_names_to_nodes = group_names_to_nodes
412 self._unescape_funcs = unescape_funcs
414 def _nodes_to_regs(self) -> list[tuple[str, tuple[int, int]]]:
415 """
416 Return a list of (varname, reg) tuples.
417 """
419 def get_tuples() -> Iterable[tuple[str, tuple[int, int]]]:
420 for r, re_match in self._re_matches:
421 for group_name, group_index in r.groupindex.items():
422 if group_name != _INVALID_TRAILING_INPUT:
423 regs = re_match.regs
424 reg = regs[group_index]
425 node = self._group_names_to_nodes[group_name]
426 yield (node, reg)
428 return list(get_tuples())
430 def _nodes_to_values(self) -> list[tuple[str, str, tuple[int, int]]]:
431 """
432 Returns list of (Node, string_value) tuples.
433 """
435 def is_none(sl: tuple[int, int]) -> bool:
436 return sl[0] == -1 and sl[1] == -1
438 def get(sl: tuple[int, int]) -> str:
439 return self.string[sl[0] : sl[1]]
441 return [
442 (varname, get(slice), slice)
443 for varname, slice in self._nodes_to_regs()
444 if not is_none(slice)
445 ]
447 def _unescape(self, varname: str, value: str) -> str:
448 unwrapper = self._unescape_funcs.get(varname)
449 return unwrapper(value) if unwrapper else value
451 def variables(self) -> Variables:
452 """
453 Returns :class:`Variables` instance.
454 """
455 return Variables(
456 [(k, self._unescape(k, v), sl) for k, v, sl in self._nodes_to_values()]
457 )
459 def trailing_input(self) -> MatchVariable | None:
460 """
461 Get the `MatchVariable` instance, representing trailing input, if there is any.
462 "Trailing input" is input at the end that does not match the grammar anymore, but
463 when this is removed from the end of the input, the input would be a valid string.
464 """
465 slices: list[tuple[int, int]] = []
467 # Find all regex group for the name _INVALID_TRAILING_INPUT.
468 for r, re_match in self._re_matches:
469 for group_name, group_index in r.groupindex.items():
470 if group_name == _INVALID_TRAILING_INPUT:
471 slices.append(re_match.regs[group_index])
473 # Take the smallest part. (Smaller trailing text means that a larger input has
474 # been matched, so that is better.)
475 if slices:
476 slice = (max(i[0] for i in slices), max(i[1] for i in slices))
477 value = self.string[slice[0] : slice[1]]
478 return MatchVariable("<trailing_input>", value, slice)
479 return None
481 def end_nodes(self) -> Iterable[MatchVariable]:
482 """
483 Yields `MatchVariable` instances for all the nodes having their end
484 position at the end of the input string.
485 """
486 for varname, reg in self._nodes_to_regs():
487 # If this part goes until the end of the input string.
488 if reg[1] == len(self.string):
489 value = self._unescape(varname, self.string[reg[0] : reg[1]])
490 yield MatchVariable(varname, value, (reg[0], reg[1]))
493class Variables:
494 def __init__(self, tuples: list[tuple[str, str, tuple[int, int]]]) -> None:
495 #: List of (varname, value, slice) tuples.
496 self._tuples = tuples
498 def __repr__(self) -> str:
499 return "{}({})".format(
500 self.__class__.__name__,
501 ", ".join(f"{k}={v!r}" for k, v, _ in self._tuples),
502 )
504 def get(self, key: str, default: str | None = None) -> str | None:
505 items = self.getall(key)
506 return items[0] if items else default
508 def getall(self, key: str) -> list[str]:
509 return [v for k, v, _ in self._tuples if k == key]
511 def __getitem__(self, key: str) -> str | None:
512 return self.get(key)
514 def __iter__(self) -> Iterator[MatchVariable]:
515 """
516 Yield `MatchVariable` instances.
517 """
518 for varname, value, slice in self._tuples:
519 yield MatchVariable(varname, value, slice)
522class MatchVariable:
523 """
524 Represents a match of a variable in the grammar.
526 :param varname: (string) Name of the variable.
527 :param value: (string) Value of this variable.
528 :param slice: (start, stop) tuple, indicating the position of this variable
529 in the input string.
530 """
532 def __init__(self, varname: str, value: str, slice: tuple[int, int]) -> None:
533 self.varname = varname
534 self.value = value
535 self.slice = slice
537 self.start = self.slice[0]
538 self.stop = self.slice[1]
540 def __repr__(self) -> str:
541 return f"{self.__class__.__name__}({self.varname!r}, {self.value!r})"
544def compile(
545 expression: str,
546 escape_funcs: EscapeFuncDict | None = None,
547 unescape_funcs: EscapeFuncDict | None = None,
548) -> _CompiledGrammar:
549 """
550 Compile grammar (given as regex string), returning a `CompiledGrammar`
551 instance.
552 """
553 return _compile_from_parse_tree(
554 parse_regex(tokenize_regex(expression)),
555 escape_funcs=escape_funcs,
556 unescape_funcs=unescape_funcs,
557 )
560def _compile_from_parse_tree(
561 root_node: Node,
562 escape_funcs: EscapeFuncDict | None = None,
563 unescape_funcs: EscapeFuncDict | None = None,
564) -> _CompiledGrammar:
565 """
566 Compile grammar (given as parse tree), returning a `CompiledGrammar`
567 instance.
568 """
569 return _CompiledGrammar(
570 root_node, escape_funcs=escape_funcs, unescape_funcs=unescape_funcs
571 )