Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/prompt_toolkit/contrib/regular_languages/regex_parser.py: 89%
114 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-11 06:55 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-11 06:55 +0000
1"""
2Parser for parsing a regular expression.
3Take a string representing a regular expression and return the root node of its
4parse tree.
6usage::
8 root_node = parse_regex('(hello|world)')
10Remarks:
11- The regex parser processes multiline, it ignores all whitespace and supports
12 multiple named groups with the same name and #-style comments.
14Limitations:
15- Lookahead is not supported.
16"""
17from __future__ import annotations
19import re
21__all__ = [
22 "Repeat",
23 "Variable",
24 "Regex",
25 "Lookahead",
26 "tokenize_regex",
27 "parse_regex",
28]
31class Node:
32 """
33 Base class for all the grammar nodes.
34 (You don't initialize this one.)
35 """
37 def __add__(self, other_node: Node) -> NodeSequence:
38 return NodeSequence([self, other_node])
40 def __or__(self, other_node: Node) -> AnyNode:
41 return AnyNode([self, other_node])
44class AnyNode(Node):
45 """
46 Union operation (OR operation) between several grammars. You don't
47 initialize this yourself, but it's a result of a "Grammar1 | Grammar2"
48 operation.
49 """
51 def __init__(self, children: list[Node]) -> None:
52 self.children = children
54 def __or__(self, other_node: Node) -> AnyNode:
55 return AnyNode(self.children + [other_node])
57 def __repr__(self) -> str:
58 return f"{self.__class__.__name__}({self.children!r})"
61class NodeSequence(Node):
62 """
63 Concatenation operation of several grammars. You don't initialize this
64 yourself, but it's a result of a "Grammar1 + Grammar2" operation.
65 """
67 def __init__(self, children: list[Node]) -> None:
68 self.children = children
70 def __add__(self, other_node: Node) -> NodeSequence:
71 return NodeSequence(self.children + [other_node])
73 def __repr__(self) -> str:
74 return f"{self.__class__.__name__}({self.children!r})"
77class Regex(Node):
78 """
79 Regular expression.
80 """
82 def __init__(self, regex: str) -> None:
83 re.compile(regex) # Validate
85 self.regex = regex
87 def __repr__(self) -> str:
88 return f"{self.__class__.__name__}(/{self.regex}/)"
91class Lookahead(Node):
92 """
93 Lookahead expression.
94 """
96 def __init__(self, childnode: Node, negative: bool = False) -> None:
97 self.childnode = childnode
98 self.negative = negative
100 def __repr__(self) -> str:
101 return f"{self.__class__.__name__}({self.childnode!r})"
104class Variable(Node):
105 """
106 Mark a variable in the regular grammar. This will be translated into a
107 named group. Each variable can have his own completer, validator, etc..
109 :param childnode: The grammar which is wrapped inside this variable.
110 :param varname: String.
111 """
113 def __init__(self, childnode: Node, varname: str = "") -> None:
114 self.childnode = childnode
115 self.varname = varname
117 def __repr__(self) -> str:
118 return "{}(childnode={!r}, varname={!r})".format(
119 self.__class__.__name__,
120 self.childnode,
121 self.varname,
122 )
125class Repeat(Node):
126 def __init__(
127 self,
128 childnode: Node,
129 min_repeat: int = 0,
130 max_repeat: int | None = None,
131 greedy: bool = True,
132 ) -> None:
133 self.childnode = childnode
134 self.min_repeat = min_repeat
135 self.max_repeat = max_repeat
136 self.greedy = greedy
138 def __repr__(self) -> str:
139 return f"{self.__class__.__name__}(childnode={self.childnode!r})"
142def tokenize_regex(input: str) -> list[str]:
143 """
144 Takes a string, representing a regular expression as input, and tokenizes
145 it.
147 :param input: string, representing a regular expression.
148 :returns: List of tokens.
149 """
150 # Regular expression for tokenizing other regular expressions.
151 p = re.compile(
152 r"""^(
153 \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group.
154 \(\?#[^)]*\) | # Comment
155 \(\?= | # Start of lookahead assertion
156 \(\?! | # Start of negative lookahead assertion
157 \(\?<= | # If preceded by.
158 \(\?< | # If not preceded by.
159 \(?: | # Start of group. (non capturing.)
160 \( | # Start of group.
161 \(?[iLmsux] | # Flags.
162 \(?P=[a-zA-Z]+\) | # Back reference to named group
163 \) | # End of group.
164 \{[^{}]*\} | # Repetition
165 \*\? | \+\? | \?\?\ | # Non greedy repetition.
166 \* | \+ | \? | # Repetition
167 \#.*\n | # Comment
168 \\. |
170 # Character group.
171 \[
172 ( [^\]\\] | \\.)*
173 \] |
175 [^(){}] |
176 .
177 )""",
178 re.VERBOSE,
179 )
181 tokens = []
183 while input:
184 m = p.match(input)
185 if m:
186 token, input = input[: m.end()], input[m.end() :]
187 if not token.isspace():
188 tokens.append(token)
189 else:
190 raise Exception("Could not tokenize input regex.")
192 return tokens
195def parse_regex(regex_tokens: list[str]) -> Node:
196 """
197 Takes a list of tokens from the tokenizer, and returns a parse tree.
198 """
199 # We add a closing brace because that represents the final pop of the stack.
200 tokens: list[str] = [")"] + regex_tokens[::-1]
202 def wrap(lst: list[Node]) -> Node:
203 """Turn list into sequence when it contains several items."""
204 if len(lst) == 1:
205 return lst[0]
206 else:
207 return NodeSequence(lst)
209 def _parse() -> Node:
210 or_list: list[list[Node]] = []
211 result: list[Node] = []
213 def wrapped_result() -> Node:
214 if or_list == []:
215 return wrap(result)
216 else:
217 or_list.append(result)
218 return AnyNode([wrap(i) for i in or_list])
220 while tokens:
221 t = tokens.pop()
223 if t.startswith("(?P<"):
224 variable = Variable(_parse(), varname=t[4:-1])
225 result.append(variable)
227 elif t in ("*", "*?"):
228 greedy = t == "*"
229 result[-1] = Repeat(result[-1], greedy=greedy)
231 elif t in ("+", "+?"):
232 greedy = t == "+"
233 result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)
235 elif t in ("?", "??"):
236 if result == []:
237 raise Exception("Nothing to repeat." + repr(tokens))
238 else:
239 greedy = t == "?"
240 result[-1] = Repeat(
241 result[-1], min_repeat=0, max_repeat=1, greedy=greedy
242 )
244 elif t == "|":
245 or_list.append(result)
246 result = []
248 elif t in ("(", "(?:"):
249 result.append(_parse())
251 elif t == "(?!":
252 result.append(Lookahead(_parse(), negative=True))
254 elif t == "(?=":
255 result.append(Lookahead(_parse(), negative=False))
257 elif t == ")":
258 return wrapped_result()
260 elif t.startswith("#"):
261 pass
263 elif t.startswith("{"):
264 # TODO: implement!
265 raise Exception(f"{t}-style repetition not yet supported")
267 elif t.startswith("(?"):
268 raise Exception("%r not supported" % t)
270 elif t.isspace():
271 pass
272 else:
273 result.append(Regex(t))
275 raise Exception("Expecting ')' token")
277 result = _parse()
279 if len(tokens) != 0:
280 raise Exception("Unmatched parentheses.")
281 else:
282 return result