Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/prompt_toolkit/contrib/regular_languages/regex

1"""

2Parser for parsing a regular expression.

3Take a string representing a regular expression and return the root node of its

4parse tree.

6usage::

8 root_node = parse_regex('(hello|world)')

10Remarks:

11- The regex parser processes multiline, it ignores all whitespace and supports

12 multiple named groups with the same name and #-style comments.

14Limitations:

15- Lookahead is not supported.

16"""

17from __future__ import annotations

19import re

21__all__ = [

22 "Repeat",

23 "Variable",

24 "Regex",

25 "Lookahead",

26 "tokenize_regex",

27 "parse_regex",

28]

31class Node:

32 """

33 Base class for all the grammar nodes.

34 (You don't initialize this one.)

35 """

37 def __add__(self, other_node: Node) -> NodeSequence:

38 return NodeSequence([self, other_node])

40 def __or__(self, other_node: Node) -> AnyNode:

41 return AnyNode([self, other_node])

44class AnyNode(Node):

45 """

46 Union operation (OR operation) between several grammars. You don't

47 initialize this yourself, but it's a result of a "Grammar1 | Grammar2"

48 operation.

49 """

51 def __init__(self, children: list[Node]) -> None:

52 self.children = children

54 def __or__(self, other_node: Node) -> AnyNode:

55 return AnyNode(self.children + [other_node])

57 def __repr__(self) -> str:

58 return f"{self.__class__.__name__}({self.children!r})"

61class NodeSequence(Node):

62 """

63 Concatenation operation of several grammars. You don't initialize this

64 yourself, but it's a result of a "Grammar1 + Grammar2" operation.

65 """

67 def __init__(self, children: list[Node]) -> None:

68 self.children = children

70 def __add__(self, other_node: Node) -> NodeSequence:

71 return NodeSequence(self.children + [other_node])

73 def __repr__(self) -> str:

74 return f"{self.__class__.__name__}({self.children!r})"

77class Regex(Node):

78 """

79 Regular expression.

80 """

82 def __init__(self, regex: str) -> None:

83 re.compile(regex) # Validate

85 self.regex = regex

87 def __repr__(self) -> str:

88 return f"{self.__class__.__name__}(/{self.regex}/)"

91class Lookahead(Node):

92 """

93 Lookahead expression.

94 """

96 def __init__(self, childnode: Node, negative: bool = False) -> None:

97 self.childnode = childnode

98 self.negative = negative

100 def __repr__(self) -> str:

101 return f"{self.__class__.__name__}({self.childnode!r})"

102

103

104class Variable(Node):

105 """

106 Mark a variable in the regular grammar. This will be translated into a

107 named group. Each variable can have his own completer, validator, etc..

108

109 :param childnode: The grammar which is wrapped inside this variable.

110 :param varname: String.

111 """

112

113 def __init__(self, childnode: Node, varname: str = "") -> None:

114 self.childnode = childnode

115 self.varname = varname

116

117 def __repr__(self) -> str:

118 return "{}(childnode={!r}, varname={!r})".format(

119 self.__class__.__name__,

120 self.childnode,

121 self.varname,

122 )

123

124

125class Repeat(Node):

126 def __init__(

127 self,

128 childnode: Node,

129 min_repeat: int = 0,

130 max_repeat: int | None = None,

131 greedy: bool = True,

132 ) -> None:

133 self.childnode = childnode

134 self.min_repeat = min_repeat

135 self.max_repeat = max_repeat

136 self.greedy = greedy

137

138 def __repr__(self) -> str:

139 return f"{self.__class__.__name__}(childnode={self.childnode!r})"

140

141

142def tokenize_regex(input: str) -> list[str]:

143 """

144 Takes a string, representing a regular expression as input, and tokenizes

145 it.

146

147 :param input: string, representing a regular expression.

148 :returns: List of tokens.

149 """

150 # Regular expression for tokenizing other regular expressions.

151 p = re.compile(

152 r"""^(

153 \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group.

154 \(\?#[^)]*\) | # Comment

155 \(\?= | # Start of lookahead assertion

156 \(\?! | # Start of negative lookahead assertion

157 \(\?<= | # If preceded by.

158 \(\?< | # If not preceded by.

159 \(?: | # Start of group. (non capturing.)

160 \( | # Start of group.

161 \(?[iLmsux] | # Flags.

162 \(?P=[a-zA-Z]+\) | # Back reference to named group

163 \) | # End of group.

164 \{[^{}]*\} | # Repetition

165 \*\? | \+\? | \?\?\ | # Non greedy repetition.

166 \* | \+ | \? | # Repetition

167 \#.*\n | # Comment

168 \\. |

169

170 # Character group.

171 \[

172 ( [^\]\\] | \\.)*

173 \] |

174

175 [^(){}] |

176 .

177 )""",

178 re.VERBOSE,

179 )

180

181 tokens = []

182

183 while input:

184 m = p.match(input)

185 if m:

186 token, input = input[: m.end()], input[m.end() :]

187 if not token.isspace():

188 tokens.append(token)

189 else:

190 raise Exception("Could not tokenize input regex.")

191

192 return tokens

193

194

195def parse_regex(regex_tokens: list[str]) -> Node:

196 """

197 Takes a list of tokens from the tokenizer, and returns a parse tree.

198 """

199 # We add a closing brace because that represents the final pop of the stack.

200 tokens: list[str] = [")"] + regex_tokens[::-1]

201

202 def wrap(lst: list[Node]) -> Node:

203 """Turn list into sequence when it contains several items."""

204 if len(lst) == 1:

205 return lst[0]

206 else:

207 return NodeSequence(lst)

208

209 def _parse() -> Node:

210 or_list: list[list[Node]] = []

211 result: list[Node] = []

212

213 def wrapped_result() -> Node:

214 if or_list == []:

215 return wrap(result)

216 else:

217 or_list.append(result)

218 return AnyNode([wrap(i) for i in or_list])

219

220 while tokens:

221 t = tokens.pop()

222

223 if t.startswith("(?P<"):

224 variable = Variable(_parse(), varname=t[4:-1])

225 result.append(variable)

226

227 elif t in ("*", "*?"):

228 greedy = t == "*"

229 result[-1] = Repeat(result[-1], greedy=greedy)

230

231 elif t in ("+", "+?"):

232 greedy = t == "+"

233 result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)

234

235 elif t in ("?", "??"):

236 if result == []:

237 raise Exception("Nothing to repeat." + repr(tokens))

238 else:

239 greedy = t == "?"

240 result[-1] = Repeat(

241 result[-1], min_repeat=0, max_repeat=1, greedy=greedy

242 )

243

244 elif t == "|":

245 or_list.append(result)

246 result = []

247

248 elif t in ("(", "(?:"):

249 result.append(_parse())

250

251 elif t == "(?!":

252 result.append(Lookahead(_parse(), negative=True))

253

254 elif t == "(?=":

255 result.append(Lookahead(_parse(), negative=False))

256

257 elif t == ")":

258 return wrapped_result()

259

260 elif t.startswith("#"):

261 pass

262

263 elif t.startswith("{"):

264 # TODO: implement!

265 raise Exception(f"{t}-style repetition not yet supported")

266

267 elif t.startswith("(?"):

268 raise Exception("%r not supported" % t)

269

270 elif t.isspace():

271 pass

272 else:

273 result.append(Regex(t))

274

275 raise Exception("Expecting ')' token")

276

277 result = _parse()

278

279 if len(tokens) != 0:

280 raise Exception("Unmatched parentheses.")

281 else:

282 return result

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/prompt_toolkit/contrib/regular_languages/regex_parser.py: 89%

114 statements