Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/prompt_toolkit/contrib/regular_languages/regex_parser.py: 89%

114 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-11 06:55 +0000

1""" 

2Parser for parsing a regular expression. 

3Take a string representing a regular expression and return the root node of its 

4parse tree. 

5 

6usage:: 

7 

8 root_node = parse_regex('(hello|world)') 

9 

10Remarks: 

11- The regex parser processes multiline, it ignores all whitespace and supports 

12 multiple named groups with the same name and #-style comments. 

13 

14Limitations: 

15- Lookahead is not supported. 

16""" 

17from __future__ import annotations 

18 

19import re 

20 

21__all__ = [ 

22 "Repeat", 

23 "Variable", 

24 "Regex", 

25 "Lookahead", 

26 "tokenize_regex", 

27 "parse_regex", 

28] 

29 

30 

31class Node: 

32 """ 

33 Base class for all the grammar nodes. 

34 (You don't initialize this one.) 

35 """ 

36 

37 def __add__(self, other_node: Node) -> NodeSequence: 

38 return NodeSequence([self, other_node]) 

39 

40 def __or__(self, other_node: Node) -> AnyNode: 

41 return AnyNode([self, other_node]) 

42 

43 

44class AnyNode(Node): 

45 """ 

46 Union operation (OR operation) between several grammars. You don't 

47 initialize this yourself, but it's a result of a "Grammar1 | Grammar2" 

48 operation. 

49 """ 

50 

51 def __init__(self, children: list[Node]) -> None: 

52 self.children = children 

53 

54 def __or__(self, other_node: Node) -> AnyNode: 

55 return AnyNode(self.children + [other_node]) 

56 

57 def __repr__(self) -> str: 

58 return f"{self.__class__.__name__}({self.children!r})" 

59 

60 

61class NodeSequence(Node): 

62 """ 

63 Concatenation operation of several grammars. You don't initialize this 

64 yourself, but it's a result of a "Grammar1 + Grammar2" operation. 

65 """ 

66 

67 def __init__(self, children: list[Node]) -> None: 

68 self.children = children 

69 

70 def __add__(self, other_node: Node) -> NodeSequence: 

71 return NodeSequence(self.children + [other_node]) 

72 

73 def __repr__(self) -> str: 

74 return f"{self.__class__.__name__}({self.children!r})" 

75 

76 

77class Regex(Node): 

78 """ 

79 Regular expression. 

80 """ 

81 

82 def __init__(self, regex: str) -> None: 

83 re.compile(regex) # Validate 

84 

85 self.regex = regex 

86 

87 def __repr__(self) -> str: 

88 return f"{self.__class__.__name__}(/{self.regex}/)" 

89 

90 

91class Lookahead(Node): 

92 """ 

93 Lookahead expression. 

94 """ 

95 

96 def __init__(self, childnode: Node, negative: bool = False) -> None: 

97 self.childnode = childnode 

98 self.negative = negative 

99 

100 def __repr__(self) -> str: 

101 return f"{self.__class__.__name__}({self.childnode!r})" 

102 

103 

104class Variable(Node): 

105 """ 

106 Mark a variable in the regular grammar. This will be translated into a 

107 named group. Each variable can have his own completer, validator, etc.. 

108 

109 :param childnode: The grammar which is wrapped inside this variable. 

110 :param varname: String. 

111 """ 

112 

113 def __init__(self, childnode: Node, varname: str = "") -> None: 

114 self.childnode = childnode 

115 self.varname = varname 

116 

117 def __repr__(self) -> str: 

118 return "{}(childnode={!r}, varname={!r})".format( 

119 self.__class__.__name__, 

120 self.childnode, 

121 self.varname, 

122 ) 

123 

124 

125class Repeat(Node): 

126 def __init__( 

127 self, 

128 childnode: Node, 

129 min_repeat: int = 0, 

130 max_repeat: int | None = None, 

131 greedy: bool = True, 

132 ) -> None: 

133 self.childnode = childnode 

134 self.min_repeat = min_repeat 

135 self.max_repeat = max_repeat 

136 self.greedy = greedy 

137 

138 def __repr__(self) -> str: 

139 return f"{self.__class__.__name__}(childnode={self.childnode!r})" 

140 

141 

142def tokenize_regex(input: str) -> list[str]: 

143 """ 

144 Takes a string, representing a regular expression as input, and tokenizes 

145 it. 

146 

147 :param input: string, representing a regular expression. 

148 :returns: List of tokens. 

149 """ 

150 # Regular expression for tokenizing other regular expressions. 

151 p = re.compile( 

152 r"""^( 

153 \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group. 

154 \(\?#[^)]*\) | # Comment 

155 \(\?= | # Start of lookahead assertion 

156 \(\?! | # Start of negative lookahead assertion 

157 \(\?<= | # If preceded by. 

158 \(\?< | # If not preceded by. 

159 \(?: | # Start of group. (non capturing.) 

160 \( | # Start of group. 

161 \(?[iLmsux] | # Flags. 

162 \(?P=[a-zA-Z]+\) | # Back reference to named group 

163 \) | # End of group. 

164 \{[^{}]*\} | # Repetition 

165 \*\? | \+\? | \?\?\ | # Non greedy repetition. 

166 \* | \+ | \? | # Repetition 

167 \#.*\n | # Comment 

168 \\. | 

169 

170 # Character group. 

171 \[ 

172 ( [^\]\\] | \\.)* 

173 \] | 

174 

175 [^(){}] | 

176 . 

177 )""", 

178 re.VERBOSE, 

179 ) 

180 

181 tokens = [] 

182 

183 while input: 

184 m = p.match(input) 

185 if m: 

186 token, input = input[: m.end()], input[m.end() :] 

187 if not token.isspace(): 

188 tokens.append(token) 

189 else: 

190 raise Exception("Could not tokenize input regex.") 

191 

192 return tokens 

193 

194 

195def parse_regex(regex_tokens: list[str]) -> Node: 

196 """ 

197 Takes a list of tokens from the tokenizer, and returns a parse tree. 

198 """ 

199 # We add a closing brace because that represents the final pop of the stack. 

200 tokens: list[str] = [")"] + regex_tokens[::-1] 

201 

202 def wrap(lst: list[Node]) -> Node: 

203 """Turn list into sequence when it contains several items.""" 

204 if len(lst) == 1: 

205 return lst[0] 

206 else: 

207 return NodeSequence(lst) 

208 

209 def _parse() -> Node: 

210 or_list: list[list[Node]] = [] 

211 result: list[Node] = [] 

212 

213 def wrapped_result() -> Node: 

214 if or_list == []: 

215 return wrap(result) 

216 else: 

217 or_list.append(result) 

218 return AnyNode([wrap(i) for i in or_list]) 

219 

220 while tokens: 

221 t = tokens.pop() 

222 

223 if t.startswith("(?P<"): 

224 variable = Variable(_parse(), varname=t[4:-1]) 

225 result.append(variable) 

226 

227 elif t in ("*", "*?"): 

228 greedy = t == "*" 

229 result[-1] = Repeat(result[-1], greedy=greedy) 

230 

231 elif t in ("+", "+?"): 

232 greedy = t == "+" 

233 result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy) 

234 

235 elif t in ("?", "??"): 

236 if result == []: 

237 raise Exception("Nothing to repeat." + repr(tokens)) 

238 else: 

239 greedy = t == "?" 

240 result[-1] = Repeat( 

241 result[-1], min_repeat=0, max_repeat=1, greedy=greedy 

242 ) 

243 

244 elif t == "|": 

245 or_list.append(result) 

246 result = [] 

247 

248 elif t in ("(", "(?:"): 

249 result.append(_parse()) 

250 

251 elif t == "(?!": 

252 result.append(Lookahead(_parse(), negative=True)) 

253 

254 elif t == "(?=": 

255 result.append(Lookahead(_parse(), negative=False)) 

256 

257 elif t == ")": 

258 return wrapped_result() 

259 

260 elif t.startswith("#"): 

261 pass 

262 

263 elif t.startswith("{"): 

264 # TODO: implement! 

265 raise Exception(f"{t}-style repetition not yet supported") 

266 

267 elif t.startswith("(?"): 

268 raise Exception("%r not supported" % t) 

269 

270 elif t.isspace(): 

271 pass 

272 else: 

273 result.append(Regex(t)) 

274 

275 raise Exception("Expecting ')' token") 

276 

277 result = _parse() 

278 

279 if len(tokens) != 0: 

280 raise Exception("Unmatched parentheses.") 

281 else: 

282 return result