Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/prompt_toolkit/contrib/regular_languages/compiler.py: 67%

205 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-11 06:55 +0000

1r""" 

2Compiler for a regular grammar. 

3 

4Example usage:: 

5 

6 # Create and compile grammar. 

7 p = compile('add \s+ (?P<var1>[^\s]+) \s+ (?P<var2>[^\s]+)') 

8 

9 # Match input string. 

10 m = p.match('add 23 432') 

11 

12 # Get variables. 

13 m.variables().get('var1') # Returns "23" 

14 m.variables().get('var2') # Returns "432" 

15 

16 

17Partial matches are possible:: 

18 

19 # Create and compile grammar. 

20 p = compile(''' 

21 # Operators with two arguments. 

22 ((?P<operator1>[^\s]+) \s+ (?P<var1>[^\s]+) \s+ (?P<var2>[^\s]+)) | 

23 

24 # Operators with only one arguments. 

25 ((?P<operator2>[^\s]+) \s+ (?P<var1>[^\s]+)) 

26 ''') 

27 

28 # Match partial input string. 

29 m = p.match_prefix('add 23') 

30 

31 # Get variables. (Notice that both operator1 and operator2 contain the 

32 # value "add".) This is because our input is incomplete, and we don't know 

33 # yet in which rule of the regex we we'll end up. It could also be that 

34 # `operator1` and `operator2` have a different autocompleter and we want to 

35 # call all possible autocompleters that would result in valid input.) 

36 m.variables().get('var1') # Returns "23" 

37 m.variables().get('operator1') # Returns "add" 

38 m.variables().get('operator2') # Returns "add" 

39 

40""" 

41from __future__ import annotations 

42 

43import re 

44from typing import Callable, Dict, Iterable, Iterator, Pattern 

45from typing import Match as RegexMatch 

46 

47from .regex_parser import ( 

48 AnyNode, 

49 Lookahead, 

50 Node, 

51 NodeSequence, 

52 Regex, 

53 Repeat, 

54 Variable, 

55 parse_regex, 

56 tokenize_regex, 

57) 

58 

59__all__ = [ 

60 "compile", 

61] 

62 

63 

64# Name of the named group in the regex, matching trailing input. 

65# (Trailing input is when the input contains characters after the end of the 

66# expression has been matched.) 

67_INVALID_TRAILING_INPUT = "invalid_trailing" 

68 

69EscapeFuncDict = Dict[str, Callable[[str], str]] 

70 

71 

72class _CompiledGrammar: 

73 """ 

74 Compiles a grammar. This will take the parse tree of a regular expression 

75 and compile the grammar. 

76 

77 :param root_node: :class~`.regex_parser.Node` instance. 

78 :param escape_funcs: `dict` mapping variable names to escape callables. 

79 :param unescape_funcs: `dict` mapping variable names to unescape callables. 

80 """ 

81 

82 def __init__( 

83 self, 

84 root_node: Node, 

85 escape_funcs: EscapeFuncDict | None = None, 

86 unescape_funcs: EscapeFuncDict | None = None, 

87 ) -> None: 

88 self.root_node = root_node 

89 self.escape_funcs = escape_funcs or {} 

90 self.unescape_funcs = unescape_funcs or {} 

91 

92 #: Dictionary that will map the regex names to Node instances. 

93 self._group_names_to_nodes: dict[ 

94 str, str 

95 ] = {} # Maps regex group names to varnames. 

96 counter = [0] 

97 

98 def create_group_func(node: Variable) -> str: 

99 name = "n%s" % counter[0] 

100 self._group_names_to_nodes[name] = node.varname 

101 counter[0] += 1 

102 return name 

103 

104 # Compile regex strings. 

105 self._re_pattern = "^%s$" % self._transform(root_node, create_group_func) 

106 self._re_prefix_patterns = list( 

107 self._transform_prefix(root_node, create_group_func) 

108 ) 

109 

110 # Compile the regex itself. 

111 flags = re.DOTALL # Note that we don't need re.MULTILINE! (^ and $ 

112 # still represent the start and end of input text.) 

113 self._re = re.compile(self._re_pattern, flags) 

114 self._re_prefix = [re.compile(t, flags) for t in self._re_prefix_patterns] 

115 

116 # We compile one more set of regexes, similar to `_re_prefix`, but accept any trailing 

117 # input. This will ensure that we can still highlight the input correctly, even when the 

118 # input contains some additional characters at the end that don't match the grammar.) 

119 self._re_prefix_with_trailing_input = [ 

120 re.compile( 

121 r"(?:{})(?P<{}>.*?)$".format(t.rstrip("$"), _INVALID_TRAILING_INPUT), 

122 flags, 

123 ) 

124 for t in self._re_prefix_patterns 

125 ] 

126 

127 def escape(self, varname: str, value: str) -> str: 

128 """ 

129 Escape `value` to fit in the place of this variable into the grammar. 

130 """ 

131 f = self.escape_funcs.get(varname) 

132 return f(value) if f else value 

133 

134 def unescape(self, varname: str, value: str) -> str: 

135 """ 

136 Unescape `value`. 

137 """ 

138 f = self.unescape_funcs.get(varname) 

139 return f(value) if f else value 

140 

141 @classmethod 

142 def _transform( 

143 cls, root_node: Node, create_group_func: Callable[[Variable], str] 

144 ) -> str: 

145 """ 

146 Turn a :class:`Node` object into a regular expression. 

147 

148 :param root_node: The :class:`Node` instance for which we generate the grammar. 

149 :param create_group_func: A callable which takes a `Node` and returns the next 

150 free name for this node. 

151 """ 

152 

153 def transform(node: Node) -> str: 

154 # Turn `AnyNode` into an OR. 

155 if isinstance(node, AnyNode): 

156 return "(?:%s)" % "|".join(transform(c) for c in node.children) 

157 

158 # Concatenate a `NodeSequence` 

159 elif isinstance(node, NodeSequence): 

160 return "".join(transform(c) for c in node.children) 

161 

162 # For Regex and Lookahead nodes, just insert them literally. 

163 elif isinstance(node, Regex): 

164 return node.regex 

165 

166 elif isinstance(node, Lookahead): 

167 before = "(?!" if node.negative else "(=" 

168 return before + transform(node.childnode) + ")" 

169 

170 # A `Variable` wraps the children into a named group. 

171 elif isinstance(node, Variable): 

172 return f"(?P<{create_group_func(node)}>{transform(node.childnode)})" 

173 

174 # `Repeat`. 

175 elif isinstance(node, Repeat): 

176 if node.max_repeat is None: 

177 if node.min_repeat == 0: 

178 repeat_sign = "*" 

179 elif node.min_repeat == 1: 

180 repeat_sign = "+" 

181 else: 

182 repeat_sign = "{%i,%s}" % ( 

183 node.min_repeat, 

184 ("" if node.max_repeat is None else str(node.max_repeat)), 

185 ) 

186 

187 return "(?:{}){}{}".format( 

188 transform(node.childnode), 

189 repeat_sign, 

190 ("" if node.greedy else "?"), 

191 ) 

192 else: 

193 raise TypeError(f"Got {node!r}") 

194 

195 return transform(root_node) 

196 

197 @classmethod 

198 def _transform_prefix( 

199 cls, root_node: Node, create_group_func: Callable[[Variable], str] 

200 ) -> Iterable[str]: 

201 """ 

202 Yield all the regular expressions matching a prefix of the grammar 

203 defined by the `Node` instance. 

204 

205 For each `Variable`, one regex pattern will be generated, with this 

206 named group at the end. This is required because a regex engine will 

207 terminate once a match is found. For autocompletion however, we need 

208 the matches for all possible paths, so that we can provide completions 

209 for each `Variable`. 

210 

211 - So, in the case of an `Any` (`A|B|C)', we generate a pattern for each 

212 clause. This is one for `A`, one for `B` and one for `C`. Unless some 

213 groups don't contain a `Variable`, then these can be merged together. 

214 - In the case of a `NodeSequence` (`ABC`), we generate a pattern for 

215 each prefix that ends with a variable, and one pattern for the whole 

216 sequence. So, that's one for `A`, one for `AB` and one for `ABC`. 

217 

218 :param root_node: The :class:`Node` instance for which we generate the grammar. 

219 :param create_group_func: A callable which takes a `Node` and returns the next 

220 free name for this node. 

221 """ 

222 

223 def contains_variable(node: Node) -> bool: 

224 if isinstance(node, Regex): 

225 return False 

226 elif isinstance(node, Variable): 

227 return True 

228 elif isinstance(node, (Lookahead, Repeat)): 

229 return contains_variable(node.childnode) 

230 elif isinstance(node, (NodeSequence, AnyNode)): 

231 return any(contains_variable(child) for child in node.children) 

232 

233 return False 

234 

235 def transform(node: Node) -> Iterable[str]: 

236 # Generate separate pattern for all terms that contain variables 

237 # within this OR. Terms that don't contain a variable can be merged 

238 # together in one pattern. 

239 if isinstance(node, AnyNode): 

240 # If we have a definition like: 

241 # (?P<name> .*) | (?P<city> .*) 

242 # Then we want to be able to generate completions for both the 

243 # name as well as the city. We do this by yielding two 

244 # different regular expressions, because the engine won't 

245 # follow multiple paths, if multiple are possible. 

246 children_with_variable = [] 

247 children_without_variable = [] 

248 for c in node.children: 

249 if contains_variable(c): 

250 children_with_variable.append(c) 

251 else: 

252 children_without_variable.append(c) 

253 

254 for c in children_with_variable: 

255 yield from transform(c) 

256 

257 # Merge options without variable together. 

258 if children_without_variable: 

259 yield "|".join( 

260 r for c in children_without_variable for r in transform(c) 

261 ) 

262 

263 # For a sequence, generate a pattern for each prefix that ends with 

264 # a variable + one pattern of the complete sequence. 

265 # (This is because, for autocompletion, we match the text before 

266 # the cursor, and completions are given for the variable that we 

267 # match right before the cursor.) 

268 elif isinstance(node, NodeSequence): 

269 # For all components in the sequence, compute prefix patterns, 

270 # as well as full patterns. 

271 complete = [cls._transform(c, create_group_func) for c in node.children] 

272 prefixes = [list(transform(c)) for c in node.children] 

273 variable_nodes = [contains_variable(c) for c in node.children] 

274 

275 # If any child is contains a variable, we should yield a 

276 # pattern up to that point, so that we are sure this will be 

277 # matched. 

278 for i in range(len(node.children)): 

279 if variable_nodes[i]: 

280 for c_str in prefixes[i]: 

281 yield "".join(complete[:i]) + c_str 

282 

283 # If there are non-variable nodes, merge all the prefixes into 

284 # one pattern. If the input is: "[part1] [part2] [part3]", then 

285 # this gets compiled into: 

286 # (complete1 + (complete2 + (complete3 | partial3) | partial2) | partial1 ) 

287 # For nodes that contain a variable, we skip the "|partial" 

288 # part here, because thees are matched with the previous 

289 # patterns. 

290 if not all(variable_nodes): 

291 result = [] 

292 

293 # Start with complete patterns. 

294 for i in range(len(node.children)): 

295 result.append("(?:") 

296 result.append(complete[i]) 

297 

298 # Add prefix patterns. 

299 for i in range(len(node.children) - 1, -1, -1): 

300 if variable_nodes[i]: 

301 # No need to yield a prefix for this one, we did 

302 # the variable prefixes earlier. 

303 result.append(")") 

304 else: 

305 result.append("|(?:") 

306 # If this yields multiple, we should yield all combinations. 

307 assert len(prefixes[i]) == 1 

308 result.append(prefixes[i][0]) 

309 result.append("))") 

310 

311 yield "".join(result) 

312 

313 elif isinstance(node, Regex): 

314 yield "(?:%s)?" % node.regex 

315 

316 elif isinstance(node, Lookahead): 

317 if node.negative: 

318 yield "(?!%s)" % cls._transform(node.childnode, create_group_func) 

319 else: 

320 # Not sure what the correct semantics are in this case. 

321 # (Probably it's not worth implementing this.) 

322 raise Exception("Positive lookahead not yet supported.") 

323 

324 elif isinstance(node, Variable): 

325 # (Note that we should not append a '?' here. the 'transform' 

326 # method will already recursively do that.) 

327 for c_str in transform(node.childnode): 

328 yield f"(?P<{create_group_func(node)}>{c_str})" 

329 

330 elif isinstance(node, Repeat): 

331 # If we have a repetition of 8 times. That would mean that the 

332 # current input could have for instance 7 times a complete 

333 # match, followed by a partial match. 

334 prefix = cls._transform(node.childnode, create_group_func) 

335 

336 if node.max_repeat == 1: 

337 yield from transform(node.childnode) 

338 else: 

339 for c_str in transform(node.childnode): 

340 if node.max_repeat: 

341 repeat_sign = "{,%i}" % (node.max_repeat - 1) 

342 else: 

343 repeat_sign = "*" 

344 yield "(?:{}){}{}{}".format( 

345 prefix, 

346 repeat_sign, 

347 ("" if node.greedy else "?"), 

348 c_str, 

349 ) 

350 

351 else: 

352 raise TypeError("Got %r" % node) 

353 

354 for r in transform(root_node): 

355 yield "^(?:%s)$" % r 

356 

357 def match(self, string: str) -> Match | None: 

358 """ 

359 Match the string with the grammar. 

360 Returns a :class:`Match` instance or `None` when the input doesn't match the grammar. 

361 

362 :param string: The input string. 

363 """ 

364 m = self._re.match(string) 

365 

366 if m: 

367 return Match( 

368 string, [(self._re, m)], self._group_names_to_nodes, self.unescape_funcs 

369 ) 

370 return None 

371 

372 def match_prefix(self, string: str) -> Match | None: 

373 """ 

374 Do a partial match of the string with the grammar. The returned 

375 :class:`Match` instance can contain multiple representations of the 

376 match. This will never return `None`. If it doesn't match at all, the "trailing input" 

377 part will capture all of the input. 

378 

379 :param string: The input string. 

380 """ 

381 # First try to match using `_re_prefix`. If nothing is found, use the patterns that 

382 # also accept trailing characters. 

383 for patterns in [self._re_prefix, self._re_prefix_with_trailing_input]: 

384 matches = [(r, r.match(string)) for r in patterns] 

385 matches2 = [(r, m) for r, m in matches if m] 

386 

387 if matches2 != []: 

388 return Match( 

389 string, matches2, self._group_names_to_nodes, self.unescape_funcs 

390 ) 

391 

392 return None 

393 

394 

395class Match: 

396 """ 

397 :param string: The input string. 

398 :param re_matches: List of (compiled_re_pattern, re_match) tuples. 

399 :param group_names_to_nodes: Dictionary mapping all the re group names to the matching Node instances. 

400 """ 

401 

402 def __init__( 

403 self, 

404 string: str, 

405 re_matches: list[tuple[Pattern[str], RegexMatch[str]]], 

406 group_names_to_nodes: dict[str, str], 

407 unescape_funcs: dict[str, Callable[[str], str]], 

408 ): 

409 self.string = string 

410 self._re_matches = re_matches 

411 self._group_names_to_nodes = group_names_to_nodes 

412 self._unescape_funcs = unescape_funcs 

413 

414 def _nodes_to_regs(self) -> list[tuple[str, tuple[int, int]]]: 

415 """ 

416 Return a list of (varname, reg) tuples. 

417 """ 

418 

419 def get_tuples() -> Iterable[tuple[str, tuple[int, int]]]: 

420 for r, re_match in self._re_matches: 

421 for group_name, group_index in r.groupindex.items(): 

422 if group_name != _INVALID_TRAILING_INPUT: 

423 regs = re_match.regs 

424 reg = regs[group_index] 

425 node = self._group_names_to_nodes[group_name] 

426 yield (node, reg) 

427 

428 return list(get_tuples()) 

429 

430 def _nodes_to_values(self) -> list[tuple[str, str, tuple[int, int]]]: 

431 """ 

432 Returns list of (Node, string_value) tuples. 

433 """ 

434 

435 def is_none(sl: tuple[int, int]) -> bool: 

436 return sl[0] == -1 and sl[1] == -1 

437 

438 def get(sl: tuple[int, int]) -> str: 

439 return self.string[sl[0] : sl[1]] 

440 

441 return [ 

442 (varname, get(slice), slice) 

443 for varname, slice in self._nodes_to_regs() 

444 if not is_none(slice) 

445 ] 

446 

447 def _unescape(self, varname: str, value: str) -> str: 

448 unwrapper = self._unescape_funcs.get(varname) 

449 return unwrapper(value) if unwrapper else value 

450 

451 def variables(self) -> Variables: 

452 """ 

453 Returns :class:`Variables` instance. 

454 """ 

455 return Variables( 

456 [(k, self._unescape(k, v), sl) for k, v, sl in self._nodes_to_values()] 

457 ) 

458 

459 def trailing_input(self) -> MatchVariable | None: 

460 """ 

461 Get the `MatchVariable` instance, representing trailing input, if there is any. 

462 "Trailing input" is input at the end that does not match the grammar anymore, but 

463 when this is removed from the end of the input, the input would be a valid string. 

464 """ 

465 slices: list[tuple[int, int]] = [] 

466 

467 # Find all regex group for the name _INVALID_TRAILING_INPUT. 

468 for r, re_match in self._re_matches: 

469 for group_name, group_index in r.groupindex.items(): 

470 if group_name == _INVALID_TRAILING_INPUT: 

471 slices.append(re_match.regs[group_index]) 

472 

473 # Take the smallest part. (Smaller trailing text means that a larger input has 

474 # been matched, so that is better.) 

475 if slices: 

476 slice = (max(i[0] for i in slices), max(i[1] for i in slices)) 

477 value = self.string[slice[0] : slice[1]] 

478 return MatchVariable("<trailing_input>", value, slice) 

479 return None 

480 

481 def end_nodes(self) -> Iterable[MatchVariable]: 

482 """ 

483 Yields `MatchVariable` instances for all the nodes having their end 

484 position at the end of the input string. 

485 """ 

486 for varname, reg in self._nodes_to_regs(): 

487 # If this part goes until the end of the input string. 

488 if reg[1] == len(self.string): 

489 value = self._unescape(varname, self.string[reg[0] : reg[1]]) 

490 yield MatchVariable(varname, value, (reg[0], reg[1])) 

491 

492 

493class Variables: 

494 def __init__(self, tuples: list[tuple[str, str, tuple[int, int]]]) -> None: 

495 #: List of (varname, value, slice) tuples. 

496 self._tuples = tuples 

497 

498 def __repr__(self) -> str: 

499 return "{}({})".format( 

500 self.__class__.__name__, 

501 ", ".join(f"{k}={v!r}" for k, v, _ in self._tuples), 

502 ) 

503 

504 def get(self, key: str, default: str | None = None) -> str | None: 

505 items = self.getall(key) 

506 return items[0] if items else default 

507 

508 def getall(self, key: str) -> list[str]: 

509 return [v for k, v, _ in self._tuples if k == key] 

510 

511 def __getitem__(self, key: str) -> str | None: 

512 return self.get(key) 

513 

514 def __iter__(self) -> Iterator[MatchVariable]: 

515 """ 

516 Yield `MatchVariable` instances. 

517 """ 

518 for varname, value, slice in self._tuples: 

519 yield MatchVariable(varname, value, slice) 

520 

521 

522class MatchVariable: 

523 """ 

524 Represents a match of a variable in the grammar. 

525 

526 :param varname: (string) Name of the variable. 

527 :param value: (string) Value of this variable. 

528 :param slice: (start, stop) tuple, indicating the position of this variable 

529 in the input string. 

530 """ 

531 

532 def __init__(self, varname: str, value: str, slice: tuple[int, int]) -> None: 

533 self.varname = varname 

534 self.value = value 

535 self.slice = slice 

536 

537 self.start = self.slice[0] 

538 self.stop = self.slice[1] 

539 

540 def __repr__(self) -> str: 

541 return f"{self.__class__.__name__}({self.varname!r}, {self.value!r})" 

542 

543 

544def compile( 

545 expression: str, 

546 escape_funcs: EscapeFuncDict | None = None, 

547 unescape_funcs: EscapeFuncDict | None = None, 

548) -> _CompiledGrammar: 

549 """ 

550 Compile grammar (given as regex string), returning a `CompiledGrammar` 

551 instance. 

552 """ 

553 return _compile_from_parse_tree( 

554 parse_regex(tokenize_regex(expression)), 

555 escape_funcs=escape_funcs, 

556 unescape_funcs=unescape_funcs, 

557 ) 

558 

559 

560def _compile_from_parse_tree( 

561 root_node: Node, 

562 escape_funcs: EscapeFuncDict | None = None, 

563 unescape_funcs: EscapeFuncDict | None = None, 

564) -> _CompiledGrammar: 

565 """ 

566 Compile grammar (given as parse tree), returning a `CompiledGrammar` 

567 instance. 

568 """ 

569 return _CompiledGrammar( 

570 root_node, escape_funcs=escape_funcs, unescape_funcs=unescape_funcs 

571 )