Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/prompt_toolkit/contrib/regular

1r"""

2Compiler for a regular grammar.

4Example usage::

6 # Create and compile grammar.

7 p = compile('add \s+ (?P<var1>[^\s]+) \s+ (?P<var2>[^\s]+)')

9 # Match input string.

10 m = p.match('add 23 432')

12 # Get variables.

13 m.variables().get('var1') # Returns "23"

14 m.variables().get('var2') # Returns "432"

17Partial matches are possible::

19 # Create and compile grammar.

20 p = compile('''

21 # Operators with two arguments.

22 ((?P<operator1>[^\s]+) \s+ (?P<var1>[^\s]+) \s+ (?P<var2>[^\s]+)) |

24 # Operators with only one arguments.

25 ((?P<operator2>[^\s]+) \s+ (?P<var1>[^\s]+))

26 ''')

28 # Match partial input string.

29 m = p.match_prefix('add 23')

31 # Get variables. (Notice that both operator1 and operator2 contain the

32 # value "add".) This is because our input is incomplete, and we don't know

33 # yet in which rule of the regex we we'll end up. It could also be that

34 # `operator1` and `operator2` have a different autocompleter and we want to

35 # call all possible autocompleters that would result in valid input.)

36 m.variables().get('var1') # Returns "23"

37 m.variables().get('operator1') # Returns "add"

38 m.variables().get('operator2') # Returns "add"

40"""

41from __future__ import annotations

43import re

44from typing import Callable, Dict, Iterable, Iterator, Pattern

45from typing import Match as RegexMatch

47from .regex_parser import (

48 AnyNode,

49 Lookahead,

50 Node,

51 NodeSequence,

52 Regex,

53 Repeat,

54 Variable,

55 parse_regex,

56 tokenize_regex,

57)

59__all__ = [

60 "compile",

61]

64# Name of the named group in the regex, matching trailing input.

65# (Trailing input is when the input contains characters after the end of the

66# expression has been matched.)

67_INVALID_TRAILING_INPUT = "invalid_trailing"

69EscapeFuncDict = Dict[str, Callable[[str], str]]

72class _CompiledGrammar:

73 """

74 Compiles a grammar. This will take the parse tree of a regular expression

75 and compile the grammar.

77 :param root_node: :class~`.regex_parser.Node` instance.

78 :param escape_funcs: `dict` mapping variable names to escape callables.

79 :param unescape_funcs: `dict` mapping variable names to unescape callables.

80 """

82 def __init__(

83 self,

84 root_node: Node,

85 escape_funcs: EscapeFuncDict | None = None,

86 unescape_funcs: EscapeFuncDict | None = None,

87 ) -> None:

88 self.root_node = root_node

89 self.escape_funcs = escape_funcs or {}

90 self.unescape_funcs = unescape_funcs or {}

92 #: Dictionary that will map the regex names to Node instances.

93 self._group_names_to_nodes: dict[

94 str, str

95 ] = {} # Maps regex group names to varnames.

96 counter = [0]

98 def create_group_func(node: Variable) -> str:

99 name = "n%s" % counter[0]

100 self._group_names_to_nodes[name] = node.varname

101 counter[0] += 1

102 return name

103

104 # Compile regex strings.

105 self._re_pattern = "^%s$" % self._transform(root_node, create_group_func)

106 self._re_prefix_patterns = list(

107 self._transform_prefix(root_node, create_group_func)

108 )

109

110 # Compile the regex itself.

111 flags = re.DOTALL # Note that we don't need re.MULTILINE! (^ and $

112 # still represent the start and end of input text.)

113 self._re = re.compile(self._re_pattern, flags)

114 self._re_prefix = [re.compile(t, flags) for t in self._re_prefix_patterns]

115

116 # We compile one more set of regexes, similar to `_re_prefix`, but accept any trailing

117 # input. This will ensure that we can still highlight the input correctly, even when the

118 # input contains some additional characters at the end that don't match the grammar.)

119 self._re_prefix_with_trailing_input = [

120 re.compile(

121 r"(?:{})(?P<{}>.*?)$".format(t.rstrip("$"), _INVALID_TRAILING_INPUT),

122 flags,

123 )

124 for t in self._re_prefix_patterns

125 ]

126

127 def escape(self, varname: str, value: str) -> str:

128 """

129 Escape `value` to fit in the place of this variable into the grammar.

130 """

131 f = self.escape_funcs.get(varname)

132 return f(value) if f else value

133

134 def unescape(self, varname: str, value: str) -> str:

135 """

136 Unescape `value`.

137 """

138 f = self.unescape_funcs.get(varname)

139 return f(value) if f else value

140

141 @classmethod

142 def _transform(

143 cls, root_node: Node, create_group_func: Callable[[Variable], str]

144 ) -> str:

145 """

146 Turn a :class:`Node` object into a regular expression.

147

148 :param root_node: The :class:`Node` instance for which we generate the grammar.

149 :param create_group_func: A callable which takes a `Node` and returns the next

150 free name for this node.

151 """

152

153 def transform(node: Node) -> str:

154 # Turn `AnyNode` into an OR.

155 if isinstance(node, AnyNode):

156 return "(?:%s)" % "|".join(transform(c) for c in node.children)

157

158 # Concatenate a `NodeSequence`

159 elif isinstance(node, NodeSequence):

160 return "".join(transform(c) for c in node.children)

161

162 # For Regex and Lookahead nodes, just insert them literally.

163 elif isinstance(node, Regex):

164 return node.regex

165

166 elif isinstance(node, Lookahead):

167 before = "(?!" if node.negative else "(="

168 return before + transform(node.childnode) + ")"

169

170 # A `Variable` wraps the children into a named group.

171 elif isinstance(node, Variable):

172 return f"(?P<{create_group_func(node)}>{transform(node.childnode)})"

173

174 # `Repeat`.

175 elif isinstance(node, Repeat):

176 if node.max_repeat is None:

177 if node.min_repeat == 0:

178 repeat_sign = "*"

179 elif node.min_repeat == 1:

180 repeat_sign = "+"

181 else:

182 repeat_sign = "{%i,%s}" % (

183 node.min_repeat,

184 ("" if node.max_repeat is None else str(node.max_repeat)),

185 )

186

187 return "(?:{}){}{}".format(

188 transform(node.childnode),

189 repeat_sign,

190 ("" if node.greedy else "?"),

191 )

192 else:

193 raise TypeError(f"Got {node!r}")

194

195 return transform(root_node)

196

197 @classmethod

198 def _transform_prefix(

199 cls, root_node: Node, create_group_func: Callable[[Variable], str]

200 ) -> Iterable[str]:

201 """

202 Yield all the regular expressions matching a prefix of the grammar

203 defined by the `Node` instance.

204

205 For each `Variable`, one regex pattern will be generated, with this

206 named group at the end. This is required because a regex engine will

207 terminate once a match is found. For autocompletion however, we need

208 the matches for all possible paths, so that we can provide completions

209 for each `Variable`.

210

211 - So, in the case of an `Any` (`A|B|C)', we generate a pattern for each

212 clause. This is one for `A`, one for `B` and one for `C`. Unless some

213 groups don't contain a `Variable`, then these can be merged together.

214 - In the case of a `NodeSequence` (`ABC`), we generate a pattern for

215 each prefix that ends with a variable, and one pattern for the whole

216 sequence. So, that's one for `A`, one for `AB` and one for `ABC`.

217

218 :param root_node: The :class:`Node` instance for which we generate the grammar.

219 :param create_group_func: A callable which takes a `Node` and returns the next

220 free name for this node.

221 """

222

223 def contains_variable(node: Node) -> bool:

224 if isinstance(node, Regex):

225 return False

226 elif isinstance(node, Variable):

227 return True

228 elif isinstance(node, (Lookahead, Repeat)):

229 return contains_variable(node.childnode)

230 elif isinstance(node, (NodeSequence, AnyNode)):

231 return any(contains_variable(child) for child in node.children)

232

233 return False

234

235 def transform(node: Node) -> Iterable[str]:

236 # Generate separate pattern for all terms that contain variables

237 # within this OR. Terms that don't contain a variable can be merged

238 # together in one pattern.

239 if isinstance(node, AnyNode):

240 # If we have a definition like:

241 # (?P<name> .*) | (?P<city> .*)

242 # Then we want to be able to generate completions for both the

243 # name as well as the city. We do this by yielding two

244 # different regular expressions, because the engine won't

245 # follow multiple paths, if multiple are possible.

246 children_with_variable = []

247 children_without_variable = []

248 for c in node.children:

249 if contains_variable(c):

250 children_with_variable.append(c)

251 else:

252 children_without_variable.append(c)

253

254 for c in children_with_variable:

255 yield from transform(c)

256

257 # Merge options without variable together.

258 if children_without_variable:

259 yield "|".join(

260 r for c in children_without_variable for r in transform(c)

261 )

262

263 # For a sequence, generate a pattern for each prefix that ends with

264 # a variable + one pattern of the complete sequence.

265 # (This is because, for autocompletion, we match the text before

266 # the cursor, and completions are given for the variable that we

267 # match right before the cursor.)

268 elif isinstance(node, NodeSequence):

269 # For all components in the sequence, compute prefix patterns,

270 # as well as full patterns.

271 complete = [cls._transform(c, create_group_func) for c in node.children]

272 prefixes = [list(transform(c)) for c in node.children]

273 variable_nodes = [contains_variable(c) for c in node.children]

274

275 # If any child is contains a variable, we should yield a

276 # pattern up to that point, so that we are sure this will be

277 # matched.

278 for i in range(len(node.children)):

279 if variable_nodes[i]:

280 for c_str in prefixes[i]:

281 yield "".join(complete[:i]) + c_str

282

283 # If there are non-variable nodes, merge all the prefixes into

284 # one pattern. If the input is: "[part1] [part2] [part3]", then

285 # this gets compiled into:

286 # (complete1 + (complete2 + (complete3 | partial3) | partial2) | partial1 )

287 # For nodes that contain a variable, we skip the "|partial"

288 # part here, because thees are matched with the previous

289 # patterns.

290 if not all(variable_nodes):

291 result = []

292

293 # Start with complete patterns.

294 for i in range(len(node.children)):

295 result.append("(?:")

296 result.append(complete[i])

297

298 # Add prefix patterns.

299 for i in range(len(node.children) - 1, -1, -1):

300 if variable_nodes[i]:

301 # No need to yield a prefix for this one, we did

302 # the variable prefixes earlier.

303 result.append(")")

304 else:

305 result.append("|(?:")

306 # If this yields multiple, we should yield all combinations.

307 assert len(prefixes[i]) == 1

308 result.append(prefixes[i][0])

309 result.append("))")

310

311 yield "".join(result)

312

313 elif isinstance(node, Regex):

314 yield "(?:%s)?" % node.regex

315

316 elif isinstance(node, Lookahead):

317 if node.negative:

318 yield "(?!%s)" % cls._transform(node.childnode, create_group_func)

319 else:

320 # Not sure what the correct semantics are in this case.

321 # (Probably it's not worth implementing this.)

322 raise Exception("Positive lookahead not yet supported.")

323

324 elif isinstance(node, Variable):

325 # (Note that we should not append a '?' here. the 'transform'

326 # method will already recursively do that.)

327 for c_str in transform(node.childnode):

328 yield f"(?P<{create_group_func(node)}>{c_str})"

329

330 elif isinstance(node, Repeat):

331 # If we have a repetition of 8 times. That would mean that the

332 # current input could have for instance 7 times a complete

333 # match, followed by a partial match.

334 prefix = cls._transform(node.childnode, create_group_func)

335

336 if node.max_repeat == 1:

337 yield from transform(node.childnode)

338 else:

339 for c_str in transform(node.childnode):

340 if node.max_repeat:

341 repeat_sign = "{,%i}" % (node.max_repeat - 1)

342 else:

343 repeat_sign = "*"

344 yield "(?:{}){}{}{}".format(

345 prefix,

346 repeat_sign,

347 ("" if node.greedy else "?"),

348 c_str,

349 )

350

351 else:

352 raise TypeError("Got %r" % node)

353

354 for r in transform(root_node):

355 yield "^(?:%s)$" % r

356

357 def match(self, string: str) -> Match | None:

358 """

359 Match the string with the grammar.

360 Returns a :class:`Match` instance or `None` when the input doesn't match the grammar.

361

362 :param string: The input string.

363 """

364 m = self._re.match(string)

365

366 if m:

367 return Match(

368 string, [(self._re, m)], self._group_names_to_nodes, self.unescape_funcs

369 )

370 return None

371

372 def match_prefix(self, string: str) -> Match | None:

373 """

374 Do a partial match of the string with the grammar. The returned

375 :class:`Match` instance can contain multiple representations of the

376 match. This will never return `None`. If it doesn't match at all, the "trailing input"

377 part will capture all of the input.

378

379 :param string: The input string.

380 """

381 # First try to match using `_re_prefix`. If nothing is found, use the patterns that

382 # also accept trailing characters.

383 for patterns in [self._re_prefix, self._re_prefix_with_trailing_input]:

384 matches = [(r, r.match(string)) for r in patterns]

385 matches2 = [(r, m) for r, m in matches if m]

386

387 if matches2 != []:

388 return Match(

389 string, matches2, self._group_names_to_nodes, self.unescape_funcs

390 )

391

392 return None

393

394

395class Match:

396 """

397 :param string: The input string.

398 :param re_matches: List of (compiled_re_pattern, re_match) tuples.

399 :param group_names_to_nodes: Dictionary mapping all the re group names to the matching Node instances.

400 """

401

402 def __init__(

403 self,

404 string: str,

405 re_matches: list[tuple[Pattern[str], RegexMatch[str]]],

406 group_names_to_nodes: dict[str, str],

407 unescape_funcs: dict[str, Callable[[str], str]],

408 ):

409 self.string = string

410 self._re_matches = re_matches

411 self._group_names_to_nodes = group_names_to_nodes

412 self._unescape_funcs = unescape_funcs

413

414 def _nodes_to_regs(self) -> list[tuple[str, tuple[int, int]]]:

415 """

416 Return a list of (varname, reg) tuples.

417 """

418

419 def get_tuples() -> Iterable[tuple[str, tuple[int, int]]]:

420 for r, re_match in self._re_matches:

421 for group_name, group_index in r.groupindex.items():

422 if group_name != _INVALID_TRAILING_INPUT:

423 regs = re_match.regs

424 reg = regs[group_index]

425 node = self._group_names_to_nodes[group_name]

426 yield (node, reg)

427

428 return list(get_tuples())

429

430 def _nodes_to_values(self) -> list[tuple[str, str, tuple[int, int]]]:

431 """

432 Returns list of (Node, string_value) tuples.

433 """

434

435 def is_none(sl: tuple[int, int]) -> bool:

436 return sl[0] == -1 and sl[1] == -1

437

438 def get(sl: tuple[int, int]) -> str:

439 return self.string[sl[0] : sl[1]]

440

441 return [

442 (varname, get(slice), slice)

443 for varname, slice in self._nodes_to_regs()

444 if not is_none(slice)

445 ]

446

447 def _unescape(self, varname: str, value: str) -> str:

448 unwrapper = self._unescape_funcs.get(varname)

449 return unwrapper(value) if unwrapper else value

450

451 def variables(self) -> Variables:

452 """

453 Returns :class:`Variables` instance.

454 """

455 return Variables(

456 [(k, self._unescape(k, v), sl) for k, v, sl in self._nodes_to_values()]

457 )

458

459 def trailing_input(self) -> MatchVariable | None:

460 """

461 Get the `MatchVariable` instance, representing trailing input, if there is any.

462 "Trailing input" is input at the end that does not match the grammar anymore, but

463 when this is removed from the end of the input, the input would be a valid string.

464 """

465 slices: list[tuple[int, int]] = []

466

467 # Find all regex group for the name _INVALID_TRAILING_INPUT.

468 for r, re_match in self._re_matches:

469 for group_name, group_index in r.groupindex.items():

470 if group_name == _INVALID_TRAILING_INPUT:

471 slices.append(re_match.regs[group_index])

472

473 # Take the smallest part. (Smaller trailing text means that a larger input has

474 # been matched, so that is better.)

475 if slices:

476 slice = (max(i[0] for i in slices), max(i[1] for i in slices))

477 value = self.string[slice[0] : slice[1]]

478 return MatchVariable("<trailing_input>", value, slice)

479 return None

480

481 def end_nodes(self) -> Iterable[MatchVariable]:

482 """

483 Yields `MatchVariable` instances for all the nodes having their end

484 position at the end of the input string.

485 """

486 for varname, reg in self._nodes_to_regs():

487 # If this part goes until the end of the input string.

488 if reg[1] == len(self.string):

489 value = self._unescape(varname, self.string[reg[0] : reg[1]])

490 yield MatchVariable(varname, value, (reg[0], reg[1]))

491

492

493class Variables:

494 def __init__(self, tuples: list[tuple[str, str, tuple[int, int]]]) -> None:

495 #: List of (varname, value, slice) tuples.

496 self._tuples = tuples

497

498 def __repr__(self) -> str:

499 return "{}({})".format(

500 self.__class__.__name__,

501 ", ".join(f"{k}={v!r}" for k, v, _ in self._tuples),

502 )

503

504 def get(self, key: str, default: str | None = None) -> str | None:

505 items = self.getall(key)

506 return items[0] if items else default

507

508 def getall(self, key: str) -> list[str]:

509 return [v for k, v, _ in self._tuples if k == key]

510

511 def __getitem__(self, key: str) -> str | None:

512 return self.get(key)

513

514 def __iter__(self) -> Iterator[MatchVariable]:

515 """

516 Yield `MatchVariable` instances.

517 """

518 for varname, value, slice in self._tuples:

519 yield MatchVariable(varname, value, slice)

520

521

522class MatchVariable:

523 """

524 Represents a match of a variable in the grammar.

525

526 :param varname: (string) Name of the variable.

527 :param value: (string) Value of this variable.

528 :param slice: (start, stop) tuple, indicating the position of this variable

529 in the input string.

530 """

531

532 def __init__(self, varname: str, value: str, slice: tuple[int, int]) -> None:

533 self.varname = varname

534 self.value = value

535 self.slice = slice

536

537 self.start = self.slice[0]

538 self.stop = self.slice[1]

539

540 def __repr__(self) -> str:

541 return f"{self.__class__.__name__}({self.varname!r}, {self.value!r})"

542

543

544def compile(

545 expression: str,

546 escape_funcs: EscapeFuncDict | None = None,

547 unescape_funcs: EscapeFuncDict | None = None,

548) -> _CompiledGrammar:

549 """

550 Compile grammar (given as regex string), returning a `CompiledGrammar`

551 instance.

552 """

553 return _compile_from_parse_tree(

554 parse_regex(tokenize_regex(expression)),

555 escape_funcs=escape_funcs,

556 unescape_funcs=unescape_funcs,

557 )

558

559

560def _compile_from_parse_tree(

561 root_node: Node,

562 escape_funcs: EscapeFuncDict | None = None,

563 unescape_funcs: EscapeFuncDict | None = None,

564) -> _CompiledGrammar:

565 """

566 Compile grammar (given as parse tree), returning a `CompiledGrammar`

567 instance.

568 """

569 return _CompiledGrammar(

570 root_node, escape_funcs=escape_funcs, unescape_funcs=unescape_funcs

571 )

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/prompt_toolkit/contrib/regular_languages/compiler.py: 67%

205 statements