Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/parsimonious/expressions.py: 70%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

211 statements  

1"""Subexpressions that make up a parsed grammar 

2 

3These do the parsing. 

4 

5""" 

6# TODO: Make sure all symbol refs are local--not class lookups or 

7# anything--for speed. And kill all the dots. 

8 

9from collections import defaultdict 

10from inspect import getfullargspec, isfunction, ismethod, ismethoddescriptor 

11try: 

12 import regex as re 

13except ImportError: 

14 import re # Fallback as per https://github.com/erikrose/parsimonious/issues/231 

15 

16from parsimonious.exceptions import ParseError, IncompleteParseError, LeftRecursionError 

17from parsimonious.nodes import Node, RegexNode 

18from parsimonious.utils import StrAndRepr 

19 

20 

21def is_callable(value): 

22 criteria = [isfunction, ismethod, ismethoddescriptor] 

23 return any([criterion(value) for criterion in criteria]) 

24 

25 

26def expression(callable, rule_name, grammar): 

27 """Turn a plain callable into an Expression. 

28 

29 The callable can be of this simple form:: 

30 

31 def foo(text, pos): 

32 '''If this custom expression matches starting at text[pos], return 

33 the index where it stops matching. Otherwise, return None.''' 

34 if the expression matched: 

35 return end_pos 

36 

37 If there child nodes to return, return a tuple:: 

38 

39 return end_pos, children 

40 

41 If the expression doesn't match at the given ``pos`` at all... :: 

42 

43 return None 

44 

45 If your callable needs to make sub-calls to other rules in the grammar or 

46 do error reporting, it can take this form, gaining additional arguments:: 

47 

48 def foo(text, pos, cache, error, grammar): 

49 # Call out to other rules: 

50 node = grammar['another_rule'].match_core(text, pos, cache, error) 

51 ... 

52 # Return values as above. 

53 

54 The return value of the callable, if an int or a tuple, will be 

55 automatically transmuted into a :class:`~parsimonious.Node`. If it returns 

56 a Node-like class directly, it will be passed through unchanged. 

57 

58 :arg rule_name: The rule name to attach to the resulting 

59 :class:`~parsimonious.Expression` 

60 :arg grammar: The :class:`~parsimonious.Grammar` this expression will be a 

61 part of, to make delegating to other rules possible 

62 

63 """ 

64 

65 # Resolve unbound methods; allows grammars to use @staticmethod custom rules 

66 # https://stackoverflow.com/questions/41921255/staticmethod-object-is-not-callable 

67 if ismethoddescriptor(callable) and hasattr(callable, '__func__'): 

68 callable = callable.__func__ 

69 

70 num_args = len(getfullargspec(callable).args) 

71 if ismethod(callable): 

72 # do not count the first argument (typically 'self') for methods 

73 num_args -= 1 

74 if num_args == 2: 

75 is_simple = True 

76 elif num_args == 5: 

77 is_simple = False 

78 else: 

79 raise RuntimeError("Custom rule functions must take either 2 or 5 " 

80 "arguments, not %s." % num_args) 

81 

82 class AdHocExpression(Expression): 

83 def _uncached_match(self, text, pos, cache, error): 

84 result = (callable(text, pos) if is_simple else 

85 callable(text, pos, cache, error, grammar)) 

86 

87 if isinstance(result, int): 

88 end, children = result, None 

89 elif isinstance(result, tuple): 

90 end, children = result 

91 else: 

92 # Node or None 

93 return result 

94 return Node(self, text, pos, end, children=children) 

95 

96 def _as_rhs(self): 

97 return '{custom function "%s"}' % callable.__name__ 

98 

99 return AdHocExpression(name=rule_name) 

100 

101 

102IN_PROGRESS = object() 

103 

104 

105class Expression(StrAndRepr): 

106 """A thing that can be matched against a piece of text""" 

107 

108 # Slots are about twice as fast as __dict__-based attributes: 

109 # http://stackoverflow.com/questions/1336791/dictionary-vs-object-which-is-more-efficient-and-why 

110 

111 # Top-level expressions--rules--have names. Subexpressions are named ''. 

112 __slots__ = ['name', 'identity_tuple'] 

113 

114 def __init__(self, name=''): 

115 self.name = name 

116 self.identity_tuple = (self.name, ) 

117 

118 def __hash__(self): 

119 return hash(self.identity_tuple) 

120 

121 def __eq__(self, other): 

122 return self._eq_check_cycles(other, set()) 

123 

124 def __ne__(self, other): 

125 return not (self == other) 

126 

127 def _eq_check_cycles(self, other, checked): 

128 # keep a set of all pairs that are already checked, so we won't fall into infinite recursions. 

129 checked.add((id(self), id(other))) 

130 return other.__class__ is self.__class__ and self.identity_tuple == other.identity_tuple 

131 

132 def resolve_refs(self, rule_map): 

133 # Nothing to do on the base expression. 

134 return self 

135 

136 def parse(self, text, pos=0): 

137 """Return a parse tree of ``text``. 

138 

139 Raise ``ParseError`` if the expression wasn't satisfied. Raise 

140 ``IncompleteParseError`` if the expression was satisfied but didn't 

141 consume the full string. 

142 

143 """ 

144 node = self.match(text, pos=pos) 

145 if node.end < len(text): 

146 raise IncompleteParseError(text, node.end, self) 

147 return node 

148 

149 def match(self, text, pos=0): 

150 """Return the parse tree matching this expression at the given 

151 position, not necessarily extending all the way to the end of ``text``. 

152 

153 Raise ``ParseError`` if there is no match there. 

154 

155 :arg pos: The index at which to start matching 

156 

157 """ 

158 error = ParseError(text) 

159 node = self.match_core(text, pos, defaultdict(dict), error) 

160 if node is None: 

161 raise error 

162 return node 

163 

164 def match_core(self, text, pos, cache, error): 

165 """Internal guts of ``match()`` 

166 

167 This is appropriate to call only from custom rules or Expression 

168 subclasses. 

169 

170 :arg cache: The packrat cache:: 

171 

172 {(oid, pos): Node tree matched by object `oid` at index `pos` ...} 

173 

174 :arg error: A ParseError instance with ``text`` already filled in but 

175 otherwise blank. We update the error reporting info on this object 

176 as we go. (Sticking references on an existing instance is faster 

177 than allocating a new one for each expression that fails.) We 

178 return None rather than raising and catching ParseErrors because 

179 catching is slow. 

180 

181 """ 

182 # TODO: Optimize. Probably a hot spot. 

183 # 

184 # Is there a faster way of looking up cached stuff? 

185 # 

186 # If this is slow, think about the array module. It might (or might 

187 # not!) use more RAM, but it'll likely be faster than hashing things 

188 # all the time. Also, can we move all the allocs up front? 

189 # 

190 # To save space, we have lots of choices: (0) Quit caching whole Node 

191 # objects. Cache just what you need to reconstitute them. (1) Cache 

192 # only the results of entire rules, not subexpressions (probably a 

193 # horrible idea for rules that need to backtrack internally a lot). (2) 

194 # Age stuff out of the cache somehow. LRU? (3) Cuts. 

195 expr_cache = cache[id(self)] 

196 if pos in expr_cache: 

197 node = expr_cache[pos] 

198 else: 

199 # TODO: Set default value to prevent infinite recursion in left-recursive rules. 

200 expr_cache[pos] = IN_PROGRESS # Mark as in progress 

201 node = expr_cache[pos] = self._uncached_match(text, pos, cache, error) 

202 if node is IN_PROGRESS: 

203 raise LeftRecursionError(text, pos=-1, expr=self) 

204 

205 # Record progress for error reporting: 

206 if node is None and pos >= error.pos and ( 

207 self.name or getattr(error.expr, 'name', None) is None): 

208 # Don't bother reporting on unnamed expressions (unless that's all 

209 # we've seen so far), as they're hard to track down for a human. 

210 # Perhaps we could include the unnamed subexpressions later as 

211 # auxiliary info. 

212 error.expr = self 

213 error.pos = pos 

214 

215 return node 

216 

217 def __str__(self): 

218 return '<%s %s>' % ( 

219 self.__class__.__name__, 

220 self.as_rule()) 

221 

222 def as_rule(self): 

223 """Return the left- and right-hand sides of a rule that represents me. 

224 

225 Return unicode. If I have no ``name``, omit the left-hand side. 

226 

227 """ 

228 rhs = self._as_rhs().strip() 

229 if rhs.startswith('(') and rhs.endswith(')'): 

230 rhs = rhs[1:-1] 

231 

232 return ('%s = %s' % (self.name, rhs)) if self.name else rhs 

233 

234 def _unicode_members(self): 

235 """Return an iterable of my unicode-represented children, stopping 

236 descent when we hit a named node so the returned value resembles the 

237 input rule.""" 

238 return [(m.name or m._as_rhs()) for m in self.members] 

239 

240 def _as_rhs(self): 

241 """Return the right-hand side of a rule that represents me. 

242 

243 Implemented by subclasses. 

244 

245 """ 

246 raise NotImplementedError 

247 

248 

249class Literal(Expression): 

250 """A string literal 

251 

252 Use these if you can; they're the fastest. 

253 

254 """ 

255 __slots__ = ['literal'] 

256 

257 def __init__(self, literal, name=''): 

258 super().__init__(name) 

259 self.literal = literal 

260 self.identity_tuple = (name, literal) 

261 

262 def _uncached_match(self, text, pos, cache, error): 

263 if text.startswith(self.literal, pos): 

264 return Node(self, text, pos, pos + len(self.literal)) 

265 

266 def _as_rhs(self): 

267 return repr(self.literal) 

268 

269 

270class TokenMatcher(Literal): 

271 """An expression matching a single token of a given type 

272 

273 This is for use only with TokenGrammars. 

274 

275 """ 

276 def _uncached_match(self, token_list, pos, cache, error): 

277 if token_list[pos].type == self.literal: 

278 return Node(self, token_list, pos, pos + 1) 

279 

280 

281class Regex(Expression): 

282 """An expression that matches what a regex does. 

283 

284 Use these as much as you can and jam as much into each one as you can; 

285 they're fast. 

286 

287 """ 

288 __slots__ = ['re'] 

289 

290 def __init__(self, pattern, name='', ignore_case=False, locale=False, 

291 multiline=False, dot_all=False, unicode=False, verbose=False, ascii=False): 

292 super().__init__(name) 

293 self.re = re.compile(pattern, (ignore_case and re.I) | 

294 (locale and re.L) | 

295 (multiline and re.M) | 

296 (dot_all and re.S) | 

297 (unicode and re.U) | 

298 (verbose and re.X) | 

299 (ascii and re.A)) 

300 self.identity_tuple = (self.name, self.re) 

301 

302 def _uncached_match(self, text, pos, cache, error): 

303 """Return length of match, ``None`` if no match.""" 

304 m = self.re.match(text, pos) 

305 if m is not None: 

306 span = m.span() 

307 node = RegexNode(self, text, pos, pos + span[1] - span[0]) 

308 node.match = m # TODO: A terrible idea for cache size? 

309 return node 

310 

311 def _regex_flags_from_bits(self, bits): 

312 """Return the textual equivalent of numerically encoded regex flags.""" 

313 flags = 'ilmsuxa' 

314 return ''.join(flags[i - 1] if (1 << i) & bits else '' for i in range(1, len(flags) + 1)) 

315 

316 def _as_rhs(self): 

317 return '~{!r}{}'.format(self.re.pattern, 

318 self._regex_flags_from_bits(self.re.flags)) 

319 

320 

321class Compound(Expression): 

322 """An abstract expression which contains other expressions""" 

323 

324 __slots__ = ['members'] 

325 

326 def __init__(self, *members, **kwargs): 

327 """``members`` is a sequence of expressions.""" 

328 super().__init__(kwargs.get('name', '')) 

329 self.members = members 

330 

331 def resolve_refs(self, rule_map): 

332 self.members = tuple(m.resolve_refs(rule_map) for m in self.members) 

333 return self 

334 

335 def _eq_check_cycles(self, other, checked): 

336 return ( 

337 super()._eq_check_cycles(other, checked) and 

338 len(self.members) == len(other.members) and 

339 all(m._eq_check_cycles(mo, checked) for m, mo in zip(self.members, other.members) if (id(m), id(mo)) not in checked) 

340 ) 

341 

342 def __hash__(self): 

343 # Note we leave members out of the hash computation, since compounds can get added to 

344 # sets, then have their members mutated. See RuleVisitor._resolve_refs. 

345 # Equality should still work, but we want the rules to go into the correct hash bucket. 

346 return hash((self.__class__, self.name)) 

347 

348 

349class Sequence(Compound): 

350 """A series of expressions that must match contiguous, ordered pieces of 

351 the text 

352 

353 In other words, it's a concatenation operator: each piece has to match, one 

354 after another. 

355 

356 """ 

357 def _uncached_match(self, text, pos, cache, error): 

358 new_pos = pos 

359 children = [] 

360 for m in self.members: 

361 node = m.match_core(text, new_pos, cache, error) 

362 if node is None: 

363 return None 

364 children.append(node) 

365 length = node.end - node.start 

366 new_pos += length 

367 # Hooray! We got through all the members! 

368 return Node(self, text, pos, new_pos, children) 

369 

370 def _as_rhs(self): 

371 return '({0})'.format(' '.join(self._unicode_members())) 

372 

373 

374class OneOf(Compound): 

375 """A series of expressions, one of which must match 

376 

377 Expressions are tested in order from first to last. The first to succeed 

378 wins. 

379 

380 """ 

381 def _uncached_match(self, text, pos, cache, error): 

382 for m in self.members: 

383 node = m.match_core(text, pos, cache, error) 

384 if node is not None: 

385 # Wrap the succeeding child in a node representing the OneOf: 

386 return Node(self, text, pos, node.end, children=[node]) 

387 

388 def _as_rhs(self): 

389 return '({0})'.format(' / '.join(self._unicode_members())) 

390 

391 

392class Lookahead(Compound): 

393 """An expression which consumes nothing, even if its contained expression 

394 succeeds""" 

395 

396 __slots__ = ['negativity'] 

397 

398 def __init__(self, member, *, negative=False, **kwargs): 

399 super().__init__(member, **kwargs) 

400 self.negativity = bool(negative) 

401 

402 def _uncached_match(self, text, pos, cache, error): 

403 node = self.members[0].match_core(text, pos, cache, error) 

404 if (node is None) == self.negativity: # negative lookahead == match only if not found 

405 return Node(self, text, pos, pos) 

406 

407 def _as_rhs(self): 

408 return '%s%s' % ('!' if self.negativity else '&', self._unicode_members()[0]) 

409 

410 def _eq_check_cycles(self, other, checked): 

411 return ( 

412 super()._eq_check_cycles(other, checked) and 

413 self.negativity == other.negativity 

414 ) 

415 

416def Not(term): 

417 return Lookahead(term, negative=True) 

418 

419# Quantifiers. None of these is strictly necessary, but they're darn handy. 

420 

421class Quantifier(Compound): 

422 """An expression wrapper like the */+/?/{n,m} quantifier in regexes.""" 

423 

424 __slots__ = ['min', 'max'] 

425 

426 def __init__(self, member, *, min=0, max=float('inf'), name='', **kwargs): 

427 super().__init__(member, name=name, **kwargs) 

428 self.min = min 

429 self.max = max 

430 

431 def _uncached_match(self, text, pos, cache, error): 

432 new_pos = pos 

433 children = [] 

434 size = len(text) 

435 while new_pos < size and len(children) < self.max: 

436 node = self.members[0].match_core(text, new_pos, cache, error) 

437 if node is None: 

438 break # no more matches 

439 children.append(node) 

440 length = node.end - node.start 

441 if len(children) >= self.min and length == 0: # Don't loop infinitely 

442 break 

443 new_pos += length 

444 if len(children) >= self.min: 

445 return Node(self, text, pos, new_pos, children) 

446 

447 def _as_rhs(self): 

448 if self.min == 0 and self.max == 1: 

449 qualifier = '?' 

450 elif self.min == 0 and self.max == float('inf'): 

451 qualifier = '*' 

452 elif self.min == 1 and self.max == float('inf'): 

453 qualifier = '+' 

454 elif self.max == float('inf'): 

455 qualifier = '{%d,}' % self.min 

456 elif self.min == 0: 

457 qualifier = '{,%d}' % self.max 

458 else: 

459 qualifier = '{%d,%d}' % (self.min, self.max) 

460 return '%s%s' % (self._unicode_members()[0], qualifier) 

461 

462 def _eq_check_cycles(self, other, checked): 

463 return ( 

464 super()._eq_check_cycles(other, checked) and 

465 self.min == other.min and 

466 self.max == other.max 

467 ) 

468 

469def ZeroOrMore(member, name=''): 

470 return Quantifier(member, name=name, min=0, max=float('inf')) 

471 

472def OneOrMore(member, name='', min=1): 

473 return Quantifier(member, name=name, min=min, max=float('inf')) 

474 

475def Optional(member, name=''): 

476 return Quantifier(member, name=name, min=0, max=1)