1"""A convenience which constructs expression trees from an easy-to-read syntax
2
3Use this unless you have a compelling reason not to; it performs some
4optimizations that would be tedious to do when constructing an expression tree
5by hand.
6
7"""
8from collections import OrderedDict
9from textwrap import dedent
10
11from parsimonious.exceptions import BadGrammar, UndefinedLabel
12from parsimonious.expressions import (Literal, Regex, Sequence, OneOf,
13 Lookahead, Quantifier, Optional, ZeroOrMore, OneOrMore, Not, TokenMatcher,
14 expression, is_callable)
15from parsimonious.nodes import NodeVisitor
16from parsimonious.utils import evaluate_string
17
class Grammar(OrderedDict):
    """A collection of rules that describe a language

    You can start parsing from the default rule by calling ``parse()``
    directly on the ``Grammar`` object::

        g = Grammar('''
            polite_greeting = greeting ", my good " title
            greeting = "Hi" / "Hello"
            title = "madam" / "sir"
            ''')
        g.parse('Hello, my good sir')

    Or start parsing from any of the other rules; you can pull them out of the
    grammar as if it were a dictionary::

        g['title'].parse('sir')

    You could also just construct a bunch of ``Expression`` objects yourself
    and stitch them together into a language, but using a ``Grammar`` has some
    important advantages:

    * Languages are much easier to define in the nice syntax it provides.
    * Circular references aren't a pain.
    * It does all kinds of whizzy space- and time-saving optimizations, like
      factoring up repeated subexpressions into a single object, which should
      increase cache hit ratio. [Is this implemented yet?]

    """
    def __init__(self, rules='', **more_rules):
        """Construct a grammar.

        :arg rules: A string of production rules, one per line. The first rule
            becomes the :term:`default rule`, the one invoked when you call
            :meth:`parse()` or :meth:`match()` on the grammar. If there are no
            string-based rules, the default rule falls back to None.
        :arg more_rules: Additional kwargs whose names are rule names and
            values are Expressions or custom-coded callables which accomplish
            things the built-in rule syntax cannot. These take precedence over
            ``rules`` in case of naming conflicts.
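
        For example, a custom rule can be supplied as a keyword argument (an
        illustrative sketch; the rule names are made up)::

            from parsimonious.expressions import Literal
            g = Grammar('greeting = hi', hi=Literal('hi'))
            g.parse('hi')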

        """

        decorated_custom_rules = {
            k: (expression(v, k, self) if is_callable(v) else v)
            for k, v in more_rules.items()}

        exprs, first = self._expressions_from_rules(rules, decorated_custom_rules)
        super().__init__(exprs.items())
        self.default_rule = first  # may be None

    def default(self, rule_name):
        """Return a new Grammar whose :term:`default rule` is ``rule_name``.
        new = self._copy()
        new.default_rule = new[rule_name]
        return new

    def _copy(self):
        """Return a shallow copy of myself.

        Deep is unnecessary, since Expression trees are immutable. Subgrammars
        recreate all the Expressions from scratch, and AbstractGrammars have
        no Expressions.

        """
        new = Grammar.__new__(Grammar)
        super(Grammar, new).__init__(self.items())
        new.default_rule = self.default_rule
        return new

    def _expressions_from_rules(self, rules, custom_rules):
        """Return a 2-tuple: a dict of rule names pointing to their
        expressions, and then the first rule.

        It's a web of expressions, all referencing each other. Typically,
        there's a single root to the web of references, and that root is the
        starting symbol for parsing, but there's nothing saying you can't have
        multiple roots.

        :arg custom_rules: A map of rule names to custom-coded rules:
            Expressions

        """
        tree = rule_grammar.parse(rules)
        return RuleVisitor(custom_rules).visit(tree)

    def parse(self, text, pos=0):
        """Parse some text with the :term:`default rule`.

        :arg pos: The index at which to start parsing

        """
        self._check_default_rule()
        return self.default_rule.parse(text, pos=pos)

    def match(self, text, pos=0):
        """Parse some text with the :term:`default rule` but not necessarily
        all the way to the end.

        :arg pos: The index at which to start parsing
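
        For example, with the grammar from the class docstring,
        ``g.match('Hello, my good sir!!!')`` succeeds, leaving the trailing
        ``'!!!'`` unconsumed, where :meth:`parse` would raise an error.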

        """
        self._check_default_rule()
        return self.default_rule.match(text, pos=pos)

    def _check_default_rule(self):
        """Raise RuntimeError if there is no default rule defined."""
        if not self.default_rule:
            raise RuntimeError("Can't call parse() on a Grammar that has no "
                               "default rule. Choose a specific rule instead, "
                               "like some_grammar['some_rule'].parse(...).")

    def __str__(self):
        """Return a rule string that, when passed to the constructor, would
        reconstitute the grammar."""
        exprs = [self.default_rule] if self.default_rule else []
        exprs.extend(expr for expr in self.values() if
                     expr is not self.default_rule)
        return '\n'.join(expr.as_rule() for expr in exprs)

    def __repr__(self):
        """Return an expression that will reconstitute the grammar."""
        return "Grammar({!r})".format(str(self))


class TokenGrammar(Grammar):
    """A Grammar which takes a list of pre-lexed tokens instead of text

    This is useful if you want to do the lexing yourself, as a separate pass:
    for example, to implement indentation-based languages.
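
    For example (an illustrative sketch, using the ``Token`` class from
    ``parsimonious.utils``; string literals in the rules match tokens by
    their ``type`` attributes)::

        from parsimonious.utils import Token
        grammar = TokenGrammar('''
            bold_text = "BOLD_OPEN" text "BOLD_CLOSE"
            text = "TEXT"
            ''')
        grammar.parse([Token('BOLD_OPEN'), Token('TEXT'), Token('BOLD_CLOSE')])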

    """
    def _expressions_from_rules(self, rules, custom_rules):
        tree = rule_grammar.parse(rules)
        return TokenRuleVisitor(custom_rules).visit(tree)


class BootstrappingGrammar(Grammar):
    """The grammar used to recognize the textual rules that describe other
    grammars

    This grammar gets its start from some hard-coded Expressions and claws its
    way from there to an expression tree that describes how to parse the
    grammar description syntax.

    """
    def _expressions_from_rules(self, rule_syntax, custom_rules):
        """Return the rules for parsing the grammar definition syntax.

        Return a 2-tuple: a dict of rule names pointing to their expressions,
        and then the top-level expression for the first rule.

        """
        # Hard-code enough of the rules to parse the grammar that describes the
        # grammar description language, to bootstrap:
        comment = Regex(r'#[^\r\n]*', name='comment')
        meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness')
        _ = ZeroOrMore(meaninglessness, name='_')
        equals = Sequence(Literal('='), _, name='equals')
        label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label')
        reference = Sequence(label, Not(equals), name='reference')
        quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier')
        # This pattern supports empty literals. TODO: A problem?
        spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"',
                                  ignore_case=True,
                                  dot_all=True,
                                  name='spaceless_literal')
        literal = Sequence(spaceless_literal, _, name='literal')
        regex = Sequence(Literal('~'),
                         literal,
                         Regex('[ilmsuxa]*', ignore_case=True),
                         _,
                         name='regex')
        atom = OneOf(reference, literal, regex, name='atom')
        quantified = Sequence(atom, quantifier, name='quantified')

        term = OneOf(quantified, atom, name='term')
        not_term = Sequence(Literal('!'), term, _, name='not_term')
        term.members = (not_term,) + term.members

        sequence = Sequence(term, OneOrMore(term), name='sequence')
        or_term = Sequence(Literal('/'), _, OneOrMore(term), name='or_term')
        ored = Sequence(OneOrMore(term), OneOrMore(or_term), name='ored')
        expression = OneOf(ored, sequence, term, name='expression')
        rule = Sequence(label, equals, expression, name='rule')
        rules = Sequence(_, OneOrMore(rule), name='rules')

        # Use those hard-coded rules to parse the (more extensive) rule syntax.
        # (For example, unless I start using parentheses in the rule language
        # definition itself, I should never have to hard-code expressions for
        # those above.)

        rule_tree = rules.parse(rule_syntax)

        # Turn the parse tree into a map of expressions:
        return RuleVisitor().visit(rule_tree)


# The grammar for parsing PEG grammar definitions:
# This is a nice, simple grammar. We may someday add to it, but it's a safe bet
# that the future will always be a superset of this.
rule_syntax = (r'''
    # Ignored things (represented by _) are typically hung off the end of the
    # leafmost kinds of nodes. Literals like "/" count as leaves.

    rules = _ rule*
    rule = label equals expression
    equals = "=" _
    literal = spaceless_literal _

    # So you can't spell a regex like `~"..." ilm`:
    spaceless_literal = ~"u?r?b?\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is /
                        ~"u?r?b?'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'"is

    expression = ored / sequence / term
    or_term = "/" _ term+
    ored = term+ or_term+
    sequence = term term+
    not_term = "!" term _
    lookahead_term = "&" term _
    term = not_term / lookahead_term / quantified / atom
    quantified = atom quantifier
    atom = reference / literal / regex / parenthesized
    regex = "~" spaceless_literal ~"[ilmsuxa]*"i _
    parenthesized = "(" _ expression ")" _
    quantifier = ~r"[*+?]|\{\d*,\d+\}|\{\d+,\d*\}|\{\d+\}" _
    reference = label !equals

    # A subsequent equal sign is the only thing that distinguishes a label
    # (which begins a new rule) from a reference (which is just a pointer to a
    # rule defined somewhere else):
    label = ~"[a-zA-Z_][a-zA-Z_0-9]*(?![\"'])" _

    # _ = ~r"\s*(?:#[^\r\n]*)?\s*"
    _ = meaninglessness*
    meaninglessness = ~r"\s+" / comment
    comment = ~r"#[^\r\n]*"
    ''')


class LazyReference(str):
    """A lazy reference to a rule, which we resolve after grokking all the
    rules"""

    name = ''

    def resolve_refs(self, rule_map):
        """
        Traverse the rule map following top-level lazy references,
        until we reach a cycle (raise an error) or a concrete expression.

        For example, the following is a circular reference:
            foo = bar
            baz = foo2
            foo2 = foo

        Note that every RHS of a grammar rule _must_ be either a
        LazyReference or a concrete expression, so the reference chain will
        eventually either terminate or find a cycle.
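
        In the non-circular case (illustrative), given ``foo = bar`` and
        ``bar = "x"``, resolving ``foo`` follows the chain and returns the
        concrete expression for ``"x"``.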
278 """
279 seen = set()
280 cur = self
281 while True:
282 if cur in seen:
283 raise BadGrammar(f"Circular Reference resolving {self.name}={self}.")
284 else:
285 seen.add(cur)
286 try:
287 cur = rule_map[str(cur)]
288 except KeyError:
289 raise UndefinedLabel(cur)
290 if not isinstance(cur, LazyReference):
291 return cur
292
293 # Just for debugging:
294 def _as_rhs(self):
295 return '<LazyReference to %s>' % self
296

class RuleVisitor(NodeVisitor):
    """Turns a parse tree of a grammar definition into a map of ``Expression``
    objects

    This is the magic piece that breathes life into a parsed bunch of parse
    rules, allowing them to go forth and parse other things.

    """
    quantifier_classes = {'?': Optional, '*': ZeroOrMore, '+': OneOrMore}

    visit_expression = visit_term = visit_atom = NodeVisitor.lift_child

    def __init__(self, custom_rules=None):
        """Construct.

        :arg custom_rules: A dict of {rule name: expression} holding custom
            rules which will take precedence over the others

        """
        self.custom_rules = custom_rules or {}
        self._last_literal_node_and_type = None

    def visit_parenthesized(self, node, parenthesized):
        """Treat a parenthesized subexpression as just its contents.

        Its position in the tree suffices to maintain its grouping semantics.

        """
        left_paren, _, expression, right_paren, _ = parenthesized
        return expression

    def visit_quantifier(self, node, quantifier):
        """Turn a quantifier into just its symbol-matching node."""
        symbol, _ = quantifier
        return symbol

    def visit_quantified(self, node, quantified):
        atom, quantifier = quantified
        try:
            return self.quantifier_classes[quantifier.text](atom)
        except KeyError:
            # This should pass: assert re.fullmatch(r"\{(\d*)(,(\d*))?\}", quantifier.text)
            quantifier = quantifier.text[1:-1].split(",")
            if len(quantifier) == 1:
                min_match = max_match = int(quantifier[0])
            else:
                min_match = int(quantifier[0]) if quantifier[0] else 0
                max_match = int(quantifier[1]) if quantifier[1] else float('inf')
            return Quantifier(atom, min=min_match, max=max_match)
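    # For example (illustrative): a parsed "{2,4}" yields
    # Quantifier(atom, min=2, max=4), "{3}" yields min=max=3, and "{2,}"
    # yields min=2 with an unbounded max.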

    def visit_lookahead_term(self, node, lookahead_term):
        ampersand, term, _ = lookahead_term
        return Lookahead(term)

    def visit_not_term(self, node, not_term):
        exclamation, term, _ = not_term
        return Not(term)

    def visit_rule(self, node, rule):
        """Assign a name to the Expression and return it."""
        label, equals, expression = rule
        expression.name = label  # Assign a name to the expr.
        return expression

    def visit_sequence(self, node, sequence):
        """A parsed Sequence looks like [term node, OneOrMore node of
        ``another_term``s]. Flatten it out."""
        term, other_terms = sequence
        return Sequence(term, *other_terms)

    def visit_ored(self, node, ored):
        first_term, other_terms = ored
        if len(first_term) == 1:
            first_term = first_term[0]
        else:
            first_term = Sequence(*first_term)
        return OneOf(first_term, *other_terms)

    def visit_or_term(self, node, or_term):
        """Return just the term from an ``or_term``.

        We already know it's going to be ored, from the containing ``ored``.

        """
        slash, _, terms = or_term
        if len(terms) == 1:
            return terms[0]
        else:
            return Sequence(*terms)

    def visit_label(self, node, label):
        """Turn a label into a unicode string."""
        name, _ = label
        return name.text

    def visit_reference(self, node, reference):
        """Stick a :class:`LazyReference` in the tree as a placeholder.

        We resolve them all later.

        """
        label, not_equals = reference
        return LazyReference(label)

    def visit_regex(self, node, regex):
        """Return a ``Regex`` expression."""
        tilde, literal, flags, _ = regex
        flags = flags.text.upper()
        pattern = literal.literal  # Pull the string back out of the Literal
                                   # object.
        return Regex(pattern, ignore_case='I' in flags,
                              locale='L' in flags,
                              multiline='M' in flags,
                              dot_all='S' in flags,
                              unicode='U' in flags,
                              verbose='X' in flags,
                              ascii='A' in flags)

    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``Literal`` that recognizes it."""
        literal_value = evaluate_string(spaceless_literal.text)
        if self._last_literal_node_and_type:
            last_node, last_type = self._last_literal_node_and_type
            if last_type != type(literal_value):
                raise BadGrammar(dedent(f"""\
                    Found {last_node.text} ({last_type}) and {spaceless_literal.text} ({type(literal_value)}) string literals.
                    All strings in a single grammar must be of the same type.
                    """)
                )

        self._last_literal_node_and_type = spaceless_literal, type(literal_value)

        return Literal(literal_value)

    def visit_literal(self, node, literal):
        """Pick just the literal out of a literal-and-junk combo."""
        spaceless_literal, _ = literal
        return spaceless_literal

    def generic_visit(self, node, visited_children):
        """Replace childbearing nodes with a list of their children; keep
        others untouched.

        For our case, if a node has children, only the children are important.
        Otherwise, keep the node around for (for example) the flags of the
        regex rule. Most of these kept-around nodes are subsequently thrown
        away by the other visitor methods.

        We can't simply hang the visited children off the original node; that
        would be disastrous if the node occurred in more than one place in the
        tree.

        """
        return visited_children or node  # should semantically be a tuple

    def visit_rules(self, node, rules_list):
        """Collate all the rules into a map. Return (map, default rule).

        The default rule is the first one. Or, if you have more than one rule
        of that name, it's the last-occurring rule of that name. (This lets you
        override the default rule when you extend a grammar.) If there are no
        string-based rules, the default rule is None, because the custom rules,
        due to being kwarg-based, are unordered.
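
        For example (illustrative), concatenating ``'title = "lord"'`` onto
        the rules of the grammar from the class docstring overrides ``title``
        while keeping the other rules, since the last declaration wins.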

        """
        _, rules = rules_list

        # Map each rule's name to its Expression. Later rules of the same name
        # override earlier ones. This lets us define rules multiple times and
        # have the last declaration win, so you can extend grammars by
        # concatenation.
        rule_map = OrderedDict((expr.name, expr) for expr in rules)

        # And custom rules override string-based rules. This is the least
        # surprising choice when you compare the dict constructor:
        # dict({'x': 5}, x=6).
        rule_map.update(self.custom_rules)

        # Resolve references. This tolerates forward references.
        for name, rule in list(rule_map.items()):
            if hasattr(rule, 'resolve_refs'):
                # Some custom rules may not define a resolve_refs method,
                # though anything that inherits from Expression will have it.
                rule_map[name] = rule.resolve_refs(rule_map)

        # isinstance() is a temporary hack around the fact that * rules don't
        # always get transformed into lists by NodeVisitor. We should fix that;
        # it's surprising and requires writing lame branches like this.
        return rule_map, (rule_map[rules[0].name]
                          if isinstance(rules, list) and rules else None)


class TokenRuleVisitor(RuleVisitor):
    """A visitor which builds expression trees meant to work on sequences of
    pre-lexed tokens rather than strings"""

    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``TokenMatcher`` that matches
        ``Token`` objects by their ``type`` attributes."""
        return TokenMatcher(evaluate_string(spaceless_literal.text))

    def visit_regex(self, node, regex):
        tilde, literal, flags, _ = regex
        raise BadGrammar('Regexes do not make sense in TokenGrammars, since '
                         'TokenGrammars operate on pre-lexed tokens rather '
                         'than characters.')


# Bootstrap to level 1...
rule_grammar = BootstrappingGrammar(rule_syntax)
# ...and then to level 2. This establishes that the node tree of our rule
# syntax is built by the same machinery that will build trees of our users'
# grammars. And the correctness of that tree is tested, indirectly, in
# test_grammar.
rule_grammar = Grammar(rule_syntax)
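
# For example (illustrative), rule_grammar.parse('greeting = "hi"') returns
# the parse tree that a RuleVisitor would collate into a one-rule grammar.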


# TODO: Teach Expression trees how to spit out Python representations of
# themselves. Then we can just paste that in above, and we won't have to
# bootstrap on import. Though it'll be a little less DRY. [Ah, but this is not
# so clean, because it would have to output multiple statements to get multiple
# refs to a single expression hooked up.]