1"""Parses and compiles Lark grammars into an internal representation.
2"""
3
4import hashlib
5import os.path
6import sys
7from collections import namedtuple
8from copy import copy, deepcopy
9import pkgutil
10from ast import literal_eval
11from contextlib import suppress
12from typing import List, Tuple, Union, Callable, Dict, Optional, Sequence, Generator
13
14from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet, Serialize
15from .lexer import Token, TerminalDef, PatternStr, PatternRE, Pattern
16
17from .parse_tree_builder import ParseTreeBuilder
18from .parser_frontends import ParsingFrontend
19from .common import LexerConf, ParserConf
20from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, TOKEN_DEFAULT_PRIORITY
21from .utils import classify, dedup_list
22from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError, UnexpectedInput
23
24from .tree import Tree, SlottedTree as ST
25from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive
# Transformer/visitor decorator: pass tree children as individual arguments
inline_args = v_args(inline=True)

# Package-relative directories searched for %import-ed grammars
IMPORT_PATHS = ['grammars']

# Default file extension for grammar files
EXT = '.lark'

# Regexp flags accepted after a /regexp/ literal
_RE_FLAGS = 'imslux'

# Sentinel symbol marking the position of a removed optional ([x]) child
_EMPTY = Symbol('__empty__')
35
# Maps punctuation/whitespace literals to indicative terminal names, used when
# auto-naming anonymous terminals (see PrepareAnonTerminals.pattern)
_TERMINAL_NAMES = {
    '.' : 'DOT',
    ',' : 'COMMA',
    ':' : 'COLON',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '\\' : 'BACKSLASH',
    '|' : 'VBAR',
    '?' : 'QMARK',
    '!' : 'BANG',
    '@' : 'AT',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '%' : 'PERCENT',
    '^' : 'CIRCUMFLEX',
    '&' : 'AMPERSAND',
    '_' : 'UNDERSCORE',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '`' : 'BACKQUOTE',
    '~' : 'TILDE',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
    '\n' : 'NEWLINE',
    '\r\n' : 'CRLF',
    '\t' : 'TAB',
    ' ' : 'SPACE',
}
74
# Grammar Parser
# Terminal definitions (name -> regexp) for the bootstrap parser that reads
# .lark grammar files themselves (see _get_parser below)
TERMINALS = {
    '_LPAR': r'\(',
    '_RPAR': r'\)',
    '_LBRA': r'\[',
    '_RBRA': r'\]',
    '_LBRACE': r'\{',
    '_RBRACE': r'\}',
    'OP': '[+*]|[?](?![a-z_])',
    '_COLON': ':',
    '_COMMA': ',',
    '_OR': r'\|',
    '_DOT': r'\.(?!\.)',
    '_DOTDOT': r'\.\.',
    'TILDE': '~',
    'RULE_MODIFIERS': '(!|![?]?|[?]!?)(?=[_a-z])',
    'RULE': '_?[a-z][_a-z0-9]*',
    'TERMINAL': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
    'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS,
    '_NL': r'(\r?\n)+\s*',
    '_NL_OR': r'(\r?\n)+\s*\|',
    'WS': r'[ \t]+',
    'COMMENT': r'\s*//[^\n]*|\s*#[^\n]*',
    'BACKSLASH': r'\\[ ]*\n',
    '_TO': '->',
    '_IGNORE': r'%ignore',
    '_OVERRIDE': r'%override',
    '_DECLARE': r'%declare',
    '_EXTEND': r'%extend',
    '_IMPORT': r'%import',
    'NUMBER': r'[+-]?\d+',
}
108
# Rule definitions (name -> list of space-separated expansions) for the
# bootstrap grammar-of-grammars. A '?' prefix on a name means expand1:
# single-child nodes are inlined into their parent.
RULES = {
    'start': ['_list'],
    '_list':  ['_item', '_list _item'],
    '_item':  ['rule', 'term', 'ignore', 'import', 'declare', 'override', 'extend', '_NL'],

    'rule': ['rule_modifiers RULE template_params priority _COLON expansions _NL'],
    'rule_modifiers': ['RULE_MODIFIERS',
                       ''],
    'priority': ['_DOT NUMBER',
                 ''],
    'template_params': ['_LBRACE _template_params _RBRACE',
                        ''],
    '_template_params': ['RULE',
                         '_template_params _COMMA RULE'],
    'expansions': ['_expansions'],
    '_expansions': ['alias',
                    '_expansions _OR alias',
                    '_expansions _NL_OR alias'],

    '?alias':     ['expansion _TO nonterminal', 'expansion'],
    'expansion': ['_expansion'],

    '_expansion': ['', '_expansion expr'],

    '?expr': ['atom',
              'atom OP',
              'atom TILDE NUMBER',
              'atom TILDE NUMBER _DOTDOT NUMBER',
              ],

    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'value'],

    'value': ['terminal',
              'nonterminal',
              'literal',
              'range',
              'template_usage'],

    'terminal': ['TERMINAL'],
    'nonterminal': ['RULE'],

    '?name': ['RULE', 'TERMINAL'],
    '?symbol': ['terminal', 'nonterminal'],

    'maybe': ['_LBRA expansions _RBRA'],
    'range': ['STRING _DOTDOT STRING'],

    'template_usage': ['nonterminal _LBRACE _template_args _RBRACE'],
    '_template_args': ['value',
                       '_template_args _COMMA value'],

    'term': ['TERMINAL _COLON expansions _NL',
             'TERMINAL _DOT NUMBER _COLON expansions _NL'],
    'override': ['_OVERRIDE rule',
                 '_OVERRIDE term'],
    'extend': ['_EXTEND rule',
               '_EXTEND term'],
    'ignore': ['_IGNORE expansions _NL'],
    'declare': ['_DECLARE _declare_args _NL'],
    'import': ['_IMPORT _import_path _NL',
               '_IMPORT _import_path _LPAR name_list _RPAR _NL',
               '_IMPORT _import_path _TO name _NL'],

    '_import_path': ['import_lib', 'import_rel'],
    'import_lib': ['_import_args'],
    'import_rel': ['_DOT _import_args'],
    '_import_args': ['name', '_import_args _DOT name'],

    'name_list': ['_name_list'],
    '_name_list': ['name', '_name_list _COMMA name'],

    '_declare_args': ['symbol', '_declare_args symbol'],
    'literal': ['REGEXP', 'STRING'],
}
185
186
# Value 5 keeps the number of states in the lalr parser somewhat minimal.
# It isn't optimal, but close to it. See PR #949
SMALL_FACTOR_THRESHOLD = 5
# Threshold at which repeats via ~ are split up into different rules.
# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low,
# while not being too overaggressive and unnecessarily creating rules that might create shift/reduce conflicts.
# (See PR #949)
REPEAT_BREAK_THRESHOLD = 50
195
196
class FindRuleSize(Transformer):
    """Computes how many children a rule's expansion contributes to the final
    parse tree, counting only symbols that survive tree filtering."""

    def __init__(self, keep_all_tokens: bool):
        self.keep_all_tokens = keep_all_tokens

    def _will_not_get_removed(self, sym: Symbol) -> bool:
        # The empty placeholder never appears in the output tree
        if sym is _EMPTY:
            return False
        if isinstance(sym, NonTerminal):
            # Rules whose name starts with '_' are inlined and disappear
            return not sym.name.startswith('_')
        if isinstance(sym, Terminal):
            # Filtered-out tokens are dropped unless keep_all_tokens is set
            return self.keep_all_tokens or not sym.filter_out
        assert False, sym

    def _args_as_int(self, args: List[Union[int, Symbol]]) -> Generator[int, None, None]:
        for arg in args:
            if isinstance(arg, int):
                # Already a computed size from a nested node
                yield arg
            elif isinstance(arg, Symbol):
                yield int(self._will_not_get_removed(arg))
            else:
                assert False

    def expansion(self, args) -> int:
        # A sequence contributes the sum of its surviving parts
        return sum(self._args_as_int(args))

    def expansions(self, args) -> int:
        # Alternatives contribute at most the largest alternative
        return max(self._args_as_int(args))
224
225
@inline_args
class EBNF_to_BNF(Transformer_InPlace):
    """Converts EBNF operators (?, +, *, ~n..m, and [maybe]) into plain BNF by
    generating auxiliary rules, which are collected in ``new_rules``."""

    def __init__(self):
        self.new_rules = []      # (name, expansions-tree, RuleOptions) tuples generated here
        self.rules_cache = {}    # memoization: source expr / key -> NonTerminal, avoids duplicate rules
        self.prefix = 'anon'     # replaced with the current rule's name by Grammar.compile
        self.i = 0               # counter for unique generated-rule names
        self.rule_options = None

    def _name_rule(self, inner: str):
        # Generated names start with '__' so they are filtered from the parse tree
        new_name = '__%s_%s_%d' % (self.prefix, inner, self.i)
        self.i += 1
        return new_name

    def _add_rule(self, key, name, expansions):
        t = NonTerminal(name)
        self.new_rules.append((name, expansions, self.rule_options))
        self.rules_cache[key] = t
        return t

    def _add_recurse_rule(self, type_: str, expr: Tree):
        # Create (or reuse) a left-recursive rule matching ``expr`` one or more times
        try:
            return self.rules_cache[expr]
        except KeyError:
            new_name = self._name_rule(type_)
            t = NonTerminal(new_name)
            tree = ST('expansions', [
                ST('expansion', [expr]),
                ST('expansion', [t, expr])
            ])
            return self._add_rule(expr, new_name, tree)

    def _add_repeat_rule(self, a, b, target, atom):
        """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times.

        When called recursively (into target), it repeats atom for x(n) times, where:
            x(0) = 1
            x(n) = a(n) * x(n-1) + b

        Example rule when a=3, b=4:

            new_rule: target target target atom atom atom atom
        """
        key = (a, b, target, atom)
        try:
            return self.rules_cache[key]
        except KeyError:
            new_name = self._name_rule('repeat_a%d_b%d' % (a, b))
            tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)])
            return self._add_rule(key, new_name, tree)

    def _add_repeat_opt_rule(self, a, b, target, target_opt, atom):
        """Creates a rule that matches atom 0 to (a*n+b)-1 times.

        When target matches n times atom, and target_opt 0 to n-1 times target_opt,

        First we generate target * i followed by target_opt, for i from 0 to a-1
        These match 0 to n*a - 1 times atom

        Then we generate target * a followed by atom * i, for i from 0 to b-1
        These match n*a to n*a + b-1 times atom

        The created rule will not have any shift/reduce conflicts so that it can be used with lalr

        Example rule when a=3, b=4:

            new_rule: target_opt
                    | target target_opt
                    | target target target_opt

                    | target target target
                    | target target target atom
                    | target target target atom atom
                    | target target target atom atom atom
        """
        key = (a, b, target, atom, "opt")
        try:
            return self.rules_cache[key]
        except KeyError:
            new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b))
            tree = ST('expansions', [
                ST('expansion', [target]*i + [target_opt]) for i in range(a)
            ] + [
                ST('expansion', [target]*a + [atom]*i) for i in range(b)
            ])
            return self._add_rule(key, new_name, tree)

    def _generate_repeats(self, rule: Tree, mn: int, mx: int):
        """Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times.
        """
        # For a small number of repeats, we can take the naive approach
        if mx < REPEAT_BREAK_THRESHOLD:
            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)])

        # For large repeat values, we break the repetition into sub-rules.
        # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``.
        # We then use small_factors to split up mn and diff up into values [(a, b), ...]
        # This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt
        # to generate a complete rule/expression that matches the corresponding number of repeats
        mn_target = rule
        for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD):
            mn_target = self._add_repeat_rule(a, b, mn_target, rule)
        if mx == mn:
            return mn_target

        diff = mx - mn + 1  # We add one because _add_repeat_opt_rule generates rules that match one less
        diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD)
        diff_target = rule  # Match rule 1 times
        diff_opt_target = ST('expansion', [])  # match rule 0 times (e.g. up to 1 -1 times)
        for a, b in diff_factors[:-1]:
            diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
            diff_target = self._add_repeat_rule(a, b, diff_target, rule)

        a, b = diff_factors[-1]
        diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)

        return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])])

    def expr(self, rule: Tree, op: Token, *args):
        if op.value == '?':
            empty = ST('expansion', [])
            return ST('expansions', [rule, empty])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return ST('expansions', [new_name, ST('expansion', [])])
        elif op.value == '~':
            # Bounded repetition: c~n or c~n..m
            if len(args) == 1:
                mn = mx = int(args[0])
            else:
                mn, mx = map(int, args)
                if mx < mn or mn < 0:
                    raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))

            return self._generate_repeats(rule, mn, mx)

        assert False, op

    def maybe(self, rule: Tree):
        # [x] - match x or nothing; pad the empty alternative with _EMPTY
        # placeholders so child positions stay aligned during tree-building
        keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens
        rule_size = FindRuleSize(keep_all_tokens).transform(rule)
        empty = ST('expansion', [_EMPTY] * rule_size)
        return ST('expansions', [rule, empty])
380
381
class SimplifyRule_Visitor(Visitor):
    """In-place simplification of rule trees: flattens nested same-kind nodes
    and distributes inner alternations over their parent expansion."""

    @staticmethod
    def _flatten(tree: Tree):
        # Repeatedly inline children whose data label matches the parent's
        while tree.expand_kids_by_data(tree.data):
            pass

    def expansion(self, tree: Tree):
        # rules_list unpacking
        # a : b (c|d) e
        #  -->
        # a : b c e | b d e
        #
        # In AST terms:
        #   expansion(b, expansions(c, d), e)
        #    -->
        #   expansions( expansion(b, c, e), expansion(b, d, e) )

        self._flatten(tree)

        for i, child in enumerate(tree.children):
            if isinstance(child, Tree) and child.data == 'expansions':
                tree.data = 'expansions'
                tree.children = [self.visit(ST('expansion', [option if i == j else other
                                                             for j, other in enumerate(tree.children)]))
                                 for option in dedup_list(child.children)]
                self._flatten(tree)
                # Only the first inner alternation is handled here; the
                # recursive self.visit above takes care of any remaining ones
                break

    def alias(self, tree):
        # Push an alias on an alternation down onto each alternative
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(ST('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    def expansions(self, tree: Tree):
        self._flatten(tree)
        # Ensure all children are unique
        if len(set(tree.children)) != len(tree.children):
            tree.children = dedup_list(tree.children)   # dedup is expensive, so try to minimize its use
425
426
class RuleTreeToText(Transformer):
    """Flattens a simplified rule tree into a list of (expansion, alias) pairs."""

    def expansions(self, x):
        return x

    def expansion(self, symbols):
        # No alias attached at this level
        return symbols, None

    def alias(self, x):
        (expansion, inner_alias), alias = x
        # An expansion may carry at most one alias
        assert inner_alias is None, (alias, expansion, '-', inner_alias)
        return expansion, alias.name
438
439
class PrepareAnonTerminals(Transformer_InPlace):
    """Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"""

    def __init__(self, terminals):
        self.terminals = terminals                                 # extended in-place as anonymous terminals are found
        self.term_set = {td.name for td in self.terminals}         # all terminal names currently in use
        self.term_reverse = {td.pattern: td for td in terminals}   # pattern -> TerminalDef, for reuse
        self.i = 0                                                 # counter for __ANON_%d names
        self.rule_options = None                                   # set per-rule by Grammar.compile

    @inline_args
    def pattern(self, p):
        value = p.value
        if p in self.term_reverse and p.flags != self.term_reverse[p].pattern.flags:
            raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)

        term_name = None

        if isinstance(p, PatternStr):
            try:
                # If already defined, use the user-defined terminal name
                term_name = self.term_reverse[p].name
            except KeyError:
                # Try to assign an indicative anon-terminal name
                try:
                    term_name = _TERMINAL_NAMES[value]
                except KeyError:
                    if value and is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set:
                        term_name = value.upper()

                if term_name in self.term_set:
                    term_name = None

        elif isinstance(p, PatternRE):
            # Anonymous regexps can only reuse an already-defined terminal's name
            if p in self.term_reverse:
                term_name = self.term_reverse[p].name
        else:
            assert False, p

        if term_name is None:
            # No meaningful name found - fall back to a generated one
            term_name = '__ANON_%d' % self.i
            self.i += 1

        if term_name not in self.term_set:
            # First occurrence of this terminal - register its definition
            assert p not in self.term_reverse
            self.term_set.add(term_name)
            termdef = TerminalDef(term_name, p)
            self.term_reverse[p] = termdef
            self.terminals.append(termdef)

        # Anonymous string literals are filtered out of the tree by default
        filter_out = False if self.rule_options and self.rule_options.keep_all_tokens else isinstance(p, PatternStr)

        return Terminal(term_name, filter_out=filter_out)
493
494
class _ReplaceSymbols(Transformer_InPlace):
    """Helper for ApplyTemplates: substitutes template parameters by name."""

    def __init__(self):
        self.names = {}  # parameter name -> replacement symbol/tree

    def value(self, c):
        if len(c) == 1 and isinstance(c[0], Symbol):
            try:
                return self.names[c[0].name]
            except KeyError:
                pass
        return self.__default__('value', c, None)

    def template_usage(self, c):
        head = c[0].name
        # Substitute the template's head if it is itself a parameter
        children = [self.names[head]] + c[1:] if head in self.names else c
        return self.__default__('template_usage', children, None)
511
512
class ApplyTemplates(Transformer_InPlace):
    """Apply the templates, creating new rules that represent the used templates"""

    def __init__(self, rule_defs):
        self.rule_defs = rule_defs              # shared list; instantiated templates are appended to it
        self.replacer = _ReplaceSymbols()
        self.created_templates = set()          # result names already instantiated, to avoid duplicates

    def template_usage(self, c):
        name = c[0].name
        args = c[1:]
        # Unique rule name per (template, arguments) combination, e.g. "list{NUMBER}"
        result_name = "%s{%s}" % (name, ",".join(a.name for a in args))
        if result_name not in self.created_templates:
            self.created_templates.add(result_name)
            # There must be exactly one template definition with this name
            (_n, params, tree, options) ,= (t for t in self.rule_defs if t[0] == name)
            assert len(params) == len(args), args
            result_tree = deepcopy(tree)
            self.replacer.names = dict(zip(params, args))
            self.replacer.transform(result_tree)
            # Appending while Grammar.compile iterates rule_defs is intentional:
            # the new rule gets compiled in a later iteration
            self.rule_defs.append((result_name, [], result_tree, deepcopy(options)))
        return NonTerminal(result_name)
534
535
536def _rfind(s, choices):
537 return max(s.rfind(c) for c in choices)
538
539
def eval_escaping(s):
    """Decode the escape sequences in a literal's inner text and return the
    resulting string. Raises GrammarError on a trailing lone backslash or
    otherwise malformed escaping."""
    out = ''
    chars = iter(s)
    for ch in chars:
        out += ch
        if ch == '\\':
            try:
                esc = next(chars)
            except StopIteration:
                raise GrammarError("Literal ended unexpectedly (bad escaping): `%r`" % s)
            if esc == '\\':
                out += '\\\\'
            elif esc not in 'Uuxnftr':
                # Not a recognized escape: double the backslash so that
                # literal_eval keeps it as-is
                out += '\\'
            out += esc
    # Unescape quotes that were escaped in the grammar source, and protect
    # single quotes against the u''' wrapper below
    out = out.replace('\\"', '"').replace("'", "\\'")

    to_eval = "u'''%s'''" % out
    try:
        return literal_eval(to_eval)
    except SyntaxError as e:
        raise GrammarError(s, e)
564
565
def _literal_to_pattern(literal):
    """Convert a STRING or REGEXP token into the matching Pattern object.

    Raises GrammarError for newlines inside string literals, newlines inside
    regexps lacking the 'x' flag, and empty terminals.
    """
    assert isinstance(literal, Token)
    text = literal.value
    # Flags trail the closing quote/slash, e.g. "abc"i or /abc/im
    flag_start = _rfind(text, '/"') + 1
    assert flag_start > 0
    flags = text[flag_start:]
    assert all(f in _RE_FLAGS for f in flags), flags

    if literal.type == 'STRING' and '\n' in text:
        raise GrammarError('You cannot put newlines in string literals')

    if literal.type == 'REGEXP' and '\n' in text and 'x' not in flags:
        raise GrammarError('You can only use newlines in regular expressions '
                           'with the `x` (verbose) flag')

    body = text[:flag_start]
    assert body[0] == body[-1] and body[0] in '"/'
    inner = eval_escaping(body[1:-1])

    if inner == "":
        raise GrammarError("Empty terminals are not allowed (%s)" % literal)

    if literal.type == 'STRING':
        return PatternStr(inner.replace('\\\\', '\\'), flags, raw=literal.value)
    if literal.type == 'REGEXP':
        return PatternRE(inner, flags, raw=literal.value)
    assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'
597
598
@inline_args
class PrepareLiterals(Transformer_InPlace):
    """Turns literal and range nodes into 'pattern' nodes holding Pattern objects."""

    def literal(self, literal):
        return ST('pattern', [_literal_to_pattern(literal)])

    def range(self, start, end):
        # Character ranges are only defined between single-character strings
        assert start.type == end.type == 'STRING'
        lo = start.value[1:-1]
        hi = end.value[1:-1]
        assert len(eval_escaping(lo)) == len(eval_escaping(hi)) == 1
        return ST('pattern', [PatternRE('[%s-%s]' % (lo, hi))])
611
612
def _make_joined_pattern(regexp, flags_set) -> PatternRE:
    # NOTE(review): ``flags_set`` is currently ignored - the joined pattern is
    # always built with empty flags. Presumably flags are rendered inline
    # (e.g. (?i:...)) by to_regexp() on the sub-patterns; confirm before
    # relying on per-pattern flags here.
    return PatternRE(regexp, ())
615
class TerminalTreeToPattern(Transformer_NonRecursive):
    """Collapses a terminal's expression tree into a single Pattern object
    (PatternStr or PatternRE) suitable for the lexer."""

    def pattern(self, ps):
        p ,= ps
        return p

    def expansion(self, items: List[Pattern]) -> Pattern:
        # Concatenation of sub-patterns
        if not items:
            return PatternStr('')

        if len(items) == 1:
            return items[0]

        pattern = ''.join(i.to_regexp() for i in items)
        return _make_joined_pattern(pattern, {i.flags for i in items})

    def expansions(self, exps: List[Pattern]) -> Pattern:
        # Alternation of sub-patterns
        if len(exps) == 1:
            return exps[0]

        # Do a bit of sorting to make sure that the longest option is returned
        # (Python's re module otherwise prefers just 'l' when given (l|ll) and both could match)
        exps.sort(key=lambda x: (-x.max_width, -x.min_width, -len(x.value)))

        pattern = '(?:%s)' % ('|'.join(i.to_regexp() for i in exps))
        return _make_joined_pattern(pattern, {i.flags for i in exps})

    def expr(self, args) -> Pattern:
        # Repetition operators: ?, +, * or ~n / ~n..m
        inner: Pattern
        inner, op = args[:2]
        if op == '~':
            if len(args) == 3:
                op = "{%d}" % int(args[2])
            else:
                mn, mx = map(int, args[2:])
                if mx < mn:
                    raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx))
                op = "{%d,%d}" % (mn, mx)
        else:
            assert len(args) == 2
        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)

    def maybe(self, expr):
        # [x] is equivalent to x?
        return self.expr(expr + ['?'])

    def alias(self, t):
        raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")

    def value(self, v):
        return v[0]
665
666
class ValidateSymbols(Transformer_InPlace):
    """Sanity check: every 'value' node must wrap exactly one Tree or Symbol."""

    def value(self, v):
        child ,= v
        assert isinstance(child, (Tree, Symbol))
        return child
672
673
def nr_deepcopy_tree(t):
    """Deepcopy tree `t` without recursion, avoiding Python's recursion limit
    on very deep grammar trees."""
    return Transformer_NonRecursive(False).transform(t)
677
678
class Grammar(Serialize):
    """Internal representation of a parsed grammar: terminal definitions,
    rule definitions, and the list of terminal names to ignore between tokens."""

    # name -> (expression tree or None for %declare, priority)
    term_defs: List[Tuple[str, Tuple[Tree, int]]]
    # (name, template params, expression tree, options)
    rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]]
    ignore: List[str]

    def __init__(self, rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]], term_defs: List[Tuple[str, Tuple[Tree, int]]], ignore: List[str]) -> None:
        self.term_defs = term_defs
        self.rule_defs = rule_defs
        self.ignore = ignore

    __serialize_fields__ = 'term_defs', 'rule_defs', 'ignore'

    def compile(self, start, terminals_to_keep) -> Tuple[List[TerminalDef], List[Rule], List[str]]:
        """Compile the grammar into (terminals, rules, ignored names) reachable
        from the ``start`` symbols. Terminals named in ``terminals_to_keep``
        (or all of them, when it is '*') survive the unused-terminal filter."""
        # We change the trees in-place (to support huge grammars)
        # So deepcopy allows calling compile more than once.
        term_defs = [(n, (nr_deepcopy_tree(t), p)) for n, (t, p) in self.term_defs]
        rule_defs = [(n, p, nr_deepcopy_tree(t), o) for n, p, t, o in self.rule_defs]

        # ===================
        #  Compile Terminals
        # ===================

        # Convert terminal-trees to strings/regexps

        for name, (term_tree, priority) in term_defs:
            if term_tree is None:  # Terminal added through %declare
                continue
            expansions = list(term_tree.find_data('expansion'))
            if len(expansions) == 1 and not expansions[0].children:
                raise GrammarError("Terminals cannot be empty (%s)" % name)

        transformer = PrepareLiterals() * TerminalTreeToPattern()
        terminals = [TerminalDef(name, transformer.transform(term_tree), priority)
                     for name, (term_tree, priority) in term_defs if term_tree]

        # =================
        #  Compile Rules
        # =================

        # 1. Pre-process terminals
        anon_tokens_transf = PrepareAnonTerminals(terminals)
        transformer = PrepareLiterals() * ValidateSymbols() * anon_tokens_transf  # Adds to terminals

        # 2. Inline Templates

        transformer *= ApplyTemplates(rule_defs)

        # 3. Convert EBNF to BNF (and apply step 1 & 2)
        ebnf_to_bnf = EBNF_to_BNF()
        rules = []
        i = 0
        while i < len(rule_defs):  # We have to do it like this because rule_defs might grow due to templates
            name, params, rule_tree, options = rule_defs[i]
            i += 1
            if len(params) != 0:  # Dont transform templates
                continue
            rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
            ebnf_to_bnf.rule_options = rule_options
            ebnf_to_bnf.prefix = name
            anon_tokens_transf.rule_options = rule_options
            tree = transformer.transform(rule_tree)
            res: Tree = ebnf_to_bnf.transform(tree)
            rules.append((name, res, options))
        rules += ebnf_to_bnf.new_rules

        assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"

        # 4. Compile tree to Rule objects
        rule_tree_to_text = RuleTreeToText()

        simplify_rule = SimplifyRule_Visitor()
        compiled_rules: List[Rule] = []
        for rule_content in rules:
            name, tree, options = rule_content
            simplify_rule.visit(tree)
            expansions = rule_tree_to_text.transform(tree)

            for i, (expansion, alias) in enumerate(expansions):
                if alias and name.startswith('_'):
                    raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)"% (name, alias))

                # Record where _EMPTY placeholders sat, then strip them out
                empty_indices = tuple(x==_EMPTY for x in expansion)
                if any(empty_indices):
                    exp_options = copy(options) or RuleOptions()
                    exp_options.empty_indices = empty_indices
                    expansion = [x for x in expansion if x!=_EMPTY]
                else:
                    exp_options = options

                for sym in expansion:
                    assert isinstance(sym, Symbol)
                    if sym.is_term and exp_options and exp_options.keep_all_tokens:
                        assert isinstance(sym, Terminal)
                        sym.filter_out = False
                rule = Rule(NonTerminal(name), expansion, i, alias, exp_options)
                compiled_rules.append(rule)

        # Remove duplicates of empty rules, throw error for non-empty duplicates
        if len(set(compiled_rules)) != len(compiled_rules):
            duplicates = classify(compiled_rules, lambda x: x)
            for dups in duplicates.values():
                if len(dups) > 1:
                    if dups[0].expansion:
                        raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)"
                                           % ''.join('\n  * %s' % i for i in dups))

                    # Empty rule; assert all other attributes are equal
                    assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups)

            # Remove duplicates
            compiled_rules = list(OrderedSet(compiled_rules))

        # Filter out unused rules, iterating until a fixed point is reached
        while True:
            c = len(compiled_rules)
            used_rules = {s for r in compiled_rules
                          for s in r.expansion
                          if isinstance(s, NonTerminal)
                          and s != r.origin}
            used_rules |= {NonTerminal(s) for s in start}
            compiled_rules, unused = classify_bool(compiled_rules, lambda r: r.origin in used_rules)
            for r in unused:
                logger.debug("Unused rule: %s", r)
            if len(compiled_rules) == c:
                break

        # Filter out unused terminals
        if terminals_to_keep != '*':
            used_terms = {t.name for r in compiled_rules
                          for t in r.expansion
                          if isinstance(t, Terminal)}
            terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep)
            if unused:
                logger.debug("Unused terminals: %s", [t.name for t in unused])

        return terminals, compiled_rules, self.ignore
816
817
# A grammar source located inside a Python package: (package name, path within it)
PackageResource = namedtuple('PackageResource', 'pkg_name path')
819
820
class FromPackageLoader:
    """
    Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`.
    This allows them to be compatible even from within zip files.

    Relative imports are handled, so you can just freely use them.

    pkg_name: The name of the package. You can probably provide `__name__` most of the time
    search_paths: All the paths that will be searched on absolute imports.
    """

    pkg_name: str
    search_paths: Sequence[str]

    def __init__(self, pkg_name: str, search_paths: Sequence[str]=("", )) -> None:
        self.pkg_name = pkg_name
        self.search_paths = search_paths

    def __repr__(self):
        return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths)

    def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]:
        """Resolve ``grammar_path`` and return (resource, text).

        ``base_path`` is None for absolute imports, or the location of the
        importing grammar for relative ones. Raises IOError when the grammar
        cannot be found, or when it wasn't loaded by this loader originally.
        """
        if base_path is None:
            to_try = self.search_paths
        else:
            # Check whether or not the importing grammar was loaded by this module.
            if not isinstance(base_path, PackageResource) or base_path.pkg_name != self.pkg_name:
                # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway
                raise IOError()
            to_try = [base_path.path]

        err = None
        for path in to_try:
            full_path = os.path.join(path, grammar_path)
            try:
                text: Optional[bytes] = pkgutil.get_data(self.pkg_name, full_path)
            except IOError as e:
                err = e
                continue
            else:
                return PackageResource(self.pkg_name, full_path), (text.decode() if text else '')

        raise IOError('Cannot find grammar in given paths') from err
864
865
# Default loader for the grammars bundled with lark itself (e.g. common.lark)
stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS)
867
868
869
def resolve_term_references(term_dict):
    """Inline terminal-to-terminal references, in place.

    ``term_dict`` maps terminal name -> expression tree (or None for
    %declare-d terminals). Raises GrammarError for rule references inside
    terminals, undefined terminals, and recursive terminals.
    """
    # TODO Solve with transitive closure (maybe)

    while True:
        changed = False
        for name, token_tree in term_dict.items():
            if token_tree is None:  # Terminal added through %declare
                continue
            for exp in token_tree.find_data('value'):
                item ,= exp.children
                if isinstance(item, NonTerminal):
                    raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name))
                elif isinstance(item, Terminal):
                    try:
                        term_value = term_dict[item.name]
                    except KeyError:
                        raise GrammarError("Terminal used but not defined: %s" % item.name)
                    assert term_value is not None
                    # Replace the reference with the referenced terminal's tree
                    exp.children[0] = term_value
                    changed = True
                else:
                    assert isinstance(item, Tree)
        if not changed:
            # Fixed point reached - no references left to inline
            break

    # Detect recursion: a terminal whose (now inlined) tree contains itself
    for name, term in term_dict.items():
        if term:    # Not just declared
            for child in term.children:
                ids = [id(x) for x in child.iter_subtrees()]
                if id(term) in ids:
                    raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name)
901
902
903
def symbol_from_strcase(s):
    """Build a Terminal for UPPERCASE names, a NonTerminal otherwise."""
    assert isinstance(s, str)
    if s.isupper():
        # Underscore-prefixed terminals are filtered out of the parse tree
        return Terminal(s, filter_out=s.startswith('_'))
    return NonTerminal(s)
907
@inline_args
class PrepareGrammar(Transformer_InPlace):
    """Replaces terminal/nonterminal name tokens with Symbol instances."""

    def terminal(self, name):
        # Underscore-prefixed terminals are filtered out of the parse tree
        filter_out = name.startswith('_')
        return Terminal(str(name), filter_out=filter_out)

    def nonterminal(self, name):
        return NonTerminal(name.value)
915
916
def _find_used_symbols(tree):
    """Collect the names of all symbols referenced inside a rule's expansions."""
    assert tree.data == 'expansions'
    used = set()
    for expansion in tree.find_data('expansion'):
        for sym in expansion.scan_values(lambda t: isinstance(t, Symbol)):
            used.add(sym.name)
    return used
921
922
def _get_parser():
    """Build (once) the LALR parser for the .lark grammar format itself,
    caching it as a function attribute so later calls are free."""
    try:
        return _get_parser.cache
    except AttributeError:
        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

        # A '?' prefix on a rule name means expand1 (inline single-child nodes)
        rules = [(name.lstrip('?'), x, RuleOptions(expand1=name.startswith('?')))
                 for name, x in RULES.items()]
        rules = [Rule(NonTerminal(r), [symbol_from_strcase(s) for s in x.split()], i, None, o)
                 for r, xs, o in rules for i, x in enumerate(xs)]

        callback = ParseTreeBuilder(rules, ST).create_callback()
        import re
        lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT', 'BACKSLASH'])
        parser_conf = ParserConf(rules, callback, ['start'])
        lexer_conf.lexer_type = 'basic'
        parser_conf.parser_type = 'lalr'
        _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None)
        return _get_parser.cache
942
# (friendly message, [example inputs that trigger it]) pairs, used by
# UnexpectedToken.match_examples to translate low-level parser errors
# into readable grammar-error messages (see _translate_parser_exception)
GRAMMAR_ERRORS = [
        ('Incorrect type of value', ['a: 1\n']),
        ('Unclosed parenthesis', ['a: (\n']),
        ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']),
        ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']),
        ('Illegal name for rules or terminals', ['Aa:\n']),
        ('Alias expects lowercase name', ['a: -> "a"\n']),
        ('Unexpected colon', ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n']),
        ('Misplaced operator', ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n']),
        ('Expecting option ("|") or a new rule or terminal definition', ['a:a\n()\n']),
        ('Terminal names cannot contain dots', ['A.B\n']),
        ('Expecting rule or terminal definition', ['"a"\n']),
        ('%import expects a name', ['%import "a"\n']),
        ('%ignore expects a value', ['%ignore %import\n']),
    ]
958
def _translate_parser_exception(parse, e):
    """Map a parser exception onto a friendly message from GRAMMAR_ERRORS,
    returning None when no translation applies."""
    msg = e.match_examples(parse, GRAMMAR_ERRORS, use_accepts=True)
    if msg:
        return msg
    if 'STRING' in e.expected:
        return "Expecting a value"
965
def _parse_grammar(text, name, start='start'):
    """Parse grammar ``text`` (``name`` is used in error messages) into a tree
    whose name tokens are replaced by Symbols. Raises GrammarError with
    line/column context on failure."""
    try:
        # A trailing newline keeps the _NL-terminated rules happy
        tree = _get_parser().parse(text + '\n', start)
    except UnexpectedCharacters as e:
        context = e.get_context(text)
        raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
                           (e.line, e.column, name, context))
    except UnexpectedToken as e:
        context = e.get_context(text)
        error = _translate_parser_exception(_get_parser().parse, e)
        if error:
            raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
        raise

    return PrepareGrammar().transform(tree)
981
def _error_repr(error):
    """Render a parse error as a short human-readable string."""
    if not isinstance(error, UnexpectedToken):
        return str(error)
    translated = _translate_parser_exception(_get_parser().parse, error)
    if translated:
        return translated
    expected = ', '.join(error.accepts or error.expected)
    return "Unexpected token %r. Expected one of: {%s}" % (str(error.token), expected)
991
def _search_interactive_parser(interactive_parser, predicate):
    """Breadth-first search over parser states reachable by feeding tokens.

    Starting from `interactive_parser`, feeds every legal token choice and
    returns the first (path, parser) where `predicate(parser)` holds; `path`
    is the tuple of token types fed to get there. Returns None if no
    reachable state satisfies the predicate.
    """
    def expand(node):
        path, parser = node
        for token_type in parser.choices():
            probe = Token(token_type, '')
            try:
                next_parser = parser.feed_token(probe)
            except ParseError:  # Illegal
                continue
            yield path + (token_type,), next_parser

    for path, parser in bfs_all_unique([((), interactive_parser)], expand):
        if predicate(parser):
            return path, parser
1007
def find_grammar_errors(text: str, start: str='start') -> List[Tuple[UnexpectedInput, str]]:
    """Parse grammar text, collecting every error instead of stopping at the first.

    On each error, the interactive parser is driven forward until a newline is
    accepted, so parsing can resume on the next line. Returns a list of
    (exception, message) pairs, keeping only the first error per line.
    """
    collected = []

    def on_error(e):
        collected.append((e, _error_repr(e)))

        # recover to a new line
        token_path, _ = _search_interactive_parser(e.interactive_parser.as_immutable(), lambda p: '_NL' in p.choices())
        for token_type in token_path:
            e.interactive_parser.feed_token(Token(token_type, ''))
        e.interactive_parser.feed_token(Token('_NL', '\n'))
        return True

    _tree = _get_parser().parse(text + '\n', start, on_error=on_error)

    by_line = classify(collected, lambda pair: pair[0].line)
    result = [pairs[0] for pairs in by_line.values()]  # already sorted

    for pair in result:
        pair[0].interactive_parser = None  # drop parser state so errors are picklable
    return result
1028
1029
1030def _get_mangle(prefix, aliases, base_mangle=None):
1031 def mangle(s):
1032 if s in aliases:
1033 s = aliases[s]
1034 else:
1035 if s[0] == '_':
1036 s = '_%s__%s' % (prefix, s[1:])
1037 else:
1038 s = '%s__%s' % (prefix, s)
1039 if base_mangle is not None:
1040 s = base_mangle(s)
1041 return s
1042 return mangle
1043
1044def _mangle_definition_tree(exp, mangle):
1045 if mangle is None:
1046 return exp
1047 exp = deepcopy(exp) # TODO: is this needed?
1048 for t in exp.iter_subtrees():
1049 for i, c in enumerate(t.children):
1050 if isinstance(c, Symbol):
1051 t.children[i] = c.renamed(mangle)
1052
1053 return exp
1054
def _make_rule_tuple(modifiers_tree, name, params, priority_tree, expansions):
    """Assemble the (name, params, expansions, RuleOptions) tuple for one rule.

    Decodes the optional modifier token ('?' expand-inline, '!' keep all
    tokens) and the optional priority, and extracts template parameter names.
    Raises GrammarError when '?' is combined with an already-inlined `_rule`.
    """
    expand1 = False
    keep_all_tokens = False
    if modifiers_tree.children:
        modifiers ,= modifiers_tree.children
        expand1 = '?' in modifiers
        if expand1 and name.startswith('_'):
            raise GrammarError("Inlined rules (_rule) cannot use the ?rule modifier.")
        keep_all_tokens = '!' in modifiers

    priority = None
    if priority_tree.children:
        p ,= priority_tree.children
        priority = int(p)

    if params is not None:
        params = [t.value for t in params.children]  # For the grammar parser

    options = RuleOptions(keep_all_tokens, expand1, priority=priority,
                          template_source=(name if params else None))
    return name, params, expansions, options
1077
1078
class Definition:
    """A single rule or terminal definition collected by GrammarBuilder."""

    def __init__(self, is_term, tree, params=(), options=None):
        # tree is None for abstract definitions created via %declare.
        self.tree = tree
        self.is_term = is_term
        self.options = options
        self.params = tuple(params)
1085
class GrammarBuilder:
    """Accumulates rule and terminal definitions, including those pulled in
    via %import, and assembles them into a single Grammar object.

    Typical usage: construct, call load_grammar() one or more times, then
    build().
    """

    global_keep_all_tokens: bool
    import_paths: List[Union[str, Callable]]
    used_files: Dict[str, str]

    _definitions: Dict[str, Definition]
    _ignore_names: List[str]

    def __init__(self, global_keep_all_tokens: bool=False, import_paths: Optional[List[Union[str, Callable]]]=None, used_files: Optional[Dict[str, str]]=None) -> None:
        self.global_keep_all_tokens = global_keep_all_tokens
        self.import_paths = import_paths or []
        self.used_files = used_files or {}

        self._definitions: Dict[str, Definition] = {}
        self._ignore_names: List[str] = []

    def _grammar_error(self, is_term, msg, *names):
        """Raise a GrammarError, formatting `msg` with {name}/{type}/{Type}
        placeholders (suffixed '2', '3', ... for each extra name in `names`)."""
        args = {}
        for i, name in enumerate(names, start=1):
            postfix = '' if i == 1 else str(i)
            args['name' + postfix] = name
            args['type' + postfix] = lowercase_type = ("rule", "terminal")[is_term]
            args['Type' + postfix] = lowercase_type.title()
        raise GrammarError(msg.format(**args))

    def _check_options(self, is_term, options):
        """Validate and normalize the `options` value of a definition.

        Terminals take an int priority (default 1); rules take a RuleOptions
        instance (default-constructed when None). Applies the builder-wide
        keep_all_tokens flag to rule options.
        """
        if is_term:
            if options is None:
                options = 1
            elif not isinstance(options, int):
                raise GrammarError("Terminal require a single int as 'options' (e.g. priority), got %s" % (type(options),))
        else:
            if options is None:
                options = RuleOptions()
            elif not isinstance(options, RuleOptions):
                raise GrammarError("Rules require a RuleOptions instance as 'options'")
            if self.global_keep_all_tokens:
                options.keep_all_tokens = True
        return options


    def _define(self, name, is_term, exp, params=(), options=None, *, override=False):
        """Register a new definition; with override=True, replace an existing
        one (implements %override)."""
        if name in self._definitions:
            if not override:
                self._grammar_error(is_term, "{Type} '{name}' defined more than once", name)
        elif override:
            self._grammar_error(is_term, "Cannot override a nonexisting {type} {name}", name)

        if name.startswith('__'):
            self._grammar_error(is_term, 'Names starting with double-underscore are reserved (Error at {name})', name)

        self._definitions[name] = Definition(is_term, exp, params, self._check_options(is_term, options))

    def _extend(self, name, is_term, exp, params=(), options=None):
        """Add `exp` as an extra alternative to an existing definition
        (implements %extend). The existing definition must match in kind
        (rule vs terminal) and template parameters."""
        if name not in self._definitions:
            self._grammar_error(is_term, "Can't extend {type} {name} as it wasn't defined before", name)

        d = self._definitions[name]

        if is_term != d.is_term:
            self._grammar_error(is_term, "Cannot extend {type} {name} - one is a terminal, while the other is not.", name)
        if tuple(params) != d.params:
            self._grammar_error(is_term, "Cannot extend {type} with different parameters: {name}", name)

        if d.tree is None:
            self._grammar_error(is_term, "Can't extend {type} {name} - it is abstract.", name)

        # TODO: think about what to do with 'options'
        base = d.tree

        assert isinstance(base, Tree) and base.data == 'expansions'
        base.children.insert(0, exp)

    def _ignore(self, exp_or_name):
        """Record a terminal to be skipped by the lexer (implements %ignore).

        Accepts either a terminal name (str) or an expression tree; a bare
        terminal reference reuses its name, anything else gets a generated
        __IGNORE_n definition.
        """
        if isinstance(exp_or_name, str):
            self._ignore_names.append(exp_or_name)
        else:
            assert isinstance(exp_or_name, Tree)
            t = exp_or_name
            if t.data == 'expansions' and len(t.children) == 1:
                t2 ,= t.children
                if t2.data=='expansion' and len(t2.children) == 1:
                    item ,= t2.children
                    if item.data == 'value':
                        item ,= item.children
                        if isinstance(item, Terminal):
                            # Keep terminal name, no need to create a new definition
                            self._ignore_names.append(item.name)
                            return

            name = '__IGNORE_%d'% len(self._ignore_names)
            self._ignore_names.append(name)
            self._definitions[name] = Definition(True, t, options=TOKEN_DEFAULT_PRIORITY)

    def _unpack_import(self, stmt, grammar_name):
        """Decode an %import statement into (dotted_path, base_path, aliases).

        base_path is None for library imports; otherwise it is the directory
        against which a relative import is resolved.
        """
        if len(stmt.children) > 1:
            path_node, arg1 = stmt.children
        else:
            path_node, = stmt.children
            arg1 = None

        if isinstance(arg1, Tree): # Multi import
            dotted_path = tuple(path_node.children)
            names = arg1.children
            aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names
        else: # Single import
            dotted_path = tuple(path_node.children[:-1])
            if not dotted_path:
                name ,= path_node.children
                raise GrammarError("Nothing was imported from grammar `%s`" % name)
            name = path_node.children[-1] # Get name from dotted path
            aliases = {name.value: (arg1 or name).value} # Aliases if exist

        if path_node.data == 'import_lib': # Import from library
            base_path = None
        else: # Relative import
            if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
                try:
                    base_file = os.path.abspath(sys.modules['__main__'].__file__)
                except AttributeError:
                    base_file = None
            else:
                base_file = grammar_name # Import relative to grammar file path if external grammar file
            if base_file:
                if isinstance(base_file, PackageResource):
                    base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
                else:
                    base_path = os.path.split(base_file)[0]
            else:
                base_path = os.path.abspath(os.path.curdir)

        return dotted_path, base_path, aliases

    def _unpack_definition(self, tree, mangle):
        """Decode a 'rule' or 'term' parse tree into the argument tuple used
        by _define(): (name, is_term, exp, params, options)."""

        if tree.data == 'rule':
            name, params, exp, opts = _make_rule_tuple(*tree.children)
            is_term = False
        else:
            name = tree.children[0].value
            params = () # TODO terminal templates
            opts = int(tree.children[1]) if len(tree.children) == 3 else TOKEN_DEFAULT_PRIORITY # priority
            exp = tree.children[-1]
            is_term = True

        if mangle is not None:
            params = tuple(mangle(p) for p in params)
            name = mangle(name)

        exp = _mangle_definition_tree(exp, mangle)
        return name, is_term, exp, params, opts


    def load_grammar(self, grammar_text: str, grammar_name: str="<?>", mangle: Optional[Callable[[str], str]]=None) -> None:
        """Parse `grammar_text` and record all its statements (definitions,
        %import, %ignore, %declare, %override, %extend) into this builder.

        `mangle` is non-None when this grammar is itself being imported; it
        namespaces every defined name.
        """
        tree = _parse_grammar(grammar_text, grammar_name)

        imports: Dict[Tuple[str, ...], Tuple[Optional[str], Dict[str, str]]] = {}

        # Collect and merge all %import statements first, so each imported
        # grammar is loaded only once even when imported from several lines.
        for stmt in tree.children:
            if stmt.data == 'import':
                dotted_path, base_path, aliases = self._unpack_import(stmt, grammar_name)
                try:
                    import_base_path, import_aliases = imports[dotted_path]
                    assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path)
                    import_aliases.update(aliases)
                except KeyError:
                    imports[dotted_path] = base_path, aliases

        for dotted_path, (base_path, aliases) in imports.items():
            self.do_import(dotted_path, base_path, aliases, mangle)

        for stmt in tree.children:
            if stmt.data in ('term', 'rule'):
                self._define(*self._unpack_definition(stmt, mangle))
            elif stmt.data == 'override':
                r ,= stmt.children
                self._define(*self._unpack_definition(r, mangle), override=True)
            elif stmt.data == 'extend':
                r ,= stmt.children
                self._extend(*self._unpack_definition(r, mangle))
            elif stmt.data == 'ignore':
                # if mangle is not None, we shouldn't apply ignore, since we aren't in a toplevel grammar
                if mangle is None:
                    self._ignore(*stmt.children)
            elif stmt.data == 'declare':
                for symbol in stmt.children:
                    assert isinstance(symbol, Symbol), symbol
                    is_term = isinstance(symbol, Terminal)
                    if mangle is None:
                        name = symbol.name
                    else:
                        name = mangle(symbol.name)
                    # Abstract definition (tree=None); must be filled in later or stay opaque.
                    self._define(name, is_term, None)
            elif stmt.data == 'import':
                pass  # handled in the first pass above
            else:
                assert False, stmt


        term_defs = { name: d.tree
                      for name, d in self._definitions.items()
                      if d.is_term
                    }
        resolve_term_references(term_defs)


    def _remove_unused(self, used):
        """Drop every definition not reachable (through rule references) from
        the names in `used`."""
        def rule_dependencies(symbol):
            # Terminals and unknown symbols contribute no further dependencies.
            try:
                d = self._definitions[symbol]
            except KeyError:
                return []
            if d.is_term:
                return []
            return _find_used_symbols(d.tree) - set(d.params)

        _used = set(bfs(used, rule_dependencies))
        self._definitions = {k: v for k, v in self._definitions.items() if k in _used}


    def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], aliases: Dict[str, str], base_mangle: Optional[Callable[[str], str]]=None) -> None:
        """Load an imported grammar, mangle its names, prune unused symbols
        and merge the requested definitions into this builder.

        Search order: self.import_paths, then base_path (if any), then the
        stdlib grammar loader. Raises IOError if the grammar file cannot be
        found anywhere, and GrammarError on name collisions.
        """
        assert dotted_path
        mangle = _get_mangle('__'.join(dotted_path), aliases, base_mangle)
        grammar_path = os.path.join(*dotted_path) + EXT
        to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
        for source in to_try:
            try:
                if callable(source):
                    joined_path, text = source(base_path, grammar_path)
                else:
                    joined_path = os.path.join(source, grammar_path)
                    with open(joined_path, encoding='utf8') as f:
                        text = f.read()
            except IOError:
                continue
            else:
                # Guard against the file changing between repeated imports.
                h = sha256_digest(text)
                if self.used_files.get(joined_path, h) != h:
                    raise RuntimeError("Grammar file was changed during importing")
                self.used_files[joined_path] = h

                gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files)
                gb.load_grammar(text, joined_path, mangle)
                gb._remove_unused(map(mangle, aliases))
                for name in gb._definitions:
                    if name in self._definitions:
                        raise GrammarError("Cannot import '%s' from '%s': Symbol already defined." % (name, grammar_path))

                self._definitions.update(**gb._definitions)
                break
        else:
            # Search failed. Make Python throw a nice error.
            open(grammar_path, encoding='utf8')
            assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,)


    def validate(self) -> None:
        """Check internal consistency: template parameters, template usages,
        symbol references and %ignore names must all resolve. Raises
        GrammarError on the first problem found."""
        for name, d in self._definitions.items():
            params = d.params
            exp = d.tree

            for i, p in enumerate(params):
                if p in self._definitions:
                    raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name))
                if p in params[:i]:
                    raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name))

            if exp is None: # Remaining checks don't apply to abstract rules/terminals (created with %declare)
                continue

            for temp in exp.find_data('template_usage'):
                sym = temp.children[0].name
                args = temp.children[1:]
                if sym not in params:
                    if sym not in self._definitions:
                        self._grammar_error(d.is_term, "Template '%s' used but not defined (in {type} {name})" % sym, name)
                    if len(args) != len(self._definitions[sym].params):
                        expected, actual = len(self._definitions[sym].params), len(args)
                        self._grammar_error(d.is_term, "Wrong number of template arguments used for {name} "
                                            "(expected %s, got %s) (in {type2} {name2})" % (expected, actual), sym, name)

            for sym in _find_used_symbols(exp):
                if sym not in self._definitions and sym not in params:
                    self._grammar_error(d.is_term, "{Type} '{name}' used but not defined (in {type2} {name2})", sym, name)

        if not set(self._definitions).issuperset(self._ignore_names):
            raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(self._ignore_names) - set(self._definitions)))

    def build(self) -> Grammar:
        """Validate and return the assembled Grammar (rule definitions,
        terminal definitions and ignored terminal names)."""
        self.validate()
        rule_defs = []
        term_defs = []
        for name, d in self._definitions.items():
            (params, exp, options) = d.params, d.tree, d.options
            if d.is_term:
                assert len(params) == 0
                term_defs.append((name, (exp, options)))
            else:
                rule_defs.append((name, params, exp, options))
        # resolve_term_references(term_defs)
        return Grammar(rule_defs, term_defs, self._ignore_names)
1388
1389
def verify_used_files(file_hashes):
    """Check whether any previously-used grammar file has changed.

    `file_hashes` maps a path (str or PackageResource) to its recorded
    sha256 digest. Returns False (parser must be rebuilt) as soon as one
    file's current digest differs; paths we can't load are skipped.
    """
    for path, recorded in file_hashes.items():
        text = None
        if isinstance(path, str) and os.path.exists(path):
            with open(path, encoding='utf8') as f:
                text = f.read()
        elif isinstance(path, PackageResource):
            with suppress(IOError):
                text = pkgutil.get_data(*path).decode('utf-8')
        if text is None: # We don't know how to load the path. ignore it.
            continue

        if sha256_digest(text) != recorded:
            logger.info("File %r changed, rebuilding Parser" % path)
            return False
    return True
1407
def list_grammar_imports(grammar, import_paths=None):
    """Returns a list of paths to the lark grammars imported by the given grammar (recursively)

    `import_paths` is an optional list of directories (or loader callables)
    to search for imported grammars.
    """
    # Default of None (instead of a mutable []) avoids the shared
    # default-argument pitfall; GrammarBuilder normalizes None to [].
    builder = GrammarBuilder(False, import_paths)
    builder.load_grammar(grammar, '<string>')
    return list(builder.used_files.keys())
1413
def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
    """Compile grammar text into a Grammar object.

    Returns (grammar, used_files) where used_files maps each loaded grammar
    file to its content hash (for cache invalidation).
    """
    gb = GrammarBuilder(global_keep_all_tokens, import_paths)
    gb.load_grammar(grammar, source)
    return gb.build(), gb.used_files
1418
1419
def sha256_digest(s: str) -> str:
    """Get the sha256 digest of a string

    Supports the `usedforsecurity` argument for Python 3.9+ to allow running on
    a FIPS-enabled system.
    """
    data = s.encode('utf8')
    if sys.version_info < (3, 9):
        return hashlib.sha256(data).hexdigest()
    # usedforsecurity=False marks this as a non-cryptographic use (cache key)
    return hashlib.sha256(data, usedforsecurity=False).hexdigest()