# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.

"""
This module defines the data structures used to represent a grammar.

Specifying grammars in pgen is possible with this grammar::

    grammar: (NEWLINE | rule)* ENDMARKER
    rule: NAME ':' rhs NEWLINE
    rhs: items ('|' items)*
    items: item+
    item: '[' rhs ']' | atom ['+' | '*']
    atom: '(' rhs ')' | NAME | STRING

This grammar is self-referencing.

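As an illustration (not one of parso's actual grammar definitions), a small
expression grammar could be written in this notation as::

    expr: term (('+' | '-') term)*
    term: factor (('*' | '/') factor)*
    factor: NUMBER | '(' expr ')'
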
This parser generator (pgen2) was created by Guido van Rossum and used for
lib2to3. Most of the code has been refactored to make it more Pythonic. Since
this was a "copy" of the CPython parser generator "pgen", some work was needed
to make it more readable. It should also be slightly faster than the original
pgen2, because we made some optimizations.
"""

from ast import literal_eval
from typing import TypeVar, Generic, Mapping, Sequence, Set, Union

from parso.pgen2.grammar_parser import GrammarParser, NFAState

_TokenTypeT = TypeVar("_TokenTypeT")


class Grammar(Generic[_TokenTypeT]):
    """
    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.

    The only important parts of this parser are the DFAs and the transitions
    between them.
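
    A rough sketch of how the parsing engine consumes these tables (assuming a
    ``grammar`` instance created by ``generate_grammar`` below)::

        dfas = grammar.nonterminal_to_dfas[grammar.start_nonterminal]
        first_dfa = dfas[0]             # parsing starts in the first DFA state
        plans = first_dfa.transitions   # token type/ReservedString -> DFAPlan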
    """

    def __init__(self,
                 start_nonterminal: str,
                 rule_to_dfas: Mapping[str, Sequence['DFAState[_TokenTypeT]']],
                 reserved_syntax_strings: Mapping[str, 'ReservedString']):
        self.nonterminal_to_dfas = rule_to_dfas
        self.reserved_syntax_strings = reserved_syntax_strings
        self.start_nonterminal = start_nonterminal


class DFAPlan:
    """
    Plans are used for the parser to create stack nodes and do the proper
    DFA state transitions.
    """
    def __init__(self, next_dfa: 'DFAState', dfa_pushes: Sequence['DFAState'] = []):
        self.next_dfa = next_dfa
        self.dfa_pushes = dfa_pushes

    def __repr__(self):
        return '%s(%s, %s)' % (self.__class__.__name__, self.next_dfa, self.dfa_pushes)


class DFAState(Generic[_TokenTypeT]):
    """
    The DFAState object is the core class for pretty much everything. DFAStates
    are the vertices of a directed graph, while arcs and transitions are the
    edges.

    Arcs are the initial edges; at that point the DFA state machines of the
    different nonterminals are not yet connected to each other. Transitions are
    calculated afterwards to connect them.
    """
    def __init__(self, from_rule: str, nfa_set: Set[NFAState], final: NFAState):
        assert isinstance(nfa_set, set)
        assert isinstance(next(iter(nfa_set)), NFAState)
        assert isinstance(final, NFAState)
        self.from_rule = from_rule
        self.nfa_set = nfa_set
        # map from terminals/nonterminals to DFAState
        self.arcs: Mapping[str, DFAState] = {}
        # In an intermediate step we set these nonterminal arcs (which have the
        # same structure as arcs). These don't contain terminals anymore.
        self.nonterminal_arcs: Mapping[str, DFAState] = {}

        # Transitions and is_final are basically the only things the parser
        # uses. Everything else is purely here to create a parser.
        self.transitions: Mapping[Union[_TokenTypeT, ReservedString], DFAPlan] = {}
        self.is_final = final in nfa_set

    def add_arc(self, next_, label):
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next_, DFAState)
        self.arcs[label] = next_

    def unifystate(self, old, new):
        for label, next_ in self.arcs.items():
            if next_ is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # Equality test -- ignore the nfa_set instance variable
        assert isinstance(other, DFAState)
        if self.is_final != other.is_final:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next_ in self.arcs.items():
            if next_ is not other.arcs.get(label):
                return False
        return True

    def __repr__(self):
        return '<%s: %s is_final=%s>' % (
            self.__class__.__name__, self.from_rule, self.is_final
        )


class ReservedString:
    """
    Most grammars will have certain keywords and operators that are mentioned
    in the grammar as strings (e.g. "if") and not as token types (e.g. NUMBER).
    This class represents the former.
    """

    def __init__(self, value: str):
        self.value = value

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self.value)


def _simplify_dfas(dfas):
    """
    This is not theoretically optimal, but works well enough.
    Algorithm: repeatedly look for two states that have the same
    set of arcs (same labels pointing to the same nodes) and
    unify them, until things stop changing.

    dfas is a list of DFAState instances
    """
    changes = True
    while changes:
        changes = False
        for i, state_i in enumerate(dfas):
            for j in range(i + 1, len(dfas)):
                state_j = dfas[j]
                if state_i == state_j:
                    del dfas[j]
                    for state in dfas:
                        state.unifystate(state_j, state_i)
                    changes = True
                    break


def _make_dfas(start, finish):
    """
    Uses the powerset construction algorithm to create DFA states from sets of
    NFA states.

    Also does state reduction if some states are not needed.
    """
    # To turn an NFA into a DFA, we define the states of the DFA
    # to correspond to *sets* of states of the NFA.  Then do some
    # state reduction.
    assert isinstance(start, NFAState)
    assert isinstance(finish, NFAState)

    def addclosure(nfa_state, base_nfa_set):
        assert isinstance(nfa_state, NFAState)
        if nfa_state in base_nfa_set:
            return
        base_nfa_set.add(nfa_state)
        for nfa_arc in nfa_state.arcs:
            if nfa_arc.nonterminal_or_string is None:
                addclosure(nfa_arc.next, base_nfa_set)

    base_nfa_set = set()
    addclosure(start, base_nfa_set)
    states = [DFAState(start.from_rule, base_nfa_set, finish)]
    for state in states:  # NB states grows while we're iterating
        arcs = {}
        # Find state transitions and store them in arcs.
        for nfa_state in state.nfa_set:
            for nfa_arc in nfa_state.arcs:
                if nfa_arc.nonterminal_or_string is not None:
                    nfa_set = arcs.setdefault(nfa_arc.nonterminal_or_string, set())
                    addclosure(nfa_arc.next, nfa_set)

        # Now create the DFA states. All epsilon (None) arcs have been
        # eliminated by the closure computation above, so the remaining arcs
        # are proper state transitions and we just need to create the DFAs.
        for nonterminal_or_string, nfa_set in arcs.items():
            for nested_state in states:
                if nested_state.nfa_set == nfa_set:
                    # The DFA state already exists for this rule.
                    break
            else:
                nested_state = DFAState(start.from_rule, nfa_set, finish)
                states.append(nested_state)

            state.add_arc(nested_state, nonterminal_or_string)
    return states  # List of DFAState instances; first one is start


def _dump_nfa(start, finish):
    print("Dump of NFA for", start.from_rule)
    todo = [start]
    for i, state in enumerate(todo):
        print("  State", i, state is finish and "(final)" or "")
        for arc in state.arcs:
            label, next_ = arc.nonterminal_or_string, arc.next
            if next_ in todo:
                j = todo.index(next_)
            else:
                j = len(todo)
                todo.append(next_)
            if label is None:
                print("    -> %d" % j)
            else:
                print("    %s -> %d" % (label, j))


def _dump_dfas(dfas):
    print("Dump of DFA for", dfas[0].from_rule)
    for i, state in enumerate(dfas):
        print("  State", i, state.is_final and "(final)" or "")
        for nonterminal, next_ in state.arcs.items():
            print("    %s -> %d" % (nonterminal, dfas.index(next_)))


def generate_grammar(bnf_grammar: str, token_namespace) -> Grammar:
    """
    ``bnf_grammar`` is a grammar in extended BNF (using * for repetition, + for
    at-least-once repetition, [] for optional parts, | for alternatives and ()
    for grouping).

    It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its
    own parser.
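
    A minimal usage sketch (assuming parso's ``PythonTokenTypes`` as the token
    namespace; any namespace whose attributes are the named token types should
    work)::

        from parso.python.token import PythonTokenTypes

        with open('my_grammar.txt') as f:  # a pgen-style grammar file (illustrative)
            grammar = generate_grammar(f.read(), PythonTokenTypes)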
    """
    rule_to_dfas = {}
    start_nonterminal = None
    for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse():
        # _dump_nfa(nfa_a, nfa_z)
        dfas = _make_dfas(nfa_a, nfa_z)
        # _dump_dfas(dfas)
        # oldlen = len(dfas)
        _simplify_dfas(dfas)
        # newlen = len(dfas)
        rule_to_dfas[nfa_a.from_rule] = dfas
        # print(nfa_a.from_rule, oldlen, newlen)

        if start_nonterminal is None:
            start_nonterminal = nfa_a.from_rule

    reserved_strings: Mapping[str, ReservedString] = {}
    for nonterminal, dfas in rule_to_dfas.items():
        for dfa_state in dfas:
            for terminal_or_nonterminal, next_dfa in dfa_state.arcs.items():
                if terminal_or_nonterminal in rule_to_dfas:
                    dfa_state.nonterminal_arcs[terminal_or_nonterminal] = next_dfa
                else:
                    transition = _make_transition(
                        token_namespace,
                        reserved_strings,
                        terminal_or_nonterminal
                    )
                    dfa_state.transitions[transition] = DFAPlan(next_dfa)

    _calculate_tree_traversal(rule_to_dfas)
    return Grammar(start_nonterminal, rule_to_dfas, reserved_strings)  # type: ignore[arg-type]


def _make_transition(token_namespace, reserved_syntax_strings, label):
    """
    Creates a reserved string ("if", "for", "*", ...) or returns the token type
    (NUMBER, STRING, ...) for a given grammar terminal.
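
    For example (a sketch, assuming parso's ``PythonTokenTypes`` namespace)::

        _make_transition(PythonTokenTypes, {}, 'NUMBER')  # the NUMBER token type
        _make_transition(PythonTokenTypes, {}, "'if'")    # a ReservedString for 'if'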
    """
    if label[0].isalpha():
        # A named token (e.g. NAME, NUMBER, STRING)
        return getattr(token_namespace, label)
    else:
        # Either a keyword or an operator
        assert label[0] in ('"', "'"), label
        assert not label.startswith('"""') and not label.startswith("'''")
        value = literal_eval(label)
        try:
            return reserved_syntax_strings[value]
        except KeyError:
            r = reserved_syntax_strings[value] = ReservedString(value)
            return r


def _calculate_tree_traversal(nonterminal_to_dfas):
    """
    By this point we know how dfas can move around within a stack node, but we
    don't know how we can add a new stack node (nonterminal transitions).
    """
    # Map from grammar rule (nonterminal) name to a dict that maps each
    # possible first transition to the DFA states that need to be pushed.
    first_plans = {}

    nonterminals = list(nonterminal_to_dfas.keys())
    nonterminals.sort()
    for nonterminal in nonterminals:
        if nonterminal not in first_plans:
            _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal)

    # Now that we have calculated the first terminals, we are sure that
    # there is no left recursion.

    for dfas in nonterminal_to_dfas.values():
        for dfa_state in dfas:
            transitions = dfa_state.transitions
            for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items():
                for transition, pushes in first_plans[nonterminal].items():
                    if transition in transitions:
                        prev_plan = transitions[transition]
                        # Make sure these are sorted so that error messages are
                        # at least deterministic
                        choices = sorted([
                            (
                                prev_plan.dfa_pushes[0].from_rule
                                if prev_plan.dfa_pushes
                                else prev_plan.next_dfa.from_rule
                            ),
                            (
                                pushes[0].from_rule
                                if pushes else next_dfa.from_rule
                            ),
                        ])
                        raise ValueError(
                            "Rule %s is ambiguous; given a %s token, we "
                            "can't determine if we should evaluate %s or %s."
                            % (
                                (
                                    dfa_state.from_rule,
                                    transition,
                                ) + tuple(choices)
                            )
                        )
                    transitions[transition] = DFAPlan(next_dfa, pushes)


def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal):
    """
    Calculates the first plan for the given nonterminal and stores it in the
    first_plans dictionary, recursing into referenced nonterminals as needed.
    The first plans are used to know when to create stack nodes.
    """
    dfas = nonterminal_to_dfas[nonterminal]
    new_first_plans = {}
    first_plans[nonterminal] = None  # dummy to detect left recursion
    # We only need to check the first dfa. All the following ones are not
    # relevant for finding the first terminals.
    state = dfas[0]
    for transition, next_ in state.transitions.items():
        # It's a terminal. We have finally found a possible first token.
        new_first_plans[transition] = [next_.next_dfa]

    for nonterminal2, next_ in state.nonterminal_arcs.items():
        # It's a nonterminal and we have either a left recursion issue
        # in the grammar or we have to recurse.
        try:
            first_plans2 = first_plans[nonterminal2]
        except KeyError:
            first_plans2 = _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal2)
        else:
            if first_plans2 is None:
                raise ValueError("left recursion for rule %r" % nonterminal)

        for t, pushes in first_plans2.items():
            new_first_plans[t] = [next_] + pushes

    first_plans[nonterminal] = new_first_plans
    return new_first_plans