/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // LR parsers are bottom-up parsers -- they scan the input from left to right, |
10 | | // and collect the right-hand side of a production rule (called handle) on top |
11 | | // of the stack, then replace (reduce) the handle with the nonterminal defined |
12 | | // by the production rule. |
13 | | // |
14 | | // This file defines LRGraph, a deterministic handle-finding finite-state |
15 | | // automaton, which is a key component in LR parsers to recognize any of |
16 | | // handles in the grammar efficiently. We build the LR table (ACTION and GOTO |
17 | | // Table) based on the LRGraph. |
18 | | // |
19 | | // LRGraph can be constructed for any context-free grammars. |
20 | | // Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but |
21 | | // interpretation of the FSA is nondeterministic -- we might in a state where |
22 | | // we can continue searching an handle and identify a handle (called |
23 | | // shift/reduce conflicts), or identify more than one handle (callled |
24 | | // reduce/reduce conflicts). |
25 | | // |
26 | | // LRGraph is a common model for all variants of LR automatons, from the most |
27 | | // basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead |
28 | | // in making decisions. |
29 | | //===----------------------------------------------------------------------===// |
30 | | |
31 | | #ifndef CLANG_PSEUDO_GRAMMAR_LRGRAPH_H |
32 | | #define CLANG_PSEUDO_GRAMMAR_LRGRAPH_H |
33 | | |
34 | | #include "clang-pseudo/grammar/Grammar.h" |
35 | | #include "llvm/ADT/Hashing.h" |
36 | | #include <vector> |
37 | | |
38 | | namespace clang { |
39 | | namespace pseudo { |
40 | | |
41 | | // An LR item -- a grammar rule with a dot at some position of the body. |
42 | | // e.g. a production rule A := X Y yields 3 items: |
43 | | // A := . X Y |
44 | | // A := X . Y |
45 | | // A := X Y . |
46 | | // An item indicates how much of a production rule has been recognized at a |
47 | | // position (described by dot), for example, A := X . Y indicates that we have |
48 | | // recognized the X part from the input, and we hope next to see the input |
49 | | // derivable from Y. |
50 | | class Item { |
51 | | public: |
52 | 120k | static Item start(RuleID ID, const Grammar &G) { |
53 | 120k | Item I; |
54 | 120k | I.RID = ID; |
55 | 120k | I.RuleLength = G.lookupRule(ID).Size; |
56 | 120k | return I; |
57 | 120k | } |
58 | 491k | static Item sentinel(RuleID ID) { |
59 | 491k | Item I; |
60 | 491k | I.RID = ID; |
61 | 491k | return I; |
62 | 491k | } |
63 | | |
64 | 105k | RuleID rule() const { return RID; } |
65 | 52.2k | uint8_t dot() const { return DotPos; } |
66 | | |
67 | 3.26M | bool hasNext() const { return DotPos < RuleLength; } |
68 | 1.96M | SymbolID next(const Grammar &G) const { |
69 | 1.96M | assert(hasNext()); |
70 | 0 | return G.lookupRule(RID).Sequence[DotPos]; |
71 | 1.96M | } |
72 | | |
73 | 51.2k | Item advance() const { |
74 | 51.2k | assert(hasNext()); |
75 | 0 | Item I = *this; |
76 | 51.2k | ++I.DotPos; |
77 | 51.2k | return I; |
78 | 51.2k | } |
79 | | |
80 | | std::string dump(const Grammar &G) const; |
81 | | |
82 | 1.47M | bool operator==(const Item &I) const { |
83 | 1.47M | return DotPos == I.DotPos && RID == I.RID; |
84 | 1.47M | } |
85 | 98.1k | bool operator<(const Item &I) const { |
86 | 98.1k | return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos); |
87 | 98.1k | } |
88 | 242k | friend llvm::hash_code hash_value(const Item &I) { |
89 | 242k | return llvm::hash_combine(I.RID, I.DotPos); |
90 | 242k | } |
91 | | |
92 | | private: |
93 | | RuleID RID = 0; |
94 | | uint8_t DotPos = 0; |
95 | | uint8_t RuleLength = 0; // the length of rule body. |
96 | | }; |
97 | | |
98 | | // A state represents a node in the LR automaton graph. It is an item set, which |
99 | | // contains all possible rules that the LR parser may be parsing in that state. |
100 | | // |
101 | | // Conceptually, If we knew in advance what we're parsing, at any point we're |
102 | | // partway through parsing a production, sitting on a stack of partially parsed |
103 | | // productions. But because we don't know, there could be *several* productions |
104 | | // we're partway through. The set of possibilities is the parser state, and we |
105 | | // precompute all the transitions between these states. |
106 | | struct State { |
107 | | // A full set of items (including non-kernel items) representing the state, |
108 | | // in a canonical order (see SortByNextSymbol in the cpp file). |
109 | | std::vector<Item> Items; |
110 | | |
111 | | std::string dump(const Grammar &G, unsigned Indent = 0) const; |
112 | | }; |
113 | | |
114 | | // LRGraph is a deterministic finite state automaton for LR parsing. |
115 | | // |
116 | | // Intuitively, an LR automaton is a transition graph. The graph has a |
117 | | // collection of nodes, called States. Each state corresponds to a particular |
118 | | // item set, which represents a condition that could occur during the process of |
119 | | // parsing a production. Edges are directed from one state to another. Each edge |
120 | | // is labeled by a grammar symbol (terminal or nonterminal). |
121 | | // |
122 | | // LRGraph is used to construct the LR parsing table which is a core |
123 | | // data-structure driving the LR parser. |
124 | | class LRGraph { |
125 | | public: |
126 | | // StateID is the index in States table. |
127 | | using StateID = uint16_t; |
128 | | |
129 | | // Constructs an LR(0) automaton. |
130 | | static LRGraph buildLR0(const Grammar &); |
131 | | |
132 | | // An edge in the LR graph, it represents a transition in the LR automaton. |
133 | | // If the parser is at state Src, with a lookahead Label, then it |
134 | | // transits to state Dst. |
135 | | struct Edge { |
136 | | StateID Src, Dst; |
137 | | SymbolID Label; |
138 | | }; |
139 | | |
140 | | // A possible error recovery: choose to match some tokens against a symbol. |
141 | | // |
142 | | // e.g. a state that contains |
143 | | // stmt := { . stmt-seq [recover=braces] } |
144 | | // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }. |
145 | | struct Recovery { |
146 | | StateID Src; // The state we are in when encountering the error. |
147 | | ExtensionID Strategy; // Heuristic choosing the tokens to match. |
148 | | SymbolID Result; // The symbol that is produced. |
149 | | }; |
150 | | |
151 | 2.95k | llvm::ArrayRef<State> states() const { return States; } |
152 | 1 | llvm::ArrayRef<Edge> edges() const { return Edges; } |
153 | 1 | llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; } |
154 | 1 | llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const { |
155 | 1 | return StartStates; |
156 | 1 | } |
157 | | |
158 | | std::string dumpForTests(const Grammar &) const; |
159 | | |
160 | | private: |
161 | | LRGraph(std::vector<State> States, std::vector<Edge> Edges, |
162 | | std::vector<Recovery> Recoveries, |
163 | | std::vector<std::pair<SymbolID, StateID>> StartStates) |
164 | | : States(std::move(States)), Edges(std::move(Edges)), |
165 | 1 | Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) { |
166 | 1 | } |
167 | | |
168 | | std::vector<State> States; |
169 | | std::vector<Edge> Edges; |
170 | | std::vector<Recovery> Recoveries; |
171 | | std::vector<std::pair<SymbolID, StateID>> StartStates; |
172 | | }; |
173 | | |
174 | | } // namespace pseudo |
175 | | } // namespace clang |
176 | | |
177 | | namespace llvm { |
178 | | // Support clang::pseudo::Item as DenseMap keys. |
179 | | template <> struct DenseMapInfo<clang::pseudo::Item> { |
180 | 274k | static inline clang::pseudo::Item getEmptyKey() { |
181 | 274k | return clang::pseudo::Item::sentinel(-1); |
182 | 274k | } |
183 | 217k | static inline clang::pseudo::Item getTombstoneKey() { |
184 | 217k | return clang::pseudo::Item::sentinel(-2); |
185 | 217k | } |
186 | 185k | static unsigned getHashValue(const clang::pseudo::Item &I) { |
187 | 185k | return hash_value(I); |
188 | 185k | } |
189 | | static bool isEqual(const clang::pseudo::Item &LHS, |
190 | 1.35M | const clang::pseudo::Item &RHS) { |
191 | 1.35M | return LHS == RHS; |
192 | 1.35M | } |
193 | | }; |
194 | | } // namespace llvm |
195 | | |
196 | | #endif // CLANG_PSEUDO_GRAMMAR_LRGRAPH_H |