/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // The LRTable (referred as LR parsing table in the LR literature) is the core |
10 | | // component in LR parsers, it drives the LR parsers by specifying an action to |
11 | | // take given the current state on the top of the stack and the current |
12 | | // lookahead token. |
13 | | // |
14 | | // The LRTable can be described as a matrix where the rows represent |
15 | | // the states of the LR graph, the columns represent the symbols of the |
16 | | // grammar, and each entry of the matrix (called action) represents a |
17 | | // state transition in the graph. |
18 | | // |
19 | | // Typically, based on the category of the grammar symbol, the LRTable is |
20 | | // broken into two logically separate tables: |
21 | | // - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies |
22 | | // next action (shift/reduce) on state S under a lookahead terminal a |
23 | | // - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies |
24 | | // the state which we transist to from the state S with the nonterminal X |
25 | | // |
26 | | // LRTable is *performance-critial* as it is consulted frequently during a |
27 | | // parse. In general, LRTable is very sparse (most of the entries are empty). |
28 | | // For example, for the C++ language, the SLR table has ~1500 states and 650 |
29 | | // symbols which results in a matrix having 975K entries, ~90% of entries are |
30 | | // empty. |
31 | | // |
32 | | // This file implements a speed-and-space-efficient LRTable. |
33 | | // |
34 | | //===----------------------------------------------------------------------===// |
35 | | |
36 | | #ifndef CLANG_PSEUDO_GRAMMAR_LRTABLE_H |
37 | | #define CLANG_PSEUDO_GRAMMAR_LRTABLE_H |
38 | | |
39 | | #include "clang-pseudo/grammar/Grammar.h" |
40 | | #include "llvm/ADT/ArrayRef.h" |
41 | | #include "llvm/ADT/BitVector.h" |
42 | | #include "llvm/ADT/SmallSet.h" |
43 | | #include "llvm/Support/Capacity.h" |
44 | | #include "llvm/Support/MathExtras.h" |
45 | | #include <cstdint> |
46 | | #include <vector> |
47 | | |
48 | | namespace clang { |
49 | | namespace pseudo { |
50 | | |
51 | | // Represents the LR parsing table, which can efficiently the question "what is |
52 | | // the next step given the lookahead token and current state on top of the |
53 | | // stack?". |
54 | | // |
55 | | // This is a dense implementation, which only takes an amount of space that is |
56 | | // proportional to the number of non-empty entries in the table. |
57 | | // |
58 | | // Unlike the typical LR parsing table which allows at most one available action |
59 | | // per entry, conflicted actions are allowed in LRTable. The LRTable is designed |
60 | | // to be used in nondeterministic LR parsers (e.g. GLR). |
61 | | // |
62 | | // There are no "accept" actions in the LRTable, instead the stack is inspected |
63 | | // after parsing completes: is the state goto(StartState, StartSymbol)? |
64 | | class LRTable { |
65 | | public: |
66 | | // StateID is only 13 bits wide. |
67 | | using StateID = uint16_t; |
68 | | static constexpr unsigned StateBits = 13; |
69 | | |
70 | | struct Recovery { |
71 | | ExtensionID Strategy; |
72 | | SymbolID Result; |
73 | | }; |
74 | | |
75 | | // Returns the state after we reduce a nonterminal. |
76 | | // Expected to be called by LR parsers. |
77 | | // If the nonterminal is invalid here, returns std::nullopt. |
78 | | std::optional<StateID> getGoToState(StateID State, |
79 | 276M | SymbolID Nonterminal) const { |
80 | 276M | return Gotos.get(gotoIndex(State, Nonterminal, numStates())); |
81 | 276M | } |
82 | | // Returns the state after we shift a terminal. |
83 | | // Expected to be called by LR parsers. |
84 | | // If the terminal is invalid here, returns std::nullopt. |
85 | | std::optional<StateID> getShiftState(StateID State, |
86 | 60.6M | SymbolID Terminal) const { |
87 | 60.6M | return Shifts.get(shiftIndex(State, Terminal, numStates())); |
88 | 60.6M | } |
89 | | |
90 | | // Returns the possible reductions from a state. |
91 | | // |
92 | | // These are not keyed by a lookahead token. Instead, call canFollow() to |
93 | | // check whether a reduction should apply in the current context: |
94 | | // for (RuleID R : LR.getReduceRules(S)) { |
95 | | // if (!LR.canFollow(G.lookupRule(R).Target, NextToken)) |
96 | | // continue; |
97 | | // // ...apply reduce... |
98 | | // } |
99 | 66.5M | llvm::ArrayRef<RuleID> getReduceRules(StateID State) const { |
100 | 66.5M | assert(State + 1u < ReduceOffset.size()); |
101 | 0 | return llvm::ArrayRef(Reduces.data() + ReduceOffset[State], |
102 | 66.5M | Reduces.data() + ReduceOffset[State + 1]); |
103 | 66.5M | } |
104 | | // Returns whether Terminal can follow Nonterminal in a valid source file. |
105 | 63.5M | bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const { |
106 | 63.5M | assert(isToken(Terminal)); |
107 | 0 | assert(isNonterminal(Nonterminal)); |
108 | | // tok::unknown is a sentinel value used in recovery: can follow anything. |
109 | 63.5M | return Terminal == tokenSymbol(tok::unknown) || |
110 | 63.5M | FollowSets.test(tok::NUM_TOKENS * Nonterminal + |
111 | 62.0M | symbolToToken(Terminal)); |
112 | 63.5M | } |
113 | | |
114 | | // Looks up available recovery actions if we stopped parsing in this state. |
115 | 10.2M | llvm::ArrayRef<Recovery> getRecovery(StateID State) const { |
116 | 10.2M | return llvm::ArrayRef(Recoveries.data() + RecoveryOffset[State], |
117 | 10.2M | Recoveries.data() + RecoveryOffset[State + 1]); |
118 | 10.2M | } |
119 | | |
120 | | // Returns the state from which the LR parser should start to parse the input |
121 | | // tokens as the given StartSymbol. |
122 | | // |
123 | | // In LR parsing, the start state of `translation-unit` corresponds to |
124 | | // `_ := • translation-unit`. |
125 | | // |
126 | | // Each start state responds to **a** single grammar rule like `_ := start`. |
127 | | // REQUIRE: The given StartSymbol must exist in the grammar (in a form of |
128 | | // `_ := start`). |
129 | | StateID getStartState(SymbolID StartSymbol) const; |
130 | | |
131 | 0 | size_t bytes() const { |
132 | 0 | return sizeof(*this) + Gotos.bytes() + Shifts.bytes() + |
133 | 0 | llvm::capacity_in_bytes(Reduces) + |
134 | 0 | llvm::capacity_in_bytes(ReduceOffset) + |
135 | 0 | llvm::capacity_in_bytes(FollowSets); |
136 | 0 | } |
137 | | |
138 | | std::string dumpStatistics() const; |
139 | | std::string dumpForTests(const Grammar &G) const; |
140 | | |
141 | | // Build a SLR(1) parsing table. |
142 | | static LRTable buildSLR(const Grammar &G); |
143 | | |
144 | | // Helper for building a table with specified actions/states. |
145 | | struct Builder { |
146 | | Builder() = default; |
147 | 1 | Builder(const Grammar &G) { |
148 | 1 | NumNonterminals = G.table().Nonterminals.size(); |
149 | 1 | FollowSets = followSets(G); |
150 | 1 | } |
151 | | |
152 | | unsigned int NumNonterminals = 0; |
153 | | // States representing `_ := . start` for various start symbols. |
154 | | std::vector<std::pair<SymbolID, StateID>> StartStates; |
155 | | // State transitions `X := ABC . D EFG` => `X := ABC D . EFG`. |
156 | | // Key is (initial state, D), value is final state. |
157 | | llvm::DenseMap<std::pair<StateID, SymbolID>, StateID> Transition; |
158 | | // Reductions available in a given state. |
159 | | llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduce; |
160 | | // FollowSets[NT] is the set of terminals that can follow the nonterminal. |
161 | | std::vector<llvm::DenseSet<SymbolID>> FollowSets; |
162 | | // Recovery options available at each state. |
163 | | std::vector<std::pair<StateID, Recovery>> Recoveries; |
164 | | |
165 | | LRTable build() &&; |
166 | | }; |
167 | | |
168 | | private: |
169 | 337M | unsigned numStates() const { return ReduceOffset.size() - 1; } |
170 | | |
171 | | // A map from unsigned key => StateID, used to store actions. |
172 | | // The keys should be sequential but the values are somewhat sparse. |
173 | | // |
174 | | // In practice, the keys encode (origin state, symbol) pairs, and the values |
175 | | // are the state we should move to after seeing that symbol. |
176 | | // |
177 | | // We store one bit for presence/absence of the value for each key. |
178 | | // At every 64th key, we store the offset into the table of values. |
179 | | // e.g. key 0x500 is checkpoint 0x500/64 = 20 |
180 | | // Checkpoints[20] = 34 |
181 | | // get(0x500) = Values[34] (assuming it has a value) |
182 | | // To look up values in between, we count the set bits: |
183 | | // get(0x509) has a value if HasValue[20] & (1<<9) |
184 | | // #values between 0x500 and 0x509: popcnt(HasValue[20] & (1<<9 - 1)) |
185 | | // get(0x509) = Values[34 + popcnt(...)] |
186 | | // |
187 | | // Overall size is 1.25 bits/key + 16 bits/value. |
188 | | // Lookup is constant time with a low factor (no hashing). |
189 | | class TransitionTable { |
190 | | using Word = uint64_t; |
191 | | constexpr static unsigned WordBits = CHAR_BIT * sizeof(Word); |
192 | | |
193 | | std::vector<StateID> Values; |
194 | | std::vector<Word> HasValue; |
195 | | std::vector<uint16_t> Checkpoints; |
196 | | |
197 | | public: |
198 | 2 | TransitionTable() = default; |
199 | | TransitionTable(const llvm::DenseMap<unsigned, StateID> &Entries, |
200 | 2 | unsigned NumKeys) { |
201 | 2 | assert( |
202 | 2 | Entries.size() < |
203 | 2 | std::numeric_limits<decltype(Checkpoints)::value_type>::max() && |
204 | 2 | "16 bits too small for value offsets!"); |
205 | 0 | unsigned NumWords = (NumKeys + WordBits - 1) / WordBits; |
206 | 2 | HasValue.resize(NumWords, 0); |
207 | 2 | Checkpoints.reserve(NumWords); |
208 | 2 | Values.reserve(Entries.size()); |
209 | 1.01M | for (unsigned I = 0; I < NumKeys; ++I) { |
210 | 1.01M | if ((I % WordBits) == 0) |
211 | 15.8k | Checkpoints.push_back(Values.size()); |
212 | 1.01M | auto It = Entries.find(I); |
213 | 1.01M | if (It != Entries.end()) { |
214 | 27.1k | HasValue[I / WordBits] |= (Word(1) << (I % WordBits)); |
215 | 27.1k | Values.push_back(It->second); |
216 | 27.1k | } |
217 | 1.01M | } |
218 | 2 | } |
219 | | |
220 | 337M | std::optional<StateID> get(unsigned Key) const { |
221 | | // Do we have a value for this key? |
222 | 337M | Word KeyMask = Word(1) << (Key % WordBits); |
223 | 337M | unsigned KeyWord = Key / WordBits; |
224 | 337M | if ((HasValue[KeyWord] & KeyMask) == 0) |
225 | 38.4M | return std::nullopt; |
226 | | // Count the number of values since the checkpoint. |
227 | 298M | Word BelowKeyMask = KeyMask - 1; |
228 | 298M | unsigned CountSinceCheckpoint = |
229 | 298M | llvm::popcount(HasValue[KeyWord] & BelowKeyMask); |
230 | | // Find the value relative to the last checkpoint. |
231 | 298M | return Values[Checkpoints[KeyWord] + CountSinceCheckpoint]; |
232 | 337M | } |
233 | | |
234 | 0 | unsigned size() const { return Values.size(); } |
235 | | |
236 | 0 | size_t bytes() const { |
237 | 0 | return llvm::capacity_in_bytes(HasValue) + |
238 | 0 | llvm::capacity_in_bytes(Values) + |
239 | 0 | llvm::capacity_in_bytes(Checkpoints); |
240 | 0 | } |
241 | | }; |
242 | | // Shift and Goto tables are keyed by encoded (State, Symbol). |
243 | | static unsigned shiftIndex(StateID State, SymbolID Terminal, |
244 | 60.6M | unsigned NumStates) { |
245 | 60.6M | return NumStates * symbolToToken(Terminal) + State; |
246 | 60.6M | } |
247 | | static unsigned gotoIndex(StateID State, SymbolID Nonterminal, |
248 | 276M | unsigned NumStates) { |
249 | 276M | assert(isNonterminal(Nonterminal)); |
250 | 0 | return NumStates * Nonterminal + State; |
251 | 276M | } |
252 | | TransitionTable Shifts; |
253 | | TransitionTable Gotos; |
254 | | |
255 | | // A sorted table, storing the start state for each target parsing symbol. |
256 | | std::vector<std::pair<SymbolID, StateID>> StartStates; |
257 | | |
258 | | // Given a state ID S, the half-open range of Reduces is |
259 | | // [ReduceOffset[S], ReduceOffset[S+1]) |
260 | | std::vector<uint32_t> ReduceOffset; |
261 | | std::vector<RuleID> Reduces; |
262 | | // Conceptually this is a bool[SymbolID][Token], each entry describing whether |
263 | | // the grammar allows the (nonterminal) symbol to be followed by the token. |
264 | | // |
265 | | // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token) |
266 | | // as an index: Nonterminal * NUM_TOKENS + Token. |
267 | | llvm::BitVector FollowSets; |
268 | | |
269 | | // Recovery stores all recovery actions from all states. |
270 | | // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]). |
271 | | std::vector<uint32_t> RecoveryOffset; |
272 | | std::vector<Recovery> Recoveries; |
273 | | }; |
274 | | |
275 | | } // namespace pseudo |
276 | | } // namespace clang |
277 | | |
278 | | #endif // CLANG_PSEUDO_GRAMMAR_LRTABLE_H |