Coverage Report

Created: 2024-01-17 10:31

/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
Line
Count
Source (jump to first uncovered line)
1
//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
//  The LRTable (referred as LR parsing table in the LR literature) is the core
10
//  component in LR parsers, it drives the LR parsers by specifying an action to
11
//  take given the current state on the top of the stack and the current
12
//  lookahead token.
13
//
14
//  The LRTable can be described as a matrix where the rows represent
15
//  the states of the LR graph, the columns represent the symbols of the
16
//  grammar, and each entry of the matrix (called action) represents a
17
//  state transition in the graph.
18
//
19
//  Typically, based on the category of the grammar symbol, the LRTable is
20
//  broken into two logically separate tables:
21
//    - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies
22
//      next action (shift/reduce) on state S under a lookahead terminal a
23
//    - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
24
//      the state which we transist to from the state S with the nonterminal X
25
//
26
//  LRTable is *performance-critial* as it is consulted frequently during a
27
//  parse. In general, LRTable is very sparse (most of the entries are empty).
28
//  For example, for the C++ language, the SLR table has ~1500 states and 650
29
//  symbols which results in a matrix having 975K entries, ~90% of entries are
30
//  empty.
31
//
32
//  This file implements a speed-and-space-efficient LRTable.
33
//
34
//===----------------------------------------------------------------------===//
35
36
#ifndef CLANG_PSEUDO_GRAMMAR_LRTABLE_H
37
#define CLANG_PSEUDO_GRAMMAR_LRTABLE_H
38
39
#include "clang-pseudo/grammar/Grammar.h"
40
#include "llvm/ADT/ArrayRef.h"
41
#include "llvm/ADT/BitVector.h"
42
#include "llvm/ADT/SmallSet.h"
43
#include "llvm/Support/Capacity.h"
44
#include "llvm/Support/MathExtras.h"
45
#include <cstdint>
46
#include <vector>
47
48
namespace clang {
49
namespace pseudo {
50
51
// Represents the LR parsing table, which can efficiently the question "what is
52
// the next step given the lookahead token and current state on top of the
53
// stack?".
54
//
55
// This is a dense implementation, which only takes an amount of space that is
56
// proportional to the number of non-empty entries in the table.
57
//
58
// Unlike the typical LR parsing table which allows at most one available action
59
// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
60
// to be used in nondeterministic LR parsers (e.g. GLR).
61
//
62
// There are no "accept" actions in the LRTable, instead the stack is inspected
63
// after parsing completes: is the state goto(StartState, StartSymbol)?
64
class LRTable {
65
public:
66
  // StateID is only 13 bits wide.
67
  using StateID = uint16_t;
68
  static constexpr unsigned StateBits = 13;
69
70
  struct Recovery {
71
    ExtensionID Strategy;
72
    SymbolID Result;
73
  };
74
75
  // Returns the state after we reduce a nonterminal.
76
  // Expected to be called by LR parsers.
77
  // If the nonterminal is invalid here, returns std::nullopt.
78
  std::optional<StateID> getGoToState(StateID State,
79
276M
                                       SymbolID Nonterminal) const {
80
276M
    return Gotos.get(gotoIndex(State, Nonterminal, numStates()));
81
276M
  }
82
  // Returns the state after we shift a terminal.
83
  // Expected to be called by LR parsers.
84
  // If the terminal is invalid here, returns std::nullopt.
85
  std::optional<StateID> getShiftState(StateID State,
86
60.6M
                                        SymbolID Terminal) const {
87
60.6M
    return Shifts.get(shiftIndex(State, Terminal, numStates()));
88
60.6M
  }
89
90
  // Returns the possible reductions from a state.
91
  //
92
  // These are not keyed by a lookahead token. Instead, call canFollow() to
93
  // check whether a reduction should apply in the current context:
94
  //   for (RuleID R : LR.getReduceRules(S)) {
95
  //     if (!LR.canFollow(G.lookupRule(R).Target, NextToken))
96
  //       continue;
97
  //     // ...apply reduce...
98
  //   }
99
66.5M
  llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
100
66.5M
    assert(State + 1u < ReduceOffset.size());
101
0
    return llvm::ArrayRef(Reduces.data() + ReduceOffset[State],
102
66.5M
                          Reduces.data() + ReduceOffset[State + 1]);
103
66.5M
  }
104
  // Returns whether Terminal can follow Nonterminal in a valid source file.
105
63.5M
  bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
106
63.5M
    assert(isToken(Terminal));
107
0
    assert(isNonterminal(Nonterminal));
108
    // tok::unknown is a sentinel value used in recovery: can follow anything.
109
63.5M
    return Terminal == tokenSymbol(tok::unknown) ||
110
63.5M
           FollowSets.test(tok::NUM_TOKENS * Nonterminal +
111
62.0M
                           symbolToToken(Terminal));
112
63.5M
  }
113
114
  // Looks up available recovery actions if we stopped parsing in this state.
115
10.2M
  llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
116
10.2M
    return llvm::ArrayRef(Recoveries.data() + RecoveryOffset[State],
117
10.2M
                          Recoveries.data() + RecoveryOffset[State + 1]);
118
10.2M
  }
119
120
  // Returns the state from which the LR parser should start to parse the input
121
  // tokens as the given StartSymbol.
122
  //
123
  // In LR parsing, the start state of `translation-unit` corresponds to
124
  // `_ := • translation-unit`.
125
  //
126
  // Each start state responds to **a** single grammar rule like `_ := start`.
127
  // REQUIRE: The given StartSymbol must exist in the grammar (in a form of
128
  //          `_ := start`).
129
  StateID getStartState(SymbolID StartSymbol) const;
130
131
0
  size_t bytes() const {
132
0
    return sizeof(*this) + Gotos.bytes() + Shifts.bytes() +
133
0
           llvm::capacity_in_bytes(Reduces) +
134
0
           llvm::capacity_in_bytes(ReduceOffset) +
135
0
           llvm::capacity_in_bytes(FollowSets);
136
0
  }
137
138
  std::string dumpStatistics() const;
139
  std::string dumpForTests(const Grammar &G) const;
140
141
  // Build a SLR(1) parsing table.
142
  static LRTable buildSLR(const Grammar &G);
143
144
  // Helper for building a table with specified actions/states.
145
  struct Builder {
146
    Builder() = default;
147
1
    Builder(const Grammar &G) {
148
1
      NumNonterminals = G.table().Nonterminals.size();
149
1
      FollowSets = followSets(G);
150
1
    }
151
152
    unsigned int NumNonterminals = 0;
153
    // States representing `_ := . start` for various start symbols.
154
    std::vector<std::pair<SymbolID, StateID>> StartStates;
155
    // State transitions `X := ABC . D EFG` => `X := ABC D . EFG`.
156
    // Key is (initial state, D), value is final state.
157
    llvm::DenseMap<std::pair<StateID, SymbolID>, StateID> Transition;
158
    // Reductions available in a given state.
159
    llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduce;
160
    // FollowSets[NT] is the set of terminals that can follow the nonterminal.
161
    std::vector<llvm::DenseSet<SymbolID>> FollowSets;
162
    // Recovery options available at each state.
163
    std::vector<std::pair<StateID, Recovery>> Recoveries;
164
165
    LRTable build() &&;
166
  };
167
168
private:
169
337M
  unsigned numStates() const { return ReduceOffset.size() - 1; }
170
171
  // A map from unsigned key => StateID, used to store actions.
172
  // The keys should be sequential but the values are somewhat sparse.
173
  //
174
  // In practice, the keys encode (origin state, symbol) pairs, and the values
175
  // are the state we should move to after seeing that symbol.
176
  //
177
  // We store one bit for presence/absence of the value for each key.
178
  // At every 64th key, we store the offset into the table of values.
179
  //   e.g. key 0x500 is checkpoint 0x500/64 = 20
180
  //                     Checkpoints[20] = 34
181
  //        get(0x500) = Values[34]                (assuming it has a value)
182
  // To look up values in between, we count the set bits:
183
  //        get(0x509) has a value if HasValue[20] & (1<<9)
184
  //        #values between 0x500 and 0x509: popcnt(HasValue[20] & (1<<9 - 1))
185
  //        get(0x509) = Values[34 + popcnt(...)]
186
  //
187
  // Overall size is 1.25 bits/key + 16 bits/value.
188
  // Lookup is constant time with a low factor (no hashing).
189
  class TransitionTable {
190
    using Word = uint64_t;
191
    constexpr static unsigned WordBits = CHAR_BIT * sizeof(Word);
192
193
    std::vector<StateID> Values;
194
    std::vector<Word> HasValue;
195
    std::vector<uint16_t> Checkpoints;
196
197
  public:
198
2
    TransitionTable() = default;
199
    TransitionTable(const llvm::DenseMap<unsigned, StateID> &Entries,
200
2
                    unsigned NumKeys) {
201
2
      assert(
202
2
          Entries.size() <
203
2
              std::numeric_limits<decltype(Checkpoints)::value_type>::max() &&
204
2
          "16 bits too small for value offsets!");
205
0
      unsigned NumWords = (NumKeys + WordBits - 1) / WordBits;
206
2
      HasValue.resize(NumWords, 0);
207
2
      Checkpoints.reserve(NumWords);
208
2
      Values.reserve(Entries.size());
209
1.01M
      for (unsigned I = 0; I < NumKeys; ++I) {
210
1.01M
        if ((I % WordBits) == 0)
211
15.8k
          Checkpoints.push_back(Values.size());
212
1.01M
        auto It = Entries.find(I);
213
1.01M
        if (It != Entries.end()) {
214
27.1k
          HasValue[I / WordBits] |= (Word(1) << (I % WordBits));
215
27.1k
          Values.push_back(It->second);
216
27.1k
        }
217
1.01M
      }
218
2
    }
219
220
337M
    std::optional<StateID> get(unsigned Key) const {
221
      // Do we have a value for this key?
222
337M
      Word KeyMask = Word(1) << (Key % WordBits);
223
337M
      unsigned KeyWord = Key / WordBits;
224
337M
      if ((HasValue[KeyWord] & KeyMask) == 0)
225
38.4M
        return std::nullopt;
226
      // Count the number of values since the checkpoint.
227
298M
      Word BelowKeyMask = KeyMask - 1;
228
298M
      unsigned CountSinceCheckpoint =
229
298M
          llvm::popcount(HasValue[KeyWord] & BelowKeyMask);
230
      // Find the value relative to the last checkpoint.
231
298M
      return Values[Checkpoints[KeyWord] + CountSinceCheckpoint];
232
337M
    }
233
234
0
    unsigned size() const { return Values.size(); }
235
236
0
    size_t bytes() const {
237
0
      return llvm::capacity_in_bytes(HasValue) +
238
0
             llvm::capacity_in_bytes(Values) +
239
0
             llvm::capacity_in_bytes(Checkpoints);
240
0
    }
241
  };
242
  // Shift and Goto tables are keyed by encoded (State, Symbol).
243
  static unsigned shiftIndex(StateID State, SymbolID Terminal,
244
60.6M
                             unsigned NumStates) {
245
60.6M
    return NumStates * symbolToToken(Terminal) + State;
246
60.6M
  }
247
  static unsigned gotoIndex(StateID State, SymbolID Nonterminal,
248
276M
                            unsigned NumStates) {
249
276M
    assert(isNonterminal(Nonterminal));
250
0
    return NumStates * Nonterminal + State;
251
276M
  }
252
  TransitionTable Shifts;
253
  TransitionTable Gotos;
254
255
  // A sorted table, storing the start state for each target parsing symbol.
256
  std::vector<std::pair<SymbolID, StateID>> StartStates;
257
258
  // Given a state ID S, the half-open range of Reduces is
259
  // [ReduceOffset[S], ReduceOffset[S+1])
260
  std::vector<uint32_t> ReduceOffset;
261
  std::vector<RuleID> Reduces;
262
  // Conceptually this is a bool[SymbolID][Token], each entry describing whether
263
  // the grammar allows the (nonterminal) symbol to be followed by the token.
264
  //
265
  // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
266
  // as an index: Nonterminal * NUM_TOKENS + Token.
267
  llvm::BitVector FollowSets;
268
269
  // Recovery stores all recovery actions from all states.
270
  // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
271
  std::vector<uint32_t> RecoveryOffset;
272
  std::vector<Recovery> Recoveries;
273
};
274
275
} // namespace pseudo
276
} // namespace clang
277
278
#endif // CLANG_PSEUDO_GRAMMAR_LRTABLE_H