/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h

Source (jump to first uncovered line)
//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  The LRTable (referred as LR parsing table in the LR literature) is the core
//  component in LR parsers, it drives the LR parsers by specifying an action to
//  take given the current state on the top of the stack and the current
//  lookahead token.
//
//  The LRTable can be described as a matrix where the rows represent
//  the states of the LR graph, the columns represent the symbols of the
//  grammar, and each entry of the matrix (called action) represents a
//  state transition in the graph.
//
//  Typically, based on the category of the grammar symbol, the LRTable is
//  broken into two logically separate tables:
//    - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies
//      next action (shift/reduce) on state S under a lookahead terminal a
//    - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
//      the state which we transist to from the state S with the nonterminal X
//
//  LRTable is *performance-critial* as it is consulted frequently during a
//  parse. In general, LRTable is very sparse (most of the entries are empty).
//  For example, for the C++ language, the SLR table has ~1500 states and 650
//  symbols which results in a matrix having 975K entries, ~90% of entries are
//  empty.
//
//  This file implements a speed-and-space-efficient LRTable.
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_GRAMMAR_LRTABLE_H
#define CLANG_PSEUDO_GRAMMAR_LRTABLE_H

#include "clang-pseudo/grammar/Grammar.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/Capacity.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <vector>

namespace clang {
namespace pseudo {

// Represents the LR parsing table, which can efficiently the question "what is
// the next step given the lookahead token and current state on top of the
// stack?".
//
// This is a dense implementation, which only takes an amount of space that is
// proportional to the number of non-empty entries in the table.
//
// Unlike the typical LR parsing table which allows at most one available action
// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
// to be used in nondeterministic LR parsers (e.g. GLR).
//
// There are no "accept" actions in the LRTable, instead the stack is inspected
// after parsing completes: is the state goto(StartState, StartSymbol)?
class LRTable {
public:
  // StateID is only 13 bits wide.
  using StateID = uint16_t;
  static constexpr unsigned StateBits = 13;

  struct Recovery {
    ExtensionID Strategy;
    SymbolID Result;
  };

  // Returns the state after we reduce a nonterminal.
  // Expected to be called by LR parsers.
  // If the nonterminal is invalid here, returns std::nullopt.
  std::optional<StateID> getGoToState(StateID State,
                                       SymbolID Nonterminal) const {
    return Gotos.get(gotoIndex(State, Nonterminal, numStates()));
  }
  // Returns the state after we shift a terminal.
  // Expected to be called by LR parsers.
  // If the terminal is invalid here, returns std::nullopt.
  std::optional<StateID> getShiftState(StateID State,
                                        SymbolID Terminal) const {
    return Shifts.get(shiftIndex(State, Terminal, numStates()));
  }

  // Returns the possible reductions from a state.
  //
  // These are not keyed by a lookahead token. Instead, call canFollow() to
  // check whether a reduction should apply in the current context:
  //   for (RuleID R : LR.getReduceRules(S)) {
  //     if (!LR.canFollow(G.lookupRule(R).Target, NextToken))
  //       continue;
  //     // ...apply reduce...
  //   }
  llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
    assert(State + 1u < ReduceOffset.size());
    return llvm::ArrayRef(Reduces.data() + ReduceOffset[State],
                          Reduces.data() + ReduceOffset[State + 1]);
  }
  // Returns whether Terminal can follow Nonterminal in a valid source file.
  bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
    assert(isToken(Terminal));
    assert(isNonterminal(Nonterminal));
    // tok::unknown is a sentinel value used in recovery: can follow anything.
    return Terminal == tokenSymbol(tok::unknown) ||
           FollowSets.test(tok::NUM_TOKENS * Nonterminal +
                           symbolToToken(Terminal));
  }

  // Looks up available recovery actions if we stopped parsing in this state.
  llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
    return llvm::ArrayRef(Recoveries.data() + RecoveryOffset[State],
                          Recoveries.data() + RecoveryOffset[State + 1]);
  }

  // Returns the state from which the LR parser should start to parse the input
  // tokens as the given StartSymbol.
  //
  // In LR parsing, the start state of `translation-unit` corresponds to
  // `_ := • translation-unit`.
  //
  // Each start state responds to **a** single grammar rule like `_ := start`.
  // REQUIRE: The given StartSymbol must exist in the grammar (in a form of
  //          `_ := start`).
  StateID getStartState(SymbolID StartSymbol) const;

  size_t bytes() const {
    return sizeof(*this) + Gotos.bytes() + Shifts.bytes() +
           llvm::capacity_in_bytes(Reduces) +
           llvm::capacity_in_bytes(ReduceOffset) +
           llvm::capacity_in_bytes(FollowSets);
  }

  std::string dumpStatistics() const;
  std::string dumpForTests(const Grammar &G) const;

  // Build a SLR(1) parsing table.
  static LRTable buildSLR(const Grammar &G);

  // Helper for building a table with specified actions/states.
  struct Builder {
    Builder() = default;
    Builder(const Grammar &G) {
      NumNonterminals = G.table().Nonterminals.size();
      FollowSets = followSets(G);
    }

    unsigned int NumNonterminals = 0;
    // States representing `_ := . start` for various start symbols.
    std::vector<std::pair<SymbolID, StateID>> StartStates;
    // State transitions `X := ABC . D EFG` => `X := ABC D . EFG`.
    // Key is (initial state, D), value is final state.
    llvm::DenseMap<std::pair<StateID, SymbolID>, StateID> Transition;
    // Reductions available in a given state.
    llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduce;
    // FollowSets[NT] is the set of terminals that can follow the nonterminal.
    std::vector<llvm::DenseSet<SymbolID>> FollowSets;
    // Recovery options available at each state.
    std::vector<std::pair<StateID, Recovery>> Recoveries;

    LRTable build() &&;
  };

private:
  unsigned numStates() const { return ReduceOffset.size() - 1; }

  // A map from unsigned key => StateID, used to store actions.
  // The keys should be sequential but the values are somewhat sparse.
  //
  // In practice, the keys encode (origin state, symbol) pairs, and the values
  // are the state we should move to after seeing that symbol.
  //
  // We store one bit for presence/absence of the value for each key.
  // At every 64th key, we store the offset into the table of values.
  //   e.g. key 0x500 is checkpoint 0x500/64 = 20
  //                     Checkpoints[20] = 34
  //        get(0x500) = Values[34]                (assuming it has a value)
  // To look up values in between, we count the set bits:
  //        get(0x509) has a value if HasValue[20] & (1<<9)
  //        #values between 0x500 and 0x509: popcnt(HasValue[20] & (1<<9 - 1))
  //        get(0x509) = Values[34 + popcnt(...)]
  //
  // Overall size is 1.25 bits/key + 16 bits/value.
  // Lookup is constant time with a low factor (no hashing).
  class TransitionTable {
    using Word = uint64_t;
    constexpr static unsigned WordBits = CHAR_BIT * sizeof(Word);

    std::vector<StateID> Values;
    std::vector<Word> HasValue;
    std::vector<uint16_t> Checkpoints;

  public:
    TransitionTable() = default;
    TransitionTable(const llvm::DenseMap<unsigned, StateID> &Entries,
                    unsigned NumKeys) {
      assert(
          Entries.size() <
              std::numeric_limits<decltype(Checkpoints)::value_type>::max() &&
          "16 bits too small for value offsets!");
      unsigned NumWords = (NumKeys + WordBits - 1) / WordBits;
      HasValue.resize(NumWords, 0);
      Checkpoints.reserve(NumWords);
      Values.reserve(Entries.size());
      for (unsigned I = 0; I < NumKeys; ++I) {
        if ((I % WordBits) == 0)
          Checkpoints.push_back(Values.size());
        auto It = Entries.find(I);
        if (It != Entries.end()) {
          HasValue[I / WordBits] |= (Word(1) << (I % WordBits));
          Values.push_back(It->second);
        }
      }
    }

    std::optional<StateID> get(unsigned Key) const {
      // Do we have a value for this key?
      Word KeyMask = Word(1) << (Key % WordBits);
      unsigned KeyWord = Key / WordBits;
      if ((HasValue[KeyWord] & KeyMask) == 0)
        return std::nullopt;
      // Count the number of values since the checkpoint.
      Word BelowKeyMask = KeyMask - 1;
      unsigned CountSinceCheckpoint =
          llvm::popcount(HasValue[KeyWord] & BelowKeyMask);
      // Find the value relative to the last checkpoint.
      return Values[Checkpoints[KeyWord] + CountSinceCheckpoint];
    }

    unsigned size() const { return Values.size(); }

    size_t bytes() const {
      return llvm::capacity_in_bytes(HasValue) +
             llvm::capacity_in_bytes(Values) +
             llvm::capacity_in_bytes(Checkpoints);
    }
  };
  // Shift and Goto tables are keyed by encoded (State, Symbol).
  static unsigned shiftIndex(StateID State, SymbolID Terminal,
                             unsigned NumStates) {
    return NumStates * symbolToToken(Terminal) + State;
  }
  static unsigned gotoIndex(StateID State, SymbolID Nonterminal,
                            unsigned NumStates) {
    assert(isNonterminal(Nonterminal));
    return NumStates * Nonterminal + State;
  }
  TransitionTable Shifts;
  TransitionTable Gotos;

  // A sorted table, storing the start state for each target parsing symbol.
  std::vector<std::pair<SymbolID, StateID>> StartStates;

  // Given a state ID S, the half-open range of Reduces is
  // [ReduceOffset[S], ReduceOffset[S+1])
  std::vector<uint32_t> ReduceOffset;
  std::vector<RuleID> Reduces;
  // Conceptually this is a bool[SymbolID][Token], each entry describing whether
  // the grammar allows the (nonterminal) symbol to be followed by the token.
  //
  // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
  // as an index: Nonterminal * NUM_TOKENS + Token.
  llvm::BitVector FollowSets;

  // Recovery stores all recovery actions from all states.
  // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
  std::vector<uint32_t> RecoveryOffset;
  std::vector<Recovery> Recoveries;
};

} // namespace pseudo
} // namespace clang

#endif // CLANG_PSEUDO_GRAMMAR_LRTABLE_H

Coverage Report

Created: 2024-01-17 10:31

Line	Count	Source (jump to first uncovered line)
1		//===--- LRTable.h - Define LR Parsing Table ---------------------- C++--===//
2		//
3		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4		// See https://llvm.org/LICENSE.txt for license information.
5		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6		//
7		//===----------------------------------------------------------------------===//
8		//
9		// The LRTable (referred as LR parsing table in the LR literature) is the core
10		// component in LR parsers, it drives the LR parsers by specifying an action to
11		// take given the current state on the top of the stack and the current
12		// lookahead token.
13		//
14		// The LRTable can be described as a matrix where the rows represent
15		// the states of the LR graph, the columns represent the symbols of the
16		// grammar, and each entry of the matrix (called action) represents a
17		// state transition in the graph.
18		//
19		// Typically, based on the category of the grammar symbol, the LRTable is
20		// broken into two logically separate tables:
21		// - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies
22		// next action (shift/reduce) on state S under a lookahead terminal a
23		// - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
24		// the state which we transist to from the state S with the nonterminal X
25		//
26		// LRTable is performance-critial as it is consulted frequently during a
27		// parse. In general, LRTable is very sparse (most of the entries are empty).
28		// For example, for the C++ language, the SLR table has ~1500 states and 650
29		// symbols which results in a matrix having 975K entries, ~90% of entries are
30		// empty.
31		//
32		// This file implements a speed-and-space-efficient LRTable.
33		//
34		//===----------------------------------------------------------------------===//
35
36		#ifndef CLANG_PSEUDO_GRAMMAR_LRTABLE_H
37		#define CLANG_PSEUDO_GRAMMAR_LRTABLE_H
38
39		#include "clang-pseudo/grammar/Grammar.h"
40		#include "llvm/ADT/ArrayRef.h"
41		#include "llvm/ADT/BitVector.h"
42		#include "llvm/ADT/SmallSet.h"
43		#include "llvm/Support/Capacity.h"
44		#include "llvm/Support/MathExtras.h"
45		#include <cstdint>
46		#include <vector>
47
48		namespace clang {
49		namespace pseudo {
50
51		// Represents the LR parsing table, which can efficiently the question "what is
52		// the next step given the lookahead token and current state on top of the
53		// stack?".
54		//
55		// This is a dense implementation, which only takes an amount of space that is
56		// proportional to the number of non-empty entries in the table.
57		//
58		// Unlike the typical LR parsing table which allows at most one available action
59		// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
60		// to be used in nondeterministic LR parsers (e.g. GLR).
61		//
62		// There are no "accept" actions in the LRTable, instead the stack is inspected
63		// after parsing completes: is the state goto(StartState, StartSymbol)?
64		class LRTable {
65		public:
66		// StateID is only 13 bits wide.
67		using StateID = uint16_t;
68		static constexpr unsigned StateBits = 13;
69
70		struct Recovery {
71		ExtensionID Strategy;
72		SymbolID Result;
73		};
74
75		// Returns the state after we reduce a nonterminal.
76		// Expected to be called by LR parsers.
77		// If the nonterminal is invalid here, returns std::nullopt.
78		std::optional<StateID> getGoToState(StateID State,
79	276M	SymbolID Nonterminal) const {
80	276M	return Gotos.get(gotoIndex(State, Nonterminal, numStates()));
81	276M	}
82		// Returns the state after we shift a terminal.
83		// Expected to be called by LR parsers.
84		// If the terminal is invalid here, returns std::nullopt.
85		std::optional<StateID> getShiftState(StateID State,
86	60.6M	SymbolID Terminal) const {
87	60.6M	return Shifts.get(shiftIndex(State, Terminal, numStates()));
88	60.6M	}
89
90		// Returns the possible reductions from a state.
91		//
92		// These are not keyed by a lookahead token. Instead, call canFollow() to
93		// check whether a reduction should apply in the current context:
94		// for (RuleID R : LR.getReduceRules(S)) {
95		// if (!LR.canFollow(G.lookupRule(R).Target, NextToken))
96		// continue;
97		// // ...apply reduce...
98		// }
99	66.5M	llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
100	66.5M	assert(State + 1u < ReduceOffset.size());
101	0	return llvm::ArrayRef(Reduces.data() + ReduceOffset[State],
102	66.5M	Reduces.data() + ReduceOffset[State + 1]);
103	66.5M	}
104		// Returns whether Terminal can follow Nonterminal in a valid source file.
105	63.5M	bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
106	63.5M	assert(isToken(Terminal));
107	0	assert(isNonterminal(Nonterminal));
108		// tok::unknown is a sentinel value used in recovery: can follow anything.
109	63.5M	return Terminal == tokenSymbol(tok::unknown) \|\|
110	63.5M	FollowSets.test(tok::NUM_TOKENS * Nonterminal +
111	62.0M	symbolToToken(Terminal));
112	63.5M	}
113
114		// Looks up available recovery actions if we stopped parsing in this state.
115	10.2M	llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
116	10.2M	return llvm::ArrayRef(Recoveries.data() + RecoveryOffset[State],
117	10.2M	Recoveries.data() + RecoveryOffset[State + 1]);
118	10.2M	}
119
120		// Returns the state from which the LR parser should start to parse the input
121		// tokens as the given StartSymbol.
122		//
123		// In LR parsing, the start state of `translation-unit` corresponds to
124		// `_ := • translation-unit`.
125		//
126		// Each start state responds to a single grammar rule like `_ := start`.
127		// REQUIRE: The given StartSymbol must exist in the grammar (in a form of
128		// `_ := start`).
129		StateID getStartState(SymbolID StartSymbol) const;
130
131	0	size_t bytes() const {
132	0	return sizeof(*this) + Gotos.bytes() + Shifts.bytes() +
133	0	llvm::capacity_in_bytes(Reduces) +
134	0	llvm::capacity_in_bytes(ReduceOffset) +
135	0	llvm::capacity_in_bytes(FollowSets);
136	0	}
137
138		std::string dumpStatistics() const;
139		std::string dumpForTests(const Grammar &G) const;
140
141		// Build a SLR(1) parsing table.
142		static LRTable buildSLR(const Grammar &G);
143
144		// Helper for building a table with specified actions/states.
145		struct Builder {
146		Builder() = default;
147	1	Builder(const Grammar &G) {
148	1	NumNonterminals = G.table().Nonterminals.size();
149	1	FollowSets = followSets(G);
150	1	}
151
152		unsigned int NumNonterminals = 0;
153		// States representing `_ := . start` for various start symbols.
154		std::vector<std::pair<SymbolID, StateID>> StartStates;
155		// State transitions `X := ABC . D EFG` => `X := ABC D . EFG`.
156		// Key is (initial state, D), value is final state.
157		llvm::DenseMap<std::pair<StateID, SymbolID>, StateID> Transition;
158		// Reductions available in a given state.
159		llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduce;
160		// FollowSets[NT] is the set of terminals that can follow the nonterminal.
161		std::vector<llvm::DenseSet<SymbolID>> FollowSets;
162		// Recovery options available at each state.
163		std::vector<std::pair<StateID, Recovery>> Recoveries;
164
165		LRTable build() &&;
166		};
167
168		private:
169	337M	unsigned numStates() const { return ReduceOffset.size() - 1; }
170
171		// A map from unsigned key => StateID, used to store actions.
172		// The keys should be sequential but the values are somewhat sparse.
173		//
174		// In practice, the keys encode (origin state, symbol) pairs, and the values
175		// are the state we should move to after seeing that symbol.
176		//
177		// We store one bit for presence/absence of the value for each key.
178		// At every 64th key, we store the offset into the table of values.
179		// e.g. key 0x500 is checkpoint 0x500/64 = 20
180		// Checkpoints[20] = 34
181		// get(0x500) = Values[34] (assuming it has a value)
182		// To look up values in between, we count the set bits:
183		// get(0x509) has a value if HasValue[20] & (1<<9)
184		// #values between 0x500 and 0x509: popcnt(HasValue[20] & (1<<9 - 1))
185		// get(0x509) = Values[34 + popcnt(...)]
186		//
187		// Overall size is 1.25 bits/key + 16 bits/value.
188		// Lookup is constant time with a low factor (no hashing).
189		class TransitionTable {
190		using Word = uint64_t;
191		constexpr static unsigned WordBits = CHAR_BIT * sizeof(Word);
192
193		std::vector<StateID> Values;
194		std::vector<Word> HasValue;
195		std::vector<uint16_t> Checkpoints;
196
197		public:
198	2	TransitionTable() = default;
199		TransitionTable(const llvm::DenseMap<unsigned, StateID> &Entries,
200	2	unsigned NumKeys) {
201	2	assert(
202	2	Entries.size() <
203	2	std::numeric_limits<decltype(Checkpoints)::value_type>::max() &&
204	2	"16 bits too small for value offsets!");
205	0	unsigned NumWords = (NumKeys + WordBits - 1) / WordBits;
206	2	HasValue.resize(NumWords, 0);
207	2	Checkpoints.reserve(NumWords);
208	2	Values.reserve(Entries.size());
209	1.01M	for (unsigned I = 0; I < NumKeys; ++I) {
210	1.01M	if ((I % WordBits) == 0)
211	15.8k	Checkpoints.push_back(Values.size());
212	1.01M	auto It = Entries.find(I);
213	1.01M	if (It != Entries.end()) {
214	27.1k	HasValue[I / WordBits] \|= (Word(1) << (I % WordBits));
215	27.1k	Values.push_back(It->second);
216	27.1k	}
217	1.01M	}
218	2	}
219
220	337M	std::optional<StateID> get(unsigned Key) const {
221		// Do we have a value for this key?
222	337M	Word KeyMask = Word(1) << (Key % WordBits);
223	337M	unsigned KeyWord = Key / WordBits;
224	337M	if ((HasValue[KeyWord] & KeyMask) == 0)
225	38.4M	return std::nullopt;
226		// Count the number of values since the checkpoint.
227	298M	Word BelowKeyMask = KeyMask - 1;
228	298M	unsigned CountSinceCheckpoint =
229	298M	llvm::popcount(HasValue[KeyWord] & BelowKeyMask);
230		// Find the value relative to the last checkpoint.
231	298M	return Values[Checkpoints[KeyWord] + CountSinceCheckpoint];
232	337M	}
233
234	0	unsigned size() const { return Values.size(); }
235
236	0	size_t bytes() const {
237	0	return llvm::capacity_in_bytes(HasValue) +
238	0	llvm::capacity_in_bytes(Values) +
239	0	llvm::capacity_in_bytes(Checkpoints);
240	0	}
241		};
242		// Shift and Goto tables are keyed by encoded (State, Symbol).
243		static unsigned shiftIndex(StateID State, SymbolID Terminal,
244	60.6M	unsigned NumStates) {
245	60.6M	return NumStates * symbolToToken(Terminal) + State;
246	60.6M	}
247		static unsigned gotoIndex(StateID State, SymbolID Nonterminal,
248	276M	unsigned NumStates) {
249	276M	assert(isNonterminal(Nonterminal));
250	0	return NumStates * Nonterminal + State;
251	276M	}
252		TransitionTable Shifts;
253		TransitionTable Gotos;
254
255		// A sorted table, storing the start state for each target parsing symbol.
256		std::vector<std::pair<SymbolID, StateID>> StartStates;
257
258		// Given a state ID S, the half-open range of Reduces is
259		// [ReduceOffset[S], ReduceOffset[S+1])
260		std::vector<uint32_t> ReduceOffset;
261		std::vector<RuleID> Reduces;
262		// Conceptually this is a bool[SymbolID][Token], each entry describing whether
263		// the grammar allows the (nonterminal) symbol to be followed by the token.
264		//
265		// This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
266		// as an index: Nonterminal * NUM_TOKENS + Token.
267		llvm::BitVector FollowSets;
268
269		// Recovery stores all recovery actions from all states.
270		// A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
271		std::vector<uint32_t> RecoveryOffset;
272		std::vector<Recovery> Recoveries;
273		};
274
275		} // namespace pseudo
276		} // namespace clang
277
278		#endif // CLANG_PSEUDO_GRAMMAR_LRTABLE_H