/src/llvm-project/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp

Source (jump to first uncovered line)
//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/grammar/LRGraph.h"
#include "clang-pseudo/grammar/Grammar.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"

using ItemSet = std::vector<clang::pseudo::Item>;

namespace llvm {
// Support clang::pseudo::Item as DenseMap keys.
template <> struct DenseMapInfo<ItemSet> {
  static inline ItemSet getEmptyKey() {
    return {DenseMapInfo<clang::pseudo::Item>::getEmptyKey()};
  }
  static inline ItemSet getTombstoneKey() {
    return {DenseMapInfo<clang::pseudo::Item>::getTombstoneKey()};
  }
  static unsigned getHashValue(const ItemSet &I) {
    return llvm::hash_combine_range(I.begin(), I.end());
  }
  static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) {
    return LHS == RHS;
  }
};
} // namespace llvm

namespace clang {
namespace pseudo {
namespace {

struct SortByNextSymbol {
  SortByNextSymbol(const Grammar &G) : G(G) {}
  bool operator()(const Item &L, const Item &R) {
    if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G))
      return L.next(G) < R.next(G);
    if (L.hasNext() != R.hasNext())
      return L.hasNext() < R.hasNext(); //  a trailing dot is minimal.
    return L < R;
  }
  const Grammar &G;
};

// Computes the closure of a kernel item set:
//  - the result contains every item of the input;
//  - whenever an item has a nonterminal right after its dot, the start item
//    of each rule producing that nonterminal is added (transitively);
//  - the final items are ordered with SortByNextSymbol.
//
// Given
//   Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ]
//   Input = [ E := . T ]
// returns [ E :=  . T, T := . n, T := . ( E ) ]
State closure(ItemSet Queue, const Grammar &G) {
  // Tracks which items are already present in Queue.
  llvm::DenseSet<Item> Seen = {Queue.begin(), Queue.end()};
  // The by-value parameter doubles as the worklist and as the result; the
  // loop bound is re-read every iteration because the body appends to it.
  for (size_t Index = 0; Index < Queue.size(); ++Index) {
    // Taken by value: the push_back below may reallocate Queue.
    const Item Current = Queue[Index];
    if (!Current.hasNext())
      continue;

    const SymbolID Expanded = Current.next(G);
    if (pseudo::isToken(Expanded))
      continue;

    auto Rules = G.table().Nonterminals[Expanded].RuleRange;
    for (RuleID Rule = Rules.Start; Rule < Rules.End; ++Rule) {
      Item Candidate = Item::start(Rule, G);
      const bool IsNew = Seen.insert(Candidate).second;
      if (IsNew)
        Queue.push_back(Candidate);
    }
  }
  Queue.shrink_to_fit();
  llvm::sort(Queue, SortByNextSymbol(G));
  return {std::move(Queue)};
}

// Returns all next (with a dot advanced) kernel item sets, partitioned by the
// advanced symbol.
//
// Given
//  S = [ E := . a b, E := E . - T ]
// returns [
//   {id(a), [ E := a . b ]},
//   {id(-), [ E := E - . T ]}
// ]
std::vector<std::pair<SymbolID, ItemSet>>
nextAvailableKernelItems(const State &S, const Grammar &G) {
  std::vector<std::pair<SymbolID, ItemSet>> Results;
  llvm::ArrayRef<Item> AllItems = S.Items;
  AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); });
  while (!AllItems.empty()) {
    SymbolID AdvancedSymbol = AllItems.front().next(G);
    auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) {
      assert(I.hasNext());
      return I.next(G) == AdvancedSymbol;
    });
    assert(!Batch.empty());
    AllItems = AllItems.drop_front(Batch.size());

    // Advance a dot over the Symbol.
    ItemSet Next;
    for (const Item &I : Batch)
      Next.push_back(I.advance());
    // sort the set to keep order determinism for hash computation.
    llvm::sort(Next);
    Results.push_back({AdvancedSymbol, std::move(Next)});
  }
  return Results;
}

std::vector<std::pair<ExtensionID, SymbolID>>
availableRecovery(const State &S, const Grammar &G) {
  std::vector<std::pair<ExtensionID, SymbolID>> Result;
  for (const Item &I : S.Items) {
    const auto &Rule = G.lookupRule(I.rule());
    if (I.dot() != Rule.RecoveryIndex)
      continue;
    Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]});
  }
  llvm::sort(Result);
  Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
  return Result;
}

} // namespace

std::string Item::dump(const Grammar &G) const {
  const auto &Rule = G.lookupRule(RID);
  auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) {
    std::vector<llvm::StringRef> Results;
    for (auto SID : Syms)
      Results.push_back(G.symbolName(SID));
    return Results;
  };
  return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target),
                       llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
                       Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
      .str();
}

std::string State::dump(const Grammar &G, unsigned Indent) const {
  std::string Result;
  llvm::raw_string_ostream OS(Result);
  for (const auto &Item : Items)
    OS.indent(Indent) << llvm::formatv("{0}\n", Item.dump(G));
  return OS.str();
}

std::string LRGraph::dumpForTests(const Grammar &G) const {
  std::string Result;
  llvm::raw_string_ostream OS(Result);
  OS << "States:\n";
  for (StateID ID = 0; ID < States.size(); ++ID) {
    OS << llvm::formatv("State {0}\n", ID);
    OS << States[ID].dump(G, /*Indent*/ 4);
  }
  for (const auto &E : Edges) {
    OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label),
                        E.Dst);
  }
  return OS.str();
}

// Builds the LR(0) graph for grammar G.
//
// One start state is created per rule of the `_` nonterminal. A worklist then
// expands each state: every symbol that can follow a dot yields a successor
// kernel set, which is either mapped to an existing state or turned into a
// new one (and queued), and an edge labelled with that symbol is recorded.
// Recovery options of each processed state are recorded as well.
LRGraph LRGraph::buildLR0(const Grammar &G) {
  // Accumulates states, edges, recoveries and start states, deduplicating
  // states by their kernel item set.
  class Builder {
  public:
    Builder(const Grammar &G) : G(G) {}

    // Adds a given state if not existed.
    // Returns the state id and whether a new state was created.
    std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
      assert(llvm::is_sorted(KernelItems) &&
             "Item must be sorted before inserting to a hash map!");
      auto It = StatesIndex.find(KernelItems);
      if (It != StatesIndex.end())
        return {It->second, false};
      // closure() takes its argument by value, so KernelItems is copied here
      // and the original can still be moved into the index below.
      States.push_back(closure(KernelItems, G));
      StateID NextStateID = States.size() - 1;
      StatesIndex.insert({std::move(KernelItems), NextStateID});
      return {NextStateID, true};
    }

    void insertEdge(StateID Src, StateID Dst, SymbolID Label) {
      Edges.push_back({Src, Dst, Label});
    }

    void insertRecovery(StateID Src, ExtensionID Strategy, SymbolID Result) {
      Recoveries.push_back({Src, Strategy, Result});
    }

    // Returns a state with the given id.
    // The reference is only valid until the next call to insert(), which may
    // grow the underlying vector.
    const State &find(StateID ID) const {
      assert(ID < States.size());
      return States[ID];
    }

    void addStartState(SymbolID Sym, StateID State) {
      StartStates.push_back({Sym, State});
    }

    // Finalizes the graph; rvalue-qualified because it moves the accumulated
    // containers out of the builder.
    LRGraph build() && {
      States.shrink_to_fit();
      Edges.shrink_to_fit();
      Recoveries.shrink_to_fit();
      llvm::sort(StartStates);
      StartStates.shrink_to_fit();
      return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
                     std::move(StartStates));
    }

  private:
    // Key is the **kernel** item sets.
    llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
    std::vector<State> States;
    std::vector<Edge> Edges;
    std::vector<Recovery> Recoveries;
    const Grammar &G;
    std::vector<std::pair<SymbolID, StateID>> StartStates;
  } Builder(G);

  std::vector<StateID> PendingStates;
  // Initialize states with the start symbol.
  auto RRange = G.table().Nonterminals[G.underscore()].RuleRange;
  for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) {
    auto StartState = std::vector<Item>{Item::start(RID, G)};
    auto Result = Builder.insert(std::move(StartState));
    assert(Result.second && "State must be new");
    PendingStates.push_back(Result.first);

    const Rule &StartRule = G.lookupRule(RID);
    assert(StartRule.Size == 2 &&
           StartRule.seq().back() == tokenSymbol(tok::eof) &&
           "Start rule must be of the form `_ := start-symbol EOF`!");
    Builder.addStartState(StartRule.seq().front(), Result.first);
  }

  while (!PendingStates.empty()) {
    const auto Current = PendingStates.back();
    PendingStates.pop_back();
    // nextAvailableKernelItems() returns its result by value, so the
    // reference obtained from find() is no longer in use when insert() grows
    // the state list. Bind each entry by reference and move the kernel set
    // into insert() to avoid copying the item vector twice per edge.
    for (auto &Next : nextAvailableKernelItems(Builder.find(Current), G)) {
      auto Insert = Builder.insert(std::move(Next.second));
      if (Insert.second) // new state, insert to the pending queue.
        PendingStates.push_back(Insert.first);
      Builder.insertEdge(Current, Insert.first, Next.first);
    }
    for (const auto &Recovery : availableRecovery(Builder.find(Current), G))
      Builder.insertRecovery(Current, Recovery.first, Recovery.second);
  }
  return std::move(Builder).build();
}

} // namespace pseudo
} // namespace clang

Coverage Report

Created: 2024-01-17 10:31

Line	Count	Source (jump to first uncovered line)
1		//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===//
2		//
3		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4		// See https://llvm.org/LICENSE.txt for license information.
5		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6		//
7		//===----------------------------------------------------------------------===//
8
9		#include "clang-pseudo/grammar/LRGraph.h"
10		#include "clang-pseudo/grammar/Grammar.h"
11		#include "llvm/ADT/DenseSet.h"
12		#include "llvm/ADT/Hashing.h"
13		#include "llvm/ADT/STLExtras.h"
14		#include "llvm/ADT/StringExtras.h"
15		#include "llvm/Support/FormatVariadic.h"
16		#include "llvm/Support/raw_ostream.h"
17
18		using ItemSet = std::vector<clang::pseudo::Item>;
19
20		namespace llvm {
21		// Support clang::pseudo::Item as DenseMap keys.
22		template <> struct DenseMapInfo<ItemSet> {
23	31.5k	static inline ItemSet getEmptyKey() {
24	31.5k	return {DenseMapInfo<clang::pseudo::Item>::getEmptyKey()};
25	31.5k	}
26	30.0k	static inline ItemSet getTombstoneKey() {
27	30.0k	return {DenseMapInfo<clang::pseudo::Item>::getTombstoneKey()};
28	30.0k	}
29	30.0k	static unsigned getHashValue(const ItemSet &I) {
30	30.0k	return llvm::hash_combine_range(I.begin(), I.end());
31	30.0k	}
32	131k	static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) {
33	131k	return LHS == RHS;
34	131k	}
35		};
36		} // namespace llvm
37
38		namespace clang {
39		namespace pseudo {
40		namespace {
41
42		struct SortByNextSymbol {
43	1.47k	SortByNextSymbol(const Grammar &G) : G(G) {}
44	478k	bool operator()(const Item &L, const Item &R) {
45	478k	if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G))
46	427k	return L.next(G) < R.next(G);
47	50.4k	if (L.hasNext() != R.hasNext())
48	610	return L.hasNext() < R.hasNext(); // a trailing dot is minimal.
49	49.8k	return L < R;
50	50.4k	}
51		const Grammar &G;
52		};
53
54		// Computes a closure of the given item set S:
55		// - extends the given S to contain all options for parsing next token;
56		// - nonterminals after a dot are recursively expanded into the begin-state
57		// of all production rules that produce that nonterminal;
58		//
59		// Given
60		// Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ]
61		// Input = [ E := . T ]
62		// returns [ E := . T, T := . n, T := . ( E ) ]
63	1.47k	State closure(ItemSet Queue, const Grammar &G) {
64	1.47k	llvm::DenseSet<Item> InQueue = {Queue.begin(), Queue.end()};
65		// We reuse the passed-by-value Queue as the final result, as it's already
66		// initialized to the right elements.
67	1.47k	size_t ItIndex = 0;
68	53.7k	while (ItIndex < Queue.size()) {
69	52.2k	const Item &ExpandingItem = Queue[ItIndex];
70	52.2k	++ItIndex;
71	52.2k	if (!ExpandingItem.hasNext())
72	1.05k	continue;
73
74	51.2k	SymbolID NextSym = ExpandingItem.next(G);
75	51.2k	if (pseudo::isToken(NextSym))
76	24.2k	continue;
77	26.9k	auto RRange = G.table().Nonterminals[NextSym].RuleRange;
78	147k	for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) {
79	120k	Item NewItem = Item::start(RID, G);
80	120k	if (InQueue.insert(NewItem).second) // new
81	49.6k	Queue.push_back(std::move(NewItem));
82	120k	}
83	26.9k	}
84	1.47k	Queue.shrink_to_fit();
85	1.47k	llvm::sort(Queue, SortByNextSymbol(G));
86	1.47k	return {std::move(Queue)};
87	1.47k	}
88
89		// Returns all next (with a dot advanced) kernel item sets, partitioned by the
90		// advanced symbol.
91		//
92		// Given
93		// S = [ E := . a b, E := E . - T ]
94		// returns [
95		// {id(a), [ E := a . b ]},
96		// {id(-), [ E := E - . T ]}
97		// ]
98		std::vector<std::pair<SymbolID, ItemSet>>
99	1.47k	nextAvailableKernelItems(const State &S, const Grammar &G) {
100	1.47k	std::vector<std::pair<SymbolID, ItemSet>> Results;
101	1.47k	llvm::ArrayRef<Item> AllItems = S.Items;
102	1.83k	AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); });
103	28.5k	while (!AllItems.empty()) {
104	27.1k	SymbolID AdvancedSymbol = AllItems.front().next(G);
105	77.5k	auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) {
106	77.5k	assert(I.hasNext());
107	0	return I.next(G) == AdvancedSymbol;
108	77.5k	});
109	27.1k	assert(!Batch.empty());
110	0	AllItems = AllItems.drop_front(Batch.size());
111
112		// Advance a dot over the Symbol.
113	27.1k	ItemSet Next;
114	27.1k	for (const Item &I : Batch)
115	51.2k	Next.push_back(I.advance());
116		// sort the set to keep order determinism for hash computation.
117	27.1k	llvm::sort(Next);
118	27.1k	Results.push_back({AdvancedSymbol, std::move(Next)});
119	27.1k	}
120	1.47k	return Results;
121	1.47k	}
122
123		std::vector<std::pair<ExtensionID, SymbolID>>
124	1.47k	availableRecovery(const State &S, const Grammar &G) {
125	1.47k	std::vector<std::pair<ExtensionID, SymbolID>> Result;
126	52.2k	for (const Item &I : S.Items) {
127	52.2k	const auto &Rule = G.lookupRule(I.rule());
128	52.2k	if (I.dot() != Rule.RecoveryIndex)
129	52.2k	continue;
130	27	Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]});
131	27	}
132	1.47k	llvm::sort(Result);
133	1.47k	Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
134	1.47k	return Result;
135	1.47k	}
136
137		} // namespace
138
139	0	std::string Item::dump(const Grammar &G) const {
140	0	const auto &Rule = G.lookupRule(RID);
141	0	auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) {
142	0	std::vector<llvm::StringRef> Results;
143	0	for (auto SID : Syms)
144	0	Results.push_back(G.symbolName(SID));
145	0	return Results;
146	0	};
147	0	return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target),
148	0	llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
149	0	llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
150	0	Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
151	0	.str();
152	0	}
153
154	0	std::string State::dump(const Grammar &G, unsigned Indent) const {
155	0	std::string Result;
156	0	llvm::raw_string_ostream OS(Result);
157	0	for (const auto &Item : Items)
158	0	OS.indent(Indent) << llvm::formatv("{0}\n", Item.dump(G));
159	0	return OS.str();
160	0	}
161
162	0	std::string LRGraph::dumpForTests(const Grammar &G) const {
163	0	std::string Result;
164	0	llvm::raw_string_ostream OS(Result);
165	0	OS << "States:\n";
166	0	for (StateID ID = 0; ID < States.size(); ++ID) {
167	0	OS << llvm::formatv("State {0}\n", ID);
168	0	OS << States[ID].dump(G, /*Indent*/ 4);
169	0	}
170	0	for (const auto &E : Edges) {
171	0	OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label),
172	0	E.Dst);
173	0	}
174	0	return OS.str();
175	0	}
176
177	1	LRGraph LRGraph::buildLR0(const Grammar &G) {
178	1	class Builder {
179	1	public:
180	1	Builder(const Grammar &G) : G(G) {}
181
182		// Adds a given state if not existed.
183	27.1k	std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
184	27.1k	assert(llvm::is_sorted(KernelItems) &&
185	27.1k	"Item must be sorted before inserting to a hash map!");
186	0	auto It = StatesIndex.find(KernelItems);
187	27.1k	if (It != StatesIndex.end())
188	25.6k	return {It->second, false};
189	1.47k	States.push_back(closure(KernelItems, G));
190	1.47k	StateID NextStateID = States.size() - 1;
191	1.47k	StatesIndex.insert({std::move(KernelItems), NextStateID});
192	1.47k	return {NextStateID, true};
193	27.1k	}
194
195	27.1k	void insertEdge(StateID Src, StateID Dst, SymbolID Label) {
196	27.1k	Edges.push_back({Src, Dst, Label});
197	27.1k	}
198
199	6	void insertRecovery(StateID Src, ExtensionID Strategy, SymbolID Result) {
200	6	Recoveries.push_back({Src, Strategy, Result});
201	6	}
202
203		// Returns a state with the given id.
204	2.95k	const State &find(StateID ID) const {
205	2.95k	assert(ID < States.size());
206	0	return States[ID];
207	2.95k	}
208
209	3	void addStartState(SymbolID Sym, StateID State) {
210	3	StartStates.push_back({Sym, State});
211	3	}
212
213	1	LRGraph build() && {
214	1	States.shrink_to_fit();
215	1	Edges.shrink_to_fit();
216	1	Recoveries.shrink_to_fit();
217	1	llvm::sort(StartStates);
218	1	StartStates.shrink_to_fit();
219	1	return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
220	1	std::move(StartStates));
221	1	}
222
223	1	private:
224		// Key is the **kernel** item sets.
225	1	llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
226	1	std::vector<State> States;
227	1	std::vector<Edge> Edges;
228	1	std::vector<Recovery> Recoveries;
229	1	const Grammar &G;
230	1	std::vector<std::pair<SymbolID, StateID>> StartStates;
231	1	} Builder(G);
232
233	1	std::vector<StateID> PendingStates;
234		// Initialize states with the start symbol.
235	1	auto RRange = G.table().Nonterminals[G.underscore()].RuleRange;
236	4	for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) {
237	3	auto StartState = std::vector<Item>{Item::start(RID, G)};
238	3	auto Result = Builder.insert(std::move(StartState));
239	3	assert(Result.second && "State must be new");
240	0	PendingStates.push_back(Result.first);
241
242	3	const Rule &StartRule = G.lookupRule(RID);
243	3	assert(StartRule.Size == 2 &&
244	3	StartRule.seq().back() == tokenSymbol(tok::eof) &&
245	3	"Start rule must be of the form `_ := start-symbol EOF`!");
246	0	Builder.addStartState(StartRule.seq().front(), Result.first);
247	3	}
248
249	1.47k	while (!PendingStates.empty()) {
250	1.47k	auto StateID = PendingStates.back();
251	1.47k	PendingStates.pop_back();
252	27.1k	for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) {
253	27.1k	auto Insert = Builder.insert(Next.second);
254	27.1k	if (Insert.second) // new state, insert to the pending queue.
255	1.47k	PendingStates.push_back(Insert.first);
256	27.1k	Builder.insertEdge(StateID, Insert.first, Next.first);
257	27.1k	}
258	1.47k	for (auto Recovery : availableRecovery(Builder.find(StateID), G))
259	6	Builder.insertRecovery(StateID, Recovery.first, Recovery.second);
260	1.47k	}
261	1	return std::move(Builder).build();
262	1	}
263
264		} // namespace pseudo
265		} // namespace clang