/src/llvm-project/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp

Source (jump to first uncovered line)
//===--- Grammar.cpp - Grammar for clang pseudoparser  -----------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/grammar/Grammar.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

namespace clang {
namespace pseudo {

Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
    : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
  assert(Sequence.size() <= Rule::MaxElements);
  llvm::copy(Sequence, this->Sequence);
}

Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
  Underscore = *findNonterminal("_");
}

llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
  assert(isNonterminal(SID));
  const auto &R = T->Nonterminals[SID].RuleRange;
  assert(R.End <= T->Rules.size());
  return llvm::ArrayRef(&T->Rules[R.Start], R.End - R.Start);
}

const Rule &Grammar::lookupRule(RuleID RID) const {
  assert(RID < T->Rules.size());
  return T->Rules[RID];
}

llvm::StringRef Grammar::symbolName(SymbolID SID) const {
  if (isToken(SID))
    return T->Terminals[symbolToToken(SID)];
  return T->Nonterminals[SID].Name;
}

std::optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const {
  auto It = llvm::partition_point(
      T->Nonterminals,
      [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; });
  if (It != T->Nonterminals.end() && It->Name == Name)
    return It - T->Nonterminals.begin();
  return std::nullopt;
}

std::string Grammar::dumpRule(RuleID RID) const {
  std::string Result;
  llvm::raw_string_ostream OS(Result);
  const Rule &R = T->Rules[RID];
  OS << symbolName(R.Target) << " :=";
  for (unsigned I = 0; I < R.Size; ++I) {
    OS << " " << symbolName(R.Sequence[I]);
    if (R.RecoveryIndex == I)
      OS << " [recover=" << T->AttributeValues[R.Recovery] << "]";
  }
  if (R.Guarded)
    OS << " [guard]";
  return Result;
}

std::string Grammar::dumpRules(SymbolID SID) const {
  assert(isNonterminal(SID));
  std::string Result;
  const auto &Range = T->Nonterminals[SID].RuleRange;
  for (RuleID RID = Range.Start; RID < Range.End; ++RID)
    Result.append(dumpRule(RID)).push_back('\n');
  return Result;
}

std::string Grammar::dump() const {
  std::string Result;
  llvm::raw_string_ostream OS(Result);
  OS << "Nonterminals:\n";
  for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
    OS << llvm::formatv("  {0} {1}\n", SID, symbolName(SID));
  OS << "Rules:\n";
  for (RuleID RID = 0; RID < T->Rules.size(); ++RID)
    OS << llvm::formatv("  {0} {1}\n", RID, dumpRule(RID));
  return OS.str();
}

std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
  std::vector<llvm::DenseSet<SymbolID>> FirstSets(
      G.table().Nonterminals.size());
  auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
    assert(isNonterminal(Target));
    if (isToken(First))
      return FirstSets[Target].insert(First).second;
    bool Changed = false;
    for (SymbolID SID : FirstSets[First])
      Changed |= FirstSets[Target].insert(SID).second;
    return Changed;
  };

  // A rule S := T ... implies elements in FIRST(S):
  //  - if T is a terminal, FIRST(S) contains T
  //  - if T is a nonterminal, FIRST(S) contains FIRST(T)
  // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
  // end up being incomplete.
  // We iterate until we hit a fixed point.
  // (This isn't particularly efficient, but table building isn't on the
  // critical path).
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (const auto &R : G.table().Rules)
      // We only need to consider the first element because symbols are
      // non-nullable.
      Changed |= ExpandFirstSet(R.Target, R.seq().front());
  }
  return FirstSets;
}

std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
  auto FirstSets = firstSets(G);
  std::vector<llvm::DenseSet<SymbolID>> FollowSets(
      G.table().Nonterminals.size());
  // Expand the follow set of a nonterminal symbol Y by adding all from the
  // given symbol set.
  auto ExpandFollowSet = [&FollowSets](SymbolID Y,
                                       const llvm::DenseSet<SymbolID> &ToAdd) {
    assert(isNonterminal(Y));
    bool Changed = false;
    for (SymbolID F : ToAdd)
      Changed |= FollowSets[Y].insert(F).second;
    return Changed;
  };
  // Follow sets is computed based on the following 3 rules, the computation
  // is completed at a fixed point where there is no more new symbols can be
  // added to any of the follow sets.
  //
  // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the
  // augmented grammar, in our case it is '_'.
  FollowSets[G.underscore()].insert(tokenSymbol(tok::eof));
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (const auto &R : G.table().Rules) {
      // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
      // FOLLOW(Y).
      for (size_t I = 0; I + 1 < R.seq().size(); ++I) {
        if (isToken(R.seq()[I]))
          continue;
        // We only need to consider the next symbol because symbols are
        // non-nullable.
        SymbolID Next = R.seq()[I + 1];
        if (isToken(Next))
          // First set for a terminal is itself.
          Changed |= ExpandFollowSet(R.seq()[I], {Next});
        else
          Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]);
      }
      // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
      // FOLLOW(Z).
      SymbolID Z = R.seq().back();
      if (isNonterminal(Z))
        Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
    }
  }
  return FollowSets;
}

static llvm::ArrayRef<std::string> getTerminalNames() {
  static const auto &TerminalNames = []() {
    auto TerminalNames = new std::string[NumTerminals];
#define PUNCTUATOR(Tok, Spelling) TerminalNames[tok::Tok] = Spelling;
#define KEYWORD(Keyword, Condition)                                            \
  TerminalNames[tok::kw_##Keyword] = llvm::StringRef(#Keyword).upper();
#define TOK(Tok) TerminalNames[tok::Tok] = llvm::StringRef(#Tok).upper();
#include "clang/Basic/TokenKinds.def"
    return llvm::ArrayRef(TerminalNames, NumTerminals);
  }();
  return TerminalNames;
}
GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}

} // namespace pseudo
} // namespace clang

Line	Count	Source (jump to first uncovered line)
1		//===--- Grammar.cpp - Grammar for clang pseudoparser ------------ C++--===//
2		//
3		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4		// See https://llvm.org/LICENSE.txt for license information.
5		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6		//
7		//===----------------------------------------------------------------------===//
8
9		#include "clang-pseudo/grammar/Grammar.h"
10		#include "clang/Basic/TokenKinds.h"
11		#include "llvm/ADT/ArrayRef.h"
12		#include "llvm/ADT/STLExtras.h"
13		#include "llvm/ADT/StringRef.h"
14		#include "llvm/Support/FormatVariadic.h"
15		#include "llvm/Support/raw_ostream.h"
16		#include <optional>
17
18		namespace clang {
19		namespace pseudo {
20
21		Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
22	884	: Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
23	884	assert(Sequence.size() <= Rule::MaxElements);
24	0	llvm::copy(Sequence, this->Sequence);
25	884	}
26
27	1	Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
28	1	Underscore = *findNonterminal("_");
29	1	}
30
31	0	llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
32	0	assert(isNonterminal(SID));
33	0	const auto &R = T->Nonterminals[SID].RuleRange;
34	0	assert(R.End <= T->Rules.size());
35	0	return llvm::ArrayRef(&T->Rules[R.Start], R.End - R.Start);
36	0	}
37
38	65.6M	const Rule &Grammar::lookupRule(RuleID RID) const {
39	65.6M	assert(RID < T->Rules.size());
40	0	return T->Rules[RID];
41	65.6M	}
42
43	0	llvm::StringRef Grammar::symbolName(SymbolID SID) const {
44	0	if (isToken(SID))
45	0	return T->Terminals[symbolToToken(SID)];
46	0	return T->Nonterminals[SID].Name;
47	0	}
48
49	9.56k	std::optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const {
50	9.56k	auto It = llvm::partition_point(
51	9.56k	T->Nonterminals,
52	76.5k	[&](const GrammarTable::Nonterminal &X) { return X.Name < Name; });
53	9.56k	if (It != T->Nonterminals.end() && It->Name == Name)
54	9.56k	return It - T->Nonterminals.begin();
55	0	return std::nullopt;
56	9.56k	}
57
58	0	std::string Grammar::dumpRule(RuleID RID) const {
59	0	std::string Result;
60	0	llvm::raw_string_ostream OS(Result);
61	0	const Rule &R = T->Rules[RID];
62	0	OS << symbolName(R.Target) << " :=";
63	0	for (unsigned I = 0; I < R.Size; ++I) {
64	0	OS << " " << symbolName(R.Sequence[I]);
65	0	if (R.RecoveryIndex == I)
66	0	OS << " [recover=" << T->AttributeValues[R.Recovery] << "]";
67	0	}
68	0	if (R.Guarded)
69	0	OS << " [guard]";
70	0	return Result;
71	0	}
72
73	0	std::string Grammar::dumpRules(SymbolID SID) const {
74	0	assert(isNonterminal(SID));
75	0	std::string Result;
76	0	const auto &Range = T->Nonterminals[SID].RuleRange;
77	0	for (RuleID RID = Range.Start; RID < Range.End; ++RID)
78	0	Result.append(dumpRule(RID)).push_back('\n');
79	0	return Result;
80	0	}
81
82	0	std::string Grammar::dump() const {
83	0	std::string Result;
84	0	llvm::raw_string_ostream OS(Result);
85	0	OS << "Nonterminals:\n";
86	0	for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
87	0	OS << llvm::formatv(" {0} {1}\n", SID, symbolName(SID));
88	0	OS << "Rules:\n";
89	0	for (RuleID RID = 0; RID < T->Rules.size(); ++RID)
90	0	OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID));
91	0	return OS.str();
92	0	}
93
94	2	std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
95	2	std::vector<llvm::DenseSet<SymbolID>> FirstSets(
96	2	G.table().Nonterminals.size());
97	7.07k	auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
98	7.07k	assert(isNonterminal(Target));
99	7.07k	if (isToken(First))
100	3.37k	return FirstSets[Target].insert(First).second;
101	3.69k	bool Changed = false;
102	3.69k	for (SymbolID SID : FirstSets[First])
103	57.9k	Changed \|= FirstSets[Target].insert(SID).second;
104	3.69k	return Changed;
105	7.07k	};
106
107		// A rule S := T ... implies elements in FIRST(S):
108		// - if T is a terminal, FIRST(S) contains T
109		// - if T is a nonterminal, FIRST(S) contains FIRST(T)
110		// Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
111		// end up being incomplete.
112		// We iterate until we hit a fixed point.
113		// (This isn't particularly efficient, but table building isn't on the
114		// critical path).
115	2	bool Changed = true;
116	10	while (Changed) {
117	8	Changed = false;
118	8	for (const auto &R : G.table().Rules)
119		// We only need to consider the first element because symbols are
120		// non-nullable.
121	7.07k	Changed \|= ExpandFirstSet(R.Target, R.seq().front());
122	8	}
123	2	return FirstSets;
124	2	}
125
126	2	std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
127	2	auto FirstSets = firstSets(G);
128	2	std::vector<llvm::DenseSet<SymbolID>> FollowSets(
129	2	G.table().Nonterminals.size());
130		// Expand the follow set of a nonterminal symbol Y by adding all from the
131		// given symbol set.
132	2	auto ExpandFollowSet = [&FollowSets](SymbolID Y,
133	46.6k	const llvm::DenseSet<SymbolID> &ToAdd) {
134	46.6k	assert(isNonterminal(Y));
135	0	bool Changed = false;
136	46.6k	for (SymbolID F : ToAdd)
137	821k	Changed \|= FollowSets[Y].insert(F).second;
138	46.6k	return Changed;
139	46.6k	};
140		// Follow sets is computed based on the following 3 rules, the computation
141		// is completed at a fixed point where there is no more new symbols can be
142		// added to any of the follow sets.
143		//
144		// Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the
145		// augmented grammar, in our case it is '_'.
146	2	FollowSets[G.underscore()].insert(tokenSymbol(tok::eof));
147	2	bool Changed = true;
148	46	while (Changed) {
149	44	Changed = false;
150	38.8k	for (const auto &R : G.table().Rules) {
151		// Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
152		// FOLLOW(Y).
153	90.6k	for (size_t I = 0; I + 1 < R.seq().size(); ++I) {
154	51.7k	if (isToken(R.seq()[I]))
155	26.5k	continue;
156		// We only need to consider the next symbol because symbols are
157		// non-nullable.
158	25.2k	SymbolID Next = R.seq()[I + 1];
159	25.2k	if (isToken(Next))
160		// First set for a terminal is itself.
161	15.3k	Changed \|= ExpandFollowSet(R.seq()[I], {Next});
162	9.85k	else
163	9.85k	Changed \|= ExpandFollowSet(R.seq()[I], FirstSets[Next]);
164	25.2k	}
165		// Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
166		// FOLLOW(Z).
167	38.8k	SymbolID Z = R.seq().back();
168	38.8k	if (isNonterminal(Z))
169	21.4k	Changed \|= ExpandFollowSet(Z, FollowSets[R.Target]);
170	38.8k	}
171	44	}
172	2	return FollowSets;
173	2	}
174
175	1	static llvm::ArrayRef<std::string> getTerminalNames() {
176	1	static const auto &TerminalNames = []() {
177	1	auto TerminalNames = new std::string[NumTerminals];
178	57	#define PUNCTUATOR(Tok, Spelling) TerminalNames[tok::Tok] = Spelling;
179	1	#define KEYWORD(Keyword, Condition) \
180	323	TerminalNames[tok::kw_##Keyword] = llvm::StringRef(#Keyword).upper();
181	63	#define TOK(Tok) TerminalNames[tok::Tok] = llvm::StringRef(#Tok).upper();
182	1	#include "clang/Basic/TokenKinds.def"
183	1	return llvm::ArrayRef(TerminalNames, NumTerminals);
184	1	}();
185	1	return TerminalNames;
186	1	}
187	1	GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}
188
189		} // namespace pseudo
190		} // namespace clang

Coverage Report

Created: 2024-01-17 10:31