/src/llvm-project/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- Grammar.cpp - Grammar for clang pseudoparser -----------*- C++-*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | |
9 | | #include "clang-pseudo/grammar/Grammar.h" |
10 | | #include "clang/Basic/TokenKinds.h" |
11 | | #include "llvm/ADT/ArrayRef.h" |
12 | | #include "llvm/ADT/STLExtras.h" |
13 | | #include "llvm/ADT/StringRef.h" |
14 | | #include "llvm/Support/FormatVariadic.h" |
15 | | #include "llvm/Support/raw_ostream.h" |
16 | | #include <optional> |
17 | | |
18 | | namespace clang { |
19 | | namespace pseudo { |
20 | | |
21 | | Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence) |
22 | 884 | : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) { |
23 | 884 | assert(Sequence.size() <= Rule::MaxElements); |
24 | 0 | llvm::copy(Sequence, this->Sequence); |
25 | 884 | } |
26 | | |
27 | 1 | Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) { |
28 | 1 | Underscore = *findNonterminal("_"); |
29 | 1 | } |
30 | | |
31 | 0 | llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const { |
32 | 0 | assert(isNonterminal(SID)); |
33 | 0 | const auto &R = T->Nonterminals[SID].RuleRange; |
34 | 0 | assert(R.End <= T->Rules.size()); |
35 | 0 | return llvm::ArrayRef(&T->Rules[R.Start], R.End - R.Start); |
36 | 0 | } |
37 | | |
38 | 65.6M | const Rule &Grammar::lookupRule(RuleID RID) const { |
39 | 65.6M | assert(RID < T->Rules.size()); |
40 | 0 | return T->Rules[RID]; |
41 | 65.6M | } |
42 | | |
43 | 0 | llvm::StringRef Grammar::symbolName(SymbolID SID) const { |
44 | 0 | if (isToken(SID)) |
45 | 0 | return T->Terminals[symbolToToken(SID)]; |
46 | 0 | return T->Nonterminals[SID].Name; |
47 | 0 | } |
48 | | |
49 | 9.56k | std::optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const { |
50 | 9.56k | auto It = llvm::partition_point( |
51 | 9.56k | T->Nonterminals, |
52 | 76.5k | [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; }); |
53 | 9.56k | if (It != T->Nonterminals.end() && It->Name == Name) |
54 | 9.56k | return It - T->Nonterminals.begin(); |
55 | 0 | return std::nullopt; |
56 | 9.56k | } |
57 | | |
58 | 0 | std::string Grammar::dumpRule(RuleID RID) const { |
59 | 0 | std::string Result; |
60 | 0 | llvm::raw_string_ostream OS(Result); |
61 | 0 | const Rule &R = T->Rules[RID]; |
62 | 0 | OS << symbolName(R.Target) << " :="; |
63 | 0 | for (unsigned I = 0; I < R.Size; ++I) { |
64 | 0 | OS << " " << symbolName(R.Sequence[I]); |
65 | 0 | if (R.RecoveryIndex == I) |
66 | 0 | OS << " [recover=" << T->AttributeValues[R.Recovery] << "]"; |
67 | 0 | } |
68 | 0 | if (R.Guarded) |
69 | 0 | OS << " [guard]"; |
70 | 0 | return Result; |
71 | 0 | } |
72 | | |
73 | 0 | std::string Grammar::dumpRules(SymbolID SID) const { |
74 | 0 | assert(isNonterminal(SID)); |
75 | 0 | std::string Result; |
76 | 0 | const auto &Range = T->Nonterminals[SID].RuleRange; |
77 | 0 | for (RuleID RID = Range.Start; RID < Range.End; ++RID) |
78 | 0 | Result.append(dumpRule(RID)).push_back('\n'); |
79 | 0 | return Result; |
80 | 0 | } |
81 | | |
82 | 0 | std::string Grammar::dump() const { |
83 | 0 | std::string Result; |
84 | 0 | llvm::raw_string_ostream OS(Result); |
85 | 0 | OS << "Nonterminals:\n"; |
86 | 0 | for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) |
87 | 0 | OS << llvm::formatv(" {0} {1}\n", SID, symbolName(SID)); |
88 | 0 | OS << "Rules:\n"; |
89 | 0 | for (RuleID RID = 0; RID < T->Rules.size(); ++RID) |
90 | 0 | OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID)); |
91 | 0 | return OS.str(); |
92 | 0 | } |
93 | | |
94 | 2 | std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) { |
95 | 2 | std::vector<llvm::DenseSet<SymbolID>> FirstSets( |
96 | 2 | G.table().Nonterminals.size()); |
97 | 7.07k | auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) { |
98 | 7.07k | assert(isNonterminal(Target)); |
99 | 7.07k | if (isToken(First)) |
100 | 3.37k | return FirstSets[Target].insert(First).second; |
101 | 3.69k | bool Changed = false; |
102 | 3.69k | for (SymbolID SID : FirstSets[First]) |
103 | 57.9k | Changed |= FirstSets[Target].insert(SID).second; |
104 | 3.69k | return Changed; |
105 | 7.07k | }; |
106 | | |
107 | | // A rule S := T ... implies elements in FIRST(S): |
108 | | // - if T is a terminal, FIRST(S) contains T |
109 | | // - if T is a nonterminal, FIRST(S) contains FIRST(T) |
110 | | // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may |
111 | | // end up being incomplete. |
112 | | // We iterate until we hit a fixed point. |
113 | | // (This isn't particularly efficient, but table building isn't on the |
114 | | // critical path). |
115 | 2 | bool Changed = true; |
116 | 10 | while (Changed) { |
117 | 8 | Changed = false; |
118 | 8 | for (const auto &R : G.table().Rules) |
119 | | // We only need to consider the first element because symbols are |
120 | | // non-nullable. |
121 | 7.07k | Changed |= ExpandFirstSet(R.Target, R.seq().front()); |
122 | 8 | } |
123 | 2 | return FirstSets; |
124 | 2 | } |
125 | | |
126 | 2 | std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) { |
127 | 2 | auto FirstSets = firstSets(G); |
128 | 2 | std::vector<llvm::DenseSet<SymbolID>> FollowSets( |
129 | 2 | G.table().Nonterminals.size()); |
130 | | // Expand the follow set of a nonterminal symbol Y by adding all from the |
131 | | // given symbol set. |
132 | 2 | auto ExpandFollowSet = [&FollowSets](SymbolID Y, |
133 | 46.6k | const llvm::DenseSet<SymbolID> &ToAdd) { |
134 | 46.6k | assert(isNonterminal(Y)); |
135 | 0 | bool Changed = false; |
136 | 46.6k | for (SymbolID F : ToAdd) |
137 | 821k | Changed |= FollowSets[Y].insert(F).second; |
138 | 46.6k | return Changed; |
139 | 46.6k | }; |
140 | | // Follow sets is computed based on the following 3 rules, the computation |
141 | | // is completed at a fixed point where there is no more new symbols can be |
142 | | // added to any of the follow sets. |
143 | | // |
144 | | // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the |
145 | | // augmented grammar, in our case it is '_'. |
146 | 2 | FollowSets[G.underscore()].insert(tokenSymbol(tok::eof)); |
147 | 2 | bool Changed = true; |
148 | 46 | while (Changed) { |
149 | 44 | Changed = false; |
150 | 38.8k | for (const auto &R : G.table().Rules) { |
151 | | // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to |
152 | | // FOLLOW(Y). |
153 | 90.6k | for (size_t I = 0; I + 1 < R.seq().size(); ++I) { |
154 | 51.7k | if (isToken(R.seq()[I])) |
155 | 26.5k | continue; |
156 | | // We only need to consider the next symbol because symbols are |
157 | | // non-nullable. |
158 | 25.2k | SymbolID Next = R.seq()[I + 1]; |
159 | 25.2k | if (isToken(Next)) |
160 | | // First set for a terminal is itself. |
161 | 15.3k | Changed |= ExpandFollowSet(R.seq()[I], {Next}); |
162 | 9.85k | else |
163 | 9.85k | Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]); |
164 | 25.2k | } |
165 | | // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to |
166 | | // FOLLOW(Z). |
167 | 38.8k | SymbolID Z = R.seq().back(); |
168 | 38.8k | if (isNonterminal(Z)) |
169 | 21.4k | Changed |= ExpandFollowSet(Z, FollowSets[R.Target]); |
170 | 38.8k | } |
171 | 44 | } |
172 | 2 | return FollowSets; |
173 | 2 | } |
174 | | |
175 | 1 | static llvm::ArrayRef<std::string> getTerminalNames() { |
176 | 1 | static const auto &TerminalNames = []() { |
177 | 1 | auto TerminalNames = new std::string[NumTerminals]; |
178 | 57 | #define PUNCTUATOR(Tok, Spelling) TerminalNames[tok::Tok] = Spelling; |
179 | 1 | #define KEYWORD(Keyword, Condition) \ |
180 | 323 | TerminalNames[tok::kw_##Keyword] = llvm::StringRef(#Keyword).upper(); |
181 | 63 | #define TOK(Tok) TerminalNames[tok::Tok] = llvm::StringRef(#Tok).upper(); |
182 | 1 | #include "clang/Basic/TokenKinds.def" |
183 | 1 | return llvm::ArrayRef(TerminalNames, NumTerminals); |
184 | 1 | }(); |
185 | 1 | return TerminalNames; |
186 | 1 | } |
187 | 1 | GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {} |
188 | | |
189 | | } // namespace pseudo |
190 | | } // namespace clang |