/src/llvm-project/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- GrammarBNF.cpp - build grammar from BNF files ----------*- C++-*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | |
9 | | #include "clang-pseudo/grammar/Grammar.h" |
10 | | #include "clang/Basic/TokenKinds.h" |
11 | | #include "llvm/ADT/DenseSet.h" |
12 | | #include "llvm/ADT/STLExtras.h" |
13 | | #include "llvm/ADT/StringExtras.h" |
14 | | #include "llvm/Support/FormatVariadic.h" |
15 | | #include <memory> |
16 | | #include <utility> |
17 | | |
18 | | namespace clang { |
19 | | namespace pseudo { |
20 | | |
21 | | namespace { |
22 | | static const llvm::StringRef OptSuffix = "_opt"; |
23 | | static const llvm::StringRef StartSymbol = "_"; |
24 | | |
25 | | // Builds grammar from BNF files. |
26 | | class GrammarBuilder { |
27 | | public: |
28 | | GrammarBuilder(std::vector<std::string> &Diagnostics) |
29 | 1 | : Diagnostics(Diagnostics) {} |
30 | | |
31 | 1 | Grammar build(llvm::StringRef BNF) { |
32 | 1 | auto Specs = eliminateOptional(parse(BNF)); |
33 | | |
34 | 1 | assert(llvm::all_of(Specs, |
35 | 1 | [](const RuleSpec &R) { |
36 | 1 | if (R.Target.ends_with(OptSuffix)) |
37 | 1 | return false; |
38 | 1 | return llvm::all_of( |
39 | 1 | R.Sequence, [](const RuleSpec::Element &E) { |
40 | 1 | return !E.Symbol.ends_with(OptSuffix); |
41 | 1 | }); |
42 | 1 | }) && |
43 | 1 | "Optional symbols should be eliminated!"); |
44 | | |
45 | 0 | auto T = std::make_unique<GrammarTable>(); |
46 | | |
47 | | // Assemble the name->ID and ID->nonterminal name maps. |
48 | 1 | llvm::DenseSet<llvm::StringRef> UniqueNonterminals; |
49 | 1 | llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds; |
50 | | |
51 | 1 | llvm::DenseSet<llvm::StringRef> UniqueAttributeValues; |
52 | | |
53 | 444 | for (uint16_t I = 0; I < NumTerminals; ++I) |
54 | 443 | SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I))); |
55 | 2.94k | auto Consider = [&](llvm::StringRef Name) { |
56 | 2.94k | if (!SymbolIds.count(Name)) |
57 | 1.94k | UniqueNonterminals.insert(Name); |
58 | 2.94k | }; |
59 | 884 | for (const auto &Spec : Specs) { |
60 | 884 | Consider(Spec.Target); |
61 | 2.06k | for (const RuleSpec::Element &Elt : Spec.Sequence) { |
62 | 2.06k | Consider(Elt.Symbol); |
63 | 2.06k | for (const auto& KV : Elt.Attributes) |
64 | 53 | UniqueAttributeValues.insert(KV.second); |
65 | 2.06k | } |
66 | 884 | } |
67 | 243 | for (llvm::StringRef Name : UniqueNonterminals) { |
68 | 243 | T->Nonterminals.emplace_back(); |
69 | 243 | T->Nonterminals.back().Name = Name.str(); |
70 | 243 | } |
71 | 1 | assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) && |
72 | 1 | "Too many nonterminals to fit in SymbolID bits!"); |
73 | 0 | llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L, |
74 | 1.97k | const GrammarTable::Nonterminal &R) { |
75 | 1.97k | return L.Name < R.Name; |
76 | 1.97k | }); |
77 | | // Add an empty string for the corresponding sentinel unset attribute. |
78 | 1 | T->AttributeValues.push_back(""); |
79 | 1 | UniqueAttributeValues.erase(""); |
80 | 1 | for (llvm::StringRef Name : UniqueAttributeValues) { |
81 | 1 | T->AttributeValues.emplace_back(); |
82 | 1 | T->AttributeValues.back() = Name.str(); |
83 | 1 | } |
84 | 1 | llvm::sort(T->AttributeValues); |
85 | 1 | assert(T->AttributeValues.front() == ""); |
86 | | |
87 | | // Build name -> ID maps for nonterminals. |
88 | 244 | for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) |
89 | 243 | SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID); |
90 | | |
91 | | // Convert the rules. |
92 | 1 | T->Rules.reserve(Specs.size()); |
93 | 1 | std::vector<SymbolID> Symbols; |
94 | 2.94k | auto Lookup = [SymbolIds](llvm::StringRef Name) { |
95 | 2.94k | auto It = SymbolIds.find(Name); |
96 | 2.94k | assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!"); |
97 | 0 | return It->second; |
98 | 2.94k | }; |
99 | 884 | for (const auto &Spec : Specs) { |
100 | 884 | assert(Spec.Sequence.size() <= Rule::MaxElements); |
101 | 0 | Symbols.clear(); |
102 | 884 | for (const RuleSpec::Element &Elt : Spec.Sequence) |
103 | 2.06k | Symbols.push_back(Lookup(Elt.Symbol)); |
104 | 884 | T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols)); |
105 | 884 | applyAttributes(Spec, *T, T->Rules.back()); |
106 | 884 | } |
107 | | |
108 | 1 | assert(T->Rules.size() < (1 << RuleBits) && |
109 | 1 | "Too many rules to fit in RuleID bits!"); |
110 | 0 | const auto &SymbolOrder = getTopologicalOrder(T.get()); |
111 | 1 | llvm::stable_sort( |
112 | 15.1k | T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) { |
113 | | // Sorted by the topological order of the nonterminal Target. |
114 | 15.1k | return SymbolOrder[Left.Target] < SymbolOrder[Right.Target]; |
115 | 15.1k | }); |
116 | 244 | for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) { |
117 | 2.38k | auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) { |
118 | 2.38k | return SymbolOrder[R.Target] < SymbolOrder[SID]; |
119 | 2.38k | }); |
120 | 243 | RuleID Start = StartIt - T->Rules.begin(); |
121 | 243 | RuleID End = Start; |
122 | 1.12k | while (End < T->Rules.size() && T->Rules[End].Target == SID) |
123 | 884 | ++End; |
124 | 243 | T->Nonterminals[SID].RuleRange = {Start, End}; |
125 | 243 | } |
126 | 1 | Grammar G(std::move(T)); |
127 | 1 | diagnoseGrammar(G); |
128 | 1 | return G; |
129 | 1 | } |
130 | | |
131 | | // Gets topological order for nonterminal symbols. |
132 | | // |
133 | | // The topological order is defined as: if a *single* nonterminal A produces |
134 | | // (or transitively) a nonterminal B (that said, there is a production rule |
135 | | // B := A), then A is less than B. |
136 | | // |
137 | | // It returns the sort key for each symbol, the array is indexed by SymbolID. |
138 | 1 | std::vector<unsigned> getTopologicalOrder(GrammarTable *T) { |
139 | 1 | std::vector<std::pair<SymbolID, SymbolID>> Dependencies; |
140 | 884 | for (const auto &Rule : T->Rules) { |
141 | | // if A := B, A depends on B. |
142 | 884 | if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0])) |
143 | 210 | Dependencies.push_back({Rule.Target, Rule.Sequence[0]}); |
144 | 884 | } |
145 | 1 | llvm::sort(Dependencies); |
146 | 1 | std::vector<SymbolID> Order; |
147 | | // Each nonterminal state flows: NotVisited -> Visiting -> Visited. |
148 | 1 | enum State { |
149 | 1 | NotVisited, |
150 | 1 | Visiting, |
151 | 1 | Visited, |
152 | 1 | }; |
153 | 1 | std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited); |
154 | 453 | std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void { |
155 | 453 | if (VisitStates[SID] == Visited) |
156 | 210 | return; |
157 | 243 | if (VisitStates[SID] == Visiting) { |
158 | 0 | Diagnostics.push_back( |
159 | 0 | llvm::formatv("The grammar contains a cycle involving symbol {0}", |
160 | 0 | T->Nonterminals[SID].Name)); |
161 | 0 | return; |
162 | 0 | } |
163 | 243 | VisitStates[SID] = Visiting; |
164 | 243 | for (auto It = llvm::lower_bound(Dependencies, |
165 | 243 | std::pair<SymbolID, SymbolID>{SID, 0}); |
166 | 453 | It != Dependencies.end() && It->first == SID; ++It) |
167 | 210 | DFS(It->second); |
168 | 243 | VisitStates[SID] = Visited; |
169 | 243 | Order.push_back(SID); |
170 | 243 | }; |
171 | 244 | for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID) |
172 | 243 | DFS(ID); |
173 | 1 | std::vector<unsigned> Result(T->Nonterminals.size(), 0); |
174 | 244 | for (size_t I = 0; I < Order.size(); ++I) |
175 | 243 | Result[Order[I]] = I; |
176 | 1 | return Result; |
177 | 1 | } |
178 | | |
179 | | private: |
180 | | // Text representation of a BNF grammar rule. |
181 | | struct RuleSpec { |
182 | | llvm::StringRef Target; |
183 | | struct Element { |
184 | | llvm::StringRef Symbol; // Name of the symbol |
185 | | // Attributes that are associated to the sequence symbol or rule. |
186 | | std::vector<std::pair<llvm::StringRef/*Key*/, llvm::StringRef/*Value*/>> |
187 | | Attributes; |
188 | | }; |
189 | | std::vector<Element> Sequence; |
190 | | |
191 | 0 | std::string toString() const { |
192 | 0 | std::vector<llvm::StringRef> Body; |
193 | 0 | for (const auto &E : Sequence) |
194 | 0 | Body.push_back(E.Symbol); |
195 | 0 | return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " ")); |
196 | 0 | } |
197 | | }; |
198 | | |
199 | 1 | std::vector<RuleSpec> parse(llvm::StringRef Lines) { |
200 | 1 | std::vector<RuleSpec> Specs; |
201 | 778 | for (llvm::StringRef Line : llvm::split(Lines, '\n')) { |
202 | 778 | Line = Line.trim(); |
203 | | // Strip anything coming after the '#' (comment). |
204 | 29.6k | Line = Line.take_while([](char C) { return C != '#'; }); |
205 | 778 | if (Line.empty()) |
206 | 127 | continue; |
207 | 651 | RuleSpec Rule; |
208 | 651 | if (parseLine(Line, Rule)) |
209 | 651 | Specs.push_back(std::move(Rule)); |
210 | 651 | } |
211 | 1 | return Specs; |
212 | 1 | } |
213 | | |
214 | 651 | bool parseLine(llvm::StringRef Line, RuleSpec &Out) { |
215 | 651 | auto Parts = Line.split(":="); |
216 | 651 | if (Parts.first == Line) { // no separator in Line |
217 | 0 | Diagnostics.push_back( |
218 | 0 | llvm::formatv("Failed to parse '{0}': no separator :=", Line).str()); |
219 | 0 | return false; |
220 | 0 | } |
221 | | |
222 | 651 | Out.Target = Parts.first.trim(); |
223 | 651 | Out.Sequence.clear(); |
224 | 2.02k | for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) { |
225 | 2.02k | Chunk = Chunk.trim(); |
226 | 2.02k | if (Chunk.empty()) |
227 | 651 | continue; // skip empty |
228 | 1.37k | if (Chunk.starts_with("[") && Chunk.ends_with("]")) { |
229 | 42 | if (Out.Sequence.empty()) |
230 | 0 | continue; |
231 | | |
232 | 42 | parseAttributes(Chunk, Out.Sequence.back().Attributes); |
233 | 42 | continue; |
234 | 42 | } |
235 | | |
236 | 1.32k | Out.Sequence.push_back({Chunk, /*Attributes=*/{}}); |
237 | 1.32k | } |
238 | 651 | return true; |
239 | 651 | } |
240 | | |
241 | | bool parseAttributes( |
242 | | llvm::StringRef Content, |
243 | 42 | std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) { |
244 | 42 | assert(Content.starts_with("[") && Content.ends_with("]")); |
245 | 0 | auto KV = Content.drop_front().drop_back().split('='); |
246 | 42 | Out.push_back({KV.first, KV.second.trim()}); |
247 | | |
248 | 42 | return true; |
249 | 42 | } |
250 | | // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule. |
251 | 884 | void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) { |
252 | 884 | auto LookupExtensionID = [&T](llvm::StringRef Name) { |
253 | 11 | const auto It = llvm::partition_point( |
254 | 22 | T.AttributeValues, [&](llvm::StringRef X) { return X < Name; }); |
255 | 11 | assert(It != T.AttributeValues.end() && *It == Name && |
256 | 11 | "Didn't find the attribute in AttrValues!"); |
257 | 0 | return It - T.AttributeValues.begin(); |
258 | 11 | }; |
259 | 2.94k | for (unsigned I = 0; I < Spec.Sequence.size(); ++I) { |
260 | 2.06k | for (const auto &KV : Spec.Sequence[I].Attributes) { |
261 | 53 | if (KV.first == "guard") { |
262 | 42 | R.Guarded = true; |
263 | 42 | } else if (KV.first == "recover") { |
264 | 11 | R.Recovery = LookupExtensionID(KV.second); |
265 | 11 | R.RecoveryIndex = I; |
266 | 11 | } else { |
267 | 0 | Diagnostics.push_back( |
268 | 0 | llvm::formatv("Unknown attribute '{0}'", KV.first).str()); |
269 | 0 | } |
270 | 53 | } |
271 | 2.06k | } |
272 | 884 | } |
273 | | |
274 | | // Inlines all _opt symbols. |
275 | | // For example, a rule E := id +_opt id, after elimination, we have two |
276 | | // equivalent rules: |
277 | | // 1) E := id + id |
278 | | // 2) E := id id |
279 | 1 | std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) { |
280 | 1 | std::vector<RuleSpec> Results; |
281 | 1 | std::vector<RuleSpec::Element> Storage; |
282 | 651 | for (const auto &R : Input) { |
283 | 651 | eliminateOptionalTail( |
284 | 884 | R.Sequence, Storage, [&Results, &Storage, &R, this]() { |
285 | 884 | if (Storage.empty()) { |
286 | 0 | Diagnostics.push_back( |
287 | 0 | llvm::formatv("Rule '{0}' has a nullable RHS", R.toString())); |
288 | 0 | return; |
289 | 0 | } |
290 | 884 | Results.push_back({R.Target, Storage}); |
291 | 884 | }); |
292 | 651 | assert(Storage.empty()); |
293 | 651 | } |
294 | 1 | return Results; |
295 | 1 | } |
296 | | void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements, |
297 | | std::vector<RuleSpec::Element> &Result, |
298 | 2.49k | llvm::function_ref<void()> CB) { |
299 | 2.49k | if (Elements.empty()) |
300 | 884 | return CB(); |
301 | 1.60k | auto Front = Elements.front(); |
302 | 1.60k | if (!Front.Symbol.ends_with(OptSuffix)) { |
303 | 1.37k | Result.push_back(std::move(Front)); |
304 | 1.37k | eliminateOptionalTail(Elements.drop_front(1), Result, CB); |
305 | 1.37k | Result.pop_back(); |
306 | 1.37k | return; |
307 | 1.37k | } |
308 | | // Enumerate two options: skip the opt symbol, or inline the symbol. |
309 | 233 | eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip |
310 | 233 | Front.Symbol = Front.Symbol.drop_back(OptSuffix.size()); // drop "_opt" |
311 | 233 | Result.push_back(std::move(Front)); |
312 | 233 | eliminateOptionalTail(Elements.drop_front(1), Result, CB); |
313 | 233 | Result.pop_back(); |
314 | 233 | } |
315 | | |
316 | | // Diagnoses the grammar and emit warnings if any. |
317 | 1 | void diagnoseGrammar(const Grammar &G) { |
318 | 1 | const auto &T = G.table(); |
319 | 244 | for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) { |
320 | 243 | auto Range = T.Nonterminals[SID].RuleRange; |
321 | 243 | if (Range.Start == Range.End) |
322 | 0 | Diagnostics.push_back( |
323 | 0 | llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID))); |
324 | 243 | llvm::StringRef NameRef = T.Nonterminals[SID].Name; |
325 | 243 | if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) { |
326 | 0 | Diagnostics.push_back(llvm::formatv( |
327 | 0 | "Token-like name {0} is used as a nonterminal", G.symbolName(SID))); |
328 | 0 | } |
329 | 243 | } |
330 | 1 | llvm::DenseSet<llvm::hash_code> VisitedRules; |
331 | 885 | for (RuleID RID = 0; RID < T.Rules.size(); ++RID) { |
332 | 884 | const auto &R = T.Rules[RID]; |
333 | 884 | auto Code = llvm::hash_combine( |
334 | 884 | R.Target, llvm::hash_combine_range(R.seq().begin(), R.seq().end())); |
335 | 884 | auto [_, New] = VisitedRules.insert(Code); |
336 | 884 | if (!New) |
337 | 0 | Diagnostics.push_back( |
338 | 0 | llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID))); |
339 | 884 | } |
340 | | // symbol-id -> used counts |
341 | 1 | std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0); |
342 | 1 | for (const Rule &R : T.Rules) |
343 | 884 | for (SymbolID SID : R.seq()) |
344 | 2.06k | if (isNonterminal(SID)) |
345 | 1.06k | ++UseCounts[SID]; |
346 | 244 | for (SymbolID SID = 0; SID < UseCounts.size(); ++SID) |
347 | 243 | if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol) |
348 | 0 | Diagnostics.push_back( |
349 | 0 | llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID))); |
350 | 1 | } |
351 | | std::vector<std::string> &Diagnostics; |
352 | | }; |
353 | | } // namespace |
354 | | |
355 | | Grammar Grammar::parseBNF(llvm::StringRef BNF, |
356 | 1 | std::vector<std::string> &Diagnostics) { |
357 | 1 | Diagnostics.clear(); |
358 | 1 | return GrammarBuilder(Diagnostics).build(BNF); |
359 | 1 | } |
360 | | |
361 | | } // namespace pseudo |
362 | | } // namespace clang |