Coverage Report

Created: 2024-01-17 10:31

/src/llvm-project/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- GrammarBNF.cpp - build grammar from BNF files  ----------*- C++-*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
9
#include "clang-pseudo/grammar/Grammar.h"
10
#include "clang/Basic/TokenKinds.h"
11
#include "llvm/ADT/DenseSet.h"
12
#include "llvm/ADT/STLExtras.h"
13
#include "llvm/ADT/StringExtras.h"
14
#include "llvm/Support/FormatVariadic.h"
15
#include <memory>
16
#include <utility>
17
18
namespace clang {
19
namespace pseudo {
20
21
namespace {
22
static const llvm::StringRef OptSuffix = "_opt";
23
static const llvm::StringRef StartSymbol = "_";
24
25
// Builds grammar from BNF files.
26
class GrammarBuilder {
27
public:
28
  GrammarBuilder(std::vector<std::string> &Diagnostics)
29
1
      : Diagnostics(Diagnostics) {}
30
31
1
  Grammar build(llvm::StringRef BNF) {
32
1
    auto Specs = eliminateOptional(parse(BNF));
33
34
1
    assert(llvm::all_of(Specs,
35
1
                        [](const RuleSpec &R) {
36
1
                          if (R.Target.ends_with(OptSuffix))
37
1
                            return false;
38
1
                          return llvm::all_of(
39
1
                              R.Sequence, [](const RuleSpec::Element &E) {
40
1
                                return !E.Symbol.ends_with(OptSuffix);
41
1
                              });
42
1
                        }) &&
43
1
           "Optional symbols should be eliminated!");
44
45
0
    auto T = std::make_unique<GrammarTable>();
46
47
    // Assemble the name->ID and ID->nonterminal name maps.
48
1
    llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
49
1
    llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
50
51
1
    llvm::DenseSet<llvm::StringRef> UniqueAttributeValues;
52
53
444
    for (uint16_t I = 0; I < NumTerminals; ++I)
54
443
      SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
55
2.94k
    auto Consider = [&](llvm::StringRef Name) {
56
2.94k
      if (!SymbolIds.count(Name))
57
1.94k
        UniqueNonterminals.insert(Name);
58
2.94k
    };
59
884
    for (const auto &Spec : Specs) {
60
884
      Consider(Spec.Target);
61
2.06k
      for (const RuleSpec::Element &Elt : Spec.Sequence) {
62
2.06k
        Consider(Elt.Symbol);
63
2.06k
        for (const auto& KV : Elt.Attributes)
64
53
           UniqueAttributeValues.insert(KV.second);
65
2.06k
      }
66
884
    }
67
243
    for (llvm::StringRef Name : UniqueNonterminals) {
68
243
      T->Nonterminals.emplace_back();
69
243
      T->Nonterminals.back().Name = Name.str();
70
243
    }
71
1
    assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
72
1
           "Too many nonterminals to fit in SymbolID bits!");
73
0
    llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L,
74
1.97k
                                   const GrammarTable::Nonterminal &R) {
75
1.97k
      return L.Name < R.Name;
76
1.97k
    });
77
    // Add an empty string for the corresponding sentinel unset attribute.
78
1
    T->AttributeValues.push_back("");
79
1
    UniqueAttributeValues.erase("");
80
1
    for (llvm::StringRef Name : UniqueAttributeValues) {
81
1
      T->AttributeValues.emplace_back();
82
1
      T->AttributeValues.back() = Name.str();
83
1
    }
84
1
    llvm::sort(T->AttributeValues);
85
1
    assert(T->AttributeValues.front() == "");
86
87
    // Build name -> ID maps for nonterminals.
88
244
    for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
89
243
      SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
90
91
    // Convert the rules.
92
1
    T->Rules.reserve(Specs.size());
93
1
    std::vector<SymbolID> Symbols;
94
2.94k
    auto Lookup = [SymbolIds](llvm::StringRef Name) {
95
2.94k
      auto It = SymbolIds.find(Name);
96
2.94k
      assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
97
0
      return It->second;
98
2.94k
    };
99
884
    for (const auto &Spec : Specs) {
100
884
      assert(Spec.Sequence.size() <= Rule::MaxElements);
101
0
      Symbols.clear();
102
884
      for (const RuleSpec::Element &Elt : Spec.Sequence)
103
2.06k
        Symbols.push_back(Lookup(Elt.Symbol));
104
884
      T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
105
884
      applyAttributes(Spec, *T, T->Rules.back());
106
884
    }
107
108
1
    assert(T->Rules.size() < (1 << RuleBits) &&
109
1
           "Too many rules to fit in RuleID bits!");
110
0
    const auto &SymbolOrder = getTopologicalOrder(T.get());
111
1
    llvm::stable_sort(
112
15.1k
        T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
113
          // Sorted by the topological order of the nonterminal Target.
114
15.1k
          return SymbolOrder[Left.Target] < SymbolOrder[Right.Target];
115
15.1k
        });
116
244
    for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
117
2.38k
      auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) {
118
2.38k
        return SymbolOrder[R.Target] < SymbolOrder[SID];
119
2.38k
      });
120
243
      RuleID Start = StartIt - T->Rules.begin();
121
243
      RuleID End = Start;
122
1.12k
      while (End < T->Rules.size() && T->Rules[End].Target == SID)
123
884
        ++End;
124
243
      T->Nonterminals[SID].RuleRange = {Start, End};
125
243
    }
126
1
    Grammar G(std::move(T));
127
1
    diagnoseGrammar(G);
128
1
    return G;
129
1
  }
130
131
  // Gets topological order for nonterminal symbols.
132
  //
133
  // The topological order is defined as: if a *single* nonterminal A produces
134
  // (or transitively) a nonterminal B (that said, there is a production rule
135
  // B := A), then A is less than B.
136
  //
137
  // It returns the sort key for each symbol, the array is indexed by SymbolID.
138
1
  std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
139
1
    std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
140
884
    for (const auto &Rule : T->Rules) {
141
      // if A := B, A depends on B.
142
884
      if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0]))
143
210
        Dependencies.push_back({Rule.Target, Rule.Sequence[0]});
144
884
    }
145
1
    llvm::sort(Dependencies);
146
1
    std::vector<SymbolID> Order;
147
    // Each nonterminal state flows: NotVisited -> Visiting -> Visited.
148
1
    enum State {
149
1
      NotVisited,
150
1
      Visiting,
151
1
      Visited,
152
1
    };
153
1
    std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
154
453
    std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
155
453
      if (VisitStates[SID] == Visited)
156
210
        return;
157
243
      if (VisitStates[SID] == Visiting) {
158
0
        Diagnostics.push_back(
159
0
            llvm::formatv("The grammar contains a cycle involving symbol {0}",
160
0
                          T->Nonterminals[SID].Name));
161
0
        return;
162
0
      }
163
243
      VisitStates[SID] = Visiting;
164
243
      for (auto It = llvm::lower_bound(Dependencies,
165
243
                                       std::pair<SymbolID, SymbolID>{SID, 0});
166
453
           It != Dependencies.end() && It->first == SID; ++It)
167
210
        DFS(It->second);
168
243
      VisitStates[SID] = Visited;
169
243
      Order.push_back(SID);
170
243
    };
171
244
    for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID)
172
243
      DFS(ID);
173
1
    std::vector<unsigned> Result(T->Nonterminals.size(), 0);
174
244
    for (size_t I = 0; I < Order.size(); ++I)
175
243
      Result[Order[I]] = I;
176
1
    return Result;
177
1
  }
178
179
private:
180
  // Text representation of a BNF grammar rule.
181
  struct RuleSpec {
182
    llvm::StringRef Target;
183
    struct Element {
184
      llvm::StringRef Symbol; // Name of the symbol
185
      // Attributes that are associated to the sequence symbol or rule.
186
      std::vector<std::pair<llvm::StringRef/*Key*/, llvm::StringRef/*Value*/>>
187
          Attributes;
188
    };
189
    std::vector<Element> Sequence;
190
191
0
    std::string toString() const {
192
0
      std::vector<llvm::StringRef> Body;
193
0
      for (const auto &E : Sequence)
194
0
        Body.push_back(E.Symbol);
195
0
      return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " "));
196
0
    }
197
  };
198
199
1
  std::vector<RuleSpec> parse(llvm::StringRef Lines) {
200
1
    std::vector<RuleSpec> Specs;
201
778
    for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
202
778
      Line = Line.trim();
203
      // Strip anything coming after the '#' (comment).
204
29.6k
      Line = Line.take_while([](char C) { return C != '#'; });
205
778
      if (Line.empty())
206
127
        continue;
207
651
      RuleSpec Rule;
208
651
      if (parseLine(Line, Rule))
209
651
        Specs.push_back(std::move(Rule));
210
651
    }
211
1
    return Specs;
212
1
  }
213
214
651
  bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
215
651
    auto Parts = Line.split(":=");
216
651
    if (Parts.first == Line) { // no separator in Line
217
0
      Diagnostics.push_back(
218
0
          llvm::formatv("Failed to parse '{0}': no separator :=", Line).str());
219
0
      return false;
220
0
    }
221
222
651
    Out.Target = Parts.first.trim();
223
651
    Out.Sequence.clear();
224
2.02k
    for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
225
2.02k
      Chunk = Chunk.trim();
226
2.02k
      if (Chunk.empty())
227
651
        continue; // skip empty
228
1.37k
      if (Chunk.starts_with("[") && Chunk.ends_with("]")) {
229
42
        if (Out.Sequence.empty())
230
0
          continue;
231
232
42
        parseAttributes(Chunk, Out.Sequence.back().Attributes);
233
42
        continue;
234
42
      }
235
236
1.32k
      Out.Sequence.push_back({Chunk, /*Attributes=*/{}});
237
1.32k
    }
238
651
    return true;
239
651
  }
240
241
  bool parseAttributes(
242
      llvm::StringRef Content,
243
42
      std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) {
244
42
    assert(Content.starts_with("[") && Content.ends_with("]"));
245
0
    auto KV = Content.drop_front().drop_back().split('=');
246
42
    Out.push_back({KV.first, KV.second.trim()});
247
248
42
    return true;
249
42
  }
250
  // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule.
251
884
  void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) {
252
884
    auto LookupExtensionID = [&T](llvm::StringRef Name) {
253
11
      const auto It = llvm::partition_point(
254
22
          T.AttributeValues, [&](llvm::StringRef X) { return X < Name; });
255
11
      assert(It != T.AttributeValues.end() && *It == Name &&
256
11
             "Didn't find the attribute in AttrValues!");
257
0
      return It - T.AttributeValues.begin();
258
11
    };
259
2.94k
    for (unsigned I = 0; I < Spec.Sequence.size(); ++I) {
260
2.06k
      for (const auto &KV : Spec.Sequence[I].Attributes) {
261
53
        if (KV.first == "guard") {
262
42
          R.Guarded = true;
263
42
        } else if (KV.first == "recover") {
264
11
          R.Recovery = LookupExtensionID(KV.second);
265
11
          R.RecoveryIndex = I;
266
11
        } else {
267
0
          Diagnostics.push_back(
268
0
              llvm::formatv("Unknown attribute '{0}'", KV.first).str());
269
0
        }
270
53
      }
271
2.06k
    }
272
884
  }
273
274
  // Inlines all _opt symbols.
275
  // For example, a rule E := id +_opt id, after elimination, we have two
276
  // equivalent rules:
277
  //   1) E := id + id
278
  //   2) E := id id
279
1
  std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
280
1
    std::vector<RuleSpec> Results;
281
1
    std::vector<RuleSpec::Element> Storage;
282
651
    for (const auto &R : Input) {
283
651
      eliminateOptionalTail(
284
884
          R.Sequence, Storage, [&Results, &Storage, &R, this]() {
285
884
            if (Storage.empty()) {
286
0
              Diagnostics.push_back(
287
0
                  llvm::formatv("Rule '{0}' has a nullable RHS", R.toString()));
288
0
              return;
289
0
            }
290
884
            Results.push_back({R.Target, Storage});
291
884
          });
292
651
      assert(Storage.empty());
293
651
    }
294
1
    return Results;
295
1
  }
296
  void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
297
                             std::vector<RuleSpec::Element> &Result,
298
2.49k
                             llvm::function_ref<void()> CB) {
299
2.49k
    if (Elements.empty())
300
884
      return CB();
301
1.60k
    auto Front = Elements.front();
302
1.60k
    if (!Front.Symbol.ends_with(OptSuffix)) {
303
1.37k
      Result.push_back(std::move(Front));
304
1.37k
      eliminateOptionalTail(Elements.drop_front(1), Result, CB);
305
1.37k
      Result.pop_back();
306
1.37k
      return;
307
1.37k
    }
308
    // Enumerate two options: skip the opt symbol, or inline the symbol.
309
233
    eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
310
233
    Front.Symbol = Front.Symbol.drop_back(OptSuffix.size());   // drop "_opt"
311
233
    Result.push_back(std::move(Front));
312
233
    eliminateOptionalTail(Elements.drop_front(1), Result, CB);
313
233
    Result.pop_back();
314
233
  }
315
316
  // Diagnoses the grammar and emit warnings if any.
317
1
  void diagnoseGrammar(const Grammar &G) {
318
1
    const auto &T = G.table();
319
244
    for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
320
243
      auto Range = T.Nonterminals[SID].RuleRange;
321
243
      if (Range.Start == Range.End)
322
0
        Diagnostics.push_back(
323
0
            llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID)));
324
243
      llvm::StringRef NameRef = T.Nonterminals[SID].Name;
325
243
      if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
326
0
        Diagnostics.push_back(llvm::formatv(
327
0
            "Token-like name {0} is used as a nonterminal", G.symbolName(SID)));
328
0
      }
329
243
    }
330
1
    llvm::DenseSet<llvm::hash_code> VisitedRules;
331
885
    for (RuleID RID = 0; RID < T.Rules.size(); ++RID) {
332
884
      const auto &R = T.Rules[RID];
333
884
      auto Code = llvm::hash_combine(
334
884
          R.Target, llvm::hash_combine_range(R.seq().begin(), R.seq().end()));
335
884
      auto [_, New] = VisitedRules.insert(Code);
336
884
      if (!New)
337
0
        Diagnostics.push_back(
338
0
            llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID)));
339
884
    }
340
    // symbol-id -> used counts
341
1
    std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
342
1
    for (const Rule &R : T.Rules)
343
884
      for (SymbolID SID : R.seq())
344
2.06k
        if (isNonterminal(SID))
345
1.06k
          ++UseCounts[SID];
346
244
    for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
347
243
      if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
348
0
        Diagnostics.push_back(
349
0
            llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID)));
350
1
  }
351
  std::vector<std::string> &Diagnostics;
352
};
353
} // namespace
354
355
Grammar Grammar::parseBNF(llvm::StringRef BNF,
356
1
                          std::vector<std::string> &Diagnostics) {
357
1
  Diagnostics.clear();
358
1
  return GrammarBuilder(Diagnostics).build(BNF);
359
1
}
360
361
} // namespace pseudo
362
} // namespace clang