/src/llvm-project/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp

Source (jump to first uncovered line)
//===--- GrammarBNF.cpp - build grammar from BNF files  ----------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/grammar/Grammar.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/FormatVariadic.h"
#include <memory>
#include <utility>

namespace clang {
namespace pseudo {

namespace {
static const llvm::StringRef OptSuffix = "_opt";
static const llvm::StringRef StartSymbol = "_";

// Builds grammar from BNF files.
class GrammarBuilder {
public:
  GrammarBuilder(std::vector<std::string> &Diagnostics)
      : Diagnostics(Diagnostics) {}

  Grammar build(llvm::StringRef BNF) {
    auto Specs = eliminateOptional(parse(BNF));

    assert(llvm::all_of(Specs,
                        [](const RuleSpec &R) {
                          if (R.Target.ends_with(OptSuffix))
                            return false;
                          return llvm::all_of(
                              R.Sequence, [](const RuleSpec::Element &E) {
                                return !E.Symbol.ends_with(OptSuffix);
                              });
                        }) &&
           "Optional symbols should be eliminated!");

    auto T = std::make_unique<GrammarTable>();

    // Assemble the name->ID and ID->nonterminal name maps.
    llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
    llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;

    llvm::DenseSet<llvm::StringRef> UniqueAttributeValues;

    for (uint16_t I = 0; I < NumTerminals; ++I)
      SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
    auto Consider = [&](llvm::StringRef Name) {
      if (!SymbolIds.count(Name))
        UniqueNonterminals.insert(Name);
    };
    for (const auto &Spec : Specs) {
      Consider(Spec.Target);
      for (const RuleSpec::Element &Elt : Spec.Sequence) {
        Consider(Elt.Symbol);
        for (const auto& KV : Elt.Attributes)
           UniqueAttributeValues.insert(KV.second);
      }
    }
    for (llvm::StringRef Name : UniqueNonterminals) {
      T->Nonterminals.emplace_back();
      T->Nonterminals.back().Name = Name.str();
    }
    assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
           "Too many nonterminals to fit in SymbolID bits!");
    llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L,
                                   const GrammarTable::Nonterminal &R) {
      return L.Name < R.Name;
    });
    // Add an empty string for the corresponding sentinel unset attribute.
    T->AttributeValues.push_back("");
    UniqueAttributeValues.erase("");
    for (llvm::StringRef Name : UniqueAttributeValues) {
      T->AttributeValues.emplace_back();
      T->AttributeValues.back() = Name.str();
    }
    llvm::sort(T->AttributeValues);
    assert(T->AttributeValues.front() == "");

    // Build name -> ID maps for nonterminals.
    for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
      SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);

    // Convert the rules.
    T->Rules.reserve(Specs.size());
    std::vector<SymbolID> Symbols;
    auto Lookup = [SymbolIds](llvm::StringRef Name) {
      auto It = SymbolIds.find(Name);
      assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
      return It->second;
    };
    for (const auto &Spec : Specs) {
      assert(Spec.Sequence.size() <= Rule::MaxElements);
      Symbols.clear();
      for (const RuleSpec::Element &Elt : Spec.Sequence)
        Symbols.push_back(Lookup(Elt.Symbol));
      T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
      applyAttributes(Spec, *T, T->Rules.back());
    }

    assert(T->Rules.size() < (1 << RuleBits) &&
           "Too many rules to fit in RuleID bits!");
    const auto &SymbolOrder = getTopologicalOrder(T.get());
    llvm::stable_sort(
        T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
          // Sorted by the topological order of the nonterminal Target.
          return SymbolOrder[Left.Target] < SymbolOrder[Right.Target];
        });
    for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
      auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) {
        return SymbolOrder[R.Target] < SymbolOrder[SID];
      });
      RuleID Start = StartIt - T->Rules.begin();
      RuleID End = Start;
      while (End < T->Rules.size() && T->Rules[End].Target == SID)
        ++End;
      T->Nonterminals[SID].RuleRange = {Start, End};
    }
    Grammar G(std::move(T));
    diagnoseGrammar(G);
    return G;
  }

  // Gets topological order for nonterminal symbols.
  //
  // The topological order is defined as: if a *single* nonterminal A produces
  // (or transitively) a nonterminal B (that said, there is a production rule
  // B := A), then A is less than B.
  //
  // It returns the sort key for each symbol, the array is indexed by SymbolID.
  std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
    std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
    for (const auto &Rule : T->Rules) {
      // if A := B, A depends on B.
      if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0]))
        Dependencies.push_back({Rule.Target, Rule.Sequence[0]});
    }
    llvm::sort(Dependencies);
    std::vector<SymbolID> Order;
    // Each nonterminal state flows: NotVisited -> Visiting -> Visited.
    enum State {
      NotVisited,
      Visiting,
      Visited,
    };
    std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
    std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
      if (VisitStates[SID] == Visited)
        return;
      if (VisitStates[SID] == Visiting) {
        Diagnostics.push_back(
            llvm::formatv("The grammar contains a cycle involving symbol {0}",
                          T->Nonterminals[SID].Name));
        return;
      }
      VisitStates[SID] = Visiting;
      for (auto It = llvm::lower_bound(Dependencies,
                                       std::pair<SymbolID, SymbolID>{SID, 0});
           It != Dependencies.end() && It->first == SID; ++It)
        DFS(It->second);
      VisitStates[SID] = Visited;
      Order.push_back(SID);
    };
    for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID)
      DFS(ID);
    std::vector<unsigned> Result(T->Nonterminals.size(), 0);
    for (size_t I = 0; I < Order.size(); ++I)
      Result[Order[I]] = I;
    return Result;
  }

private:
  // Text representation of a BNF grammar rule.
  struct RuleSpec {
    llvm::StringRef Target;
    struct Element {
      llvm::StringRef Symbol; // Name of the symbol
      // Attributes that are associated to the sequence symbol or rule.
      std::vector<std::pair<llvm::StringRef/*Key*/, llvm::StringRef/*Value*/>>
          Attributes;
    };
    std::vector<Element> Sequence;

    std::string toString() const {
      std::vector<llvm::StringRef> Body;
      for (const auto &E : Sequence)
        Body.push_back(E.Symbol);
      return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " "));
    }
  };

  std::vector<RuleSpec> parse(llvm::StringRef Lines) {
    std::vector<RuleSpec> Specs;
    for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
      Line = Line.trim();
      // Strip anything coming after the '#' (comment).
      Line = Line.take_while([](char C) { return C != '#'; });
      if (Line.empty())
        continue;
      RuleSpec Rule;
      if (parseLine(Line, Rule))
        Specs.push_back(std::move(Rule));
    }
    return Specs;
  }

  bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
    auto Parts = Line.split(":=");
    if (Parts.first == Line) { // no separator in Line
      Diagnostics.push_back(
          llvm::formatv("Failed to parse '{0}': no separator :=", Line).str());
      return false;
    }

    Out.Target = Parts.first.trim();
    Out.Sequence.clear();
    for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
      Chunk = Chunk.trim();
      if (Chunk.empty())
        continue; // skip empty
      if (Chunk.starts_with("[") && Chunk.ends_with("]")) {
        if (Out.Sequence.empty())
          continue;

        parseAttributes(Chunk, Out.Sequence.back().Attributes);
        continue;
      }

      Out.Sequence.push_back({Chunk, /*Attributes=*/{}});
    }
    return true;
  }

  bool parseAttributes(
      llvm::StringRef Content,
      std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) {
    assert(Content.starts_with("[") && Content.ends_with("]"));
    auto KV = Content.drop_front().drop_back().split('=');
    Out.push_back({KV.first, KV.second.trim()});

    return true;
  }
  // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule.
  void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) {
    auto LookupExtensionID = [&T](llvm::StringRef Name) {
      const auto It = llvm::partition_point(
          T.AttributeValues, [&](llvm::StringRef X) { return X < Name; });
      assert(It != T.AttributeValues.end() && *It == Name &&
             "Didn't find the attribute in AttrValues!");
      return It - T.AttributeValues.begin();
    };
    for (unsigned I = 0; I < Spec.Sequence.size(); ++I) {
      for (const auto &KV : Spec.Sequence[I].Attributes) {
        if (KV.first == "guard") {
          R.Guarded = true;
        } else if (KV.first == "recover") {
          R.Recovery = LookupExtensionID(KV.second);
          R.RecoveryIndex = I;
        } else {
          Diagnostics.push_back(
              llvm::formatv("Unknown attribute '{0}'", KV.first).str());
        }
      }
    }
  }

  // Inlines all _opt symbols.
  // For example, a rule E := id +_opt id, after elimination, we have two
  // equivalent rules:
  //   1) E := id + id
  //   2) E := id id
  std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
    std::vector<RuleSpec> Results;
    std::vector<RuleSpec::Element> Storage;
    for (const auto &R : Input) {
      eliminateOptionalTail(
          R.Sequence, Storage, [&Results, &Storage, &R, this]() {
            if (Storage.empty()) {
              Diagnostics.push_back(
                  llvm::formatv("Rule '{0}' has a nullable RHS", R.toString()));
              return;
            }
            Results.push_back({R.Target, Storage});
          });
      assert(Storage.empty());
    }
    return Results;
  }
  void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
                             std::vector<RuleSpec::Element> &Result,
                             llvm::function_ref<void()> CB) {
    if (Elements.empty())
      return CB();
    auto Front = Elements.front();
    if (!Front.Symbol.ends_with(OptSuffix)) {
      Result.push_back(std::move(Front));
      eliminateOptionalTail(Elements.drop_front(1), Result, CB);
      Result.pop_back();
      return;
    }
    // Enumerate two options: skip the opt symbol, or inline the symbol.
    eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
    Front.Symbol = Front.Symbol.drop_back(OptSuffix.size());   // drop "_opt"
    Result.push_back(std::move(Front));
    eliminateOptionalTail(Elements.drop_front(1), Result, CB);
    Result.pop_back();
  }

  // Diagnoses the grammar and emit warnings if any.
  void diagnoseGrammar(const Grammar &G) {
    const auto &T = G.table();
    for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
      auto Range = T.Nonterminals[SID].RuleRange;
      if (Range.Start == Range.End)
        Diagnostics.push_back(
            llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID)));
      llvm::StringRef NameRef = T.Nonterminals[SID].Name;
      if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
        Diagnostics.push_back(llvm::formatv(
            "Token-like name {0} is used as a nonterminal", G.symbolName(SID)));
      }
    }
    llvm::DenseSet<llvm::hash_code> VisitedRules;
    for (RuleID RID = 0; RID < T.Rules.size(); ++RID) {
      const auto &R = T.Rules[RID];
      auto Code = llvm::hash_combine(
          R.Target, llvm::hash_combine_range(R.seq().begin(), R.seq().end()));
      auto [_, New] = VisitedRules.insert(Code);
      if (!New)
        Diagnostics.push_back(
            llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID)));
    }
    // symbol-id -> used counts
    std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
    for (const Rule &R : T.Rules)
      for (SymbolID SID : R.seq())
        if (isNonterminal(SID))
          ++UseCounts[SID];
    for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
      if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
        Diagnostics.push_back(
            llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID)));
  }
  std::vector<std::string> &Diagnostics;
};
} // namespace

Grammar Grammar::parseBNF(llvm::StringRef BNF,
                          std::vector<std::string> &Diagnostics) {
  Diagnostics.clear();
  return GrammarBuilder(Diagnostics).build(BNF);
}

} // namespace pseudo
} // namespace clang

Coverage Report

Created: 2024-01-17 10:31

Line	Count	Source (jump to first uncovered line)
1		//===--- GrammarBNF.cpp - build grammar from BNF files ----------- C++--===//
2		//
3		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4		// See https://llvm.org/LICENSE.txt for license information.
5		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6		//
7		//===----------------------------------------------------------------------===//
8
9		#include "clang-pseudo/grammar/Grammar.h"
10		#include "clang/Basic/TokenKinds.h"
11		#include "llvm/ADT/DenseSet.h"
12		#include "llvm/ADT/STLExtras.h"
13		#include "llvm/ADT/StringExtras.h"
14		#include "llvm/Support/FormatVariadic.h"
15		#include <memory>
16		#include <utility>
17
18		namespace clang {
19		namespace pseudo {
20
21		namespace {
22		static const llvm::StringRef OptSuffix = "_opt";
23		static const llvm::StringRef StartSymbol = "_";
24
25		// Builds grammar from BNF files.
26		class GrammarBuilder {
27		public:
28		GrammarBuilder(std::vector<std::string> &Diagnostics)
29	1	: Diagnostics(Diagnostics) {}
30
31	1	Grammar build(llvm::StringRef BNF) {
32	1	auto Specs = eliminateOptional(parse(BNF));
33
34	1	assert(llvm::all_of(Specs,
35	1	[](const RuleSpec &R) {
36	1	if (R.Target.ends_with(OptSuffix))
37	1	return false;
38	1	return llvm::all_of(
39	1	R.Sequence, [](const RuleSpec::Element &E) {
40	1	return !E.Symbol.ends_with(OptSuffix);
41	1	});
42	1	}) &&
43	1	"Optional symbols should be eliminated!");
44
45	0	auto T = std::make_unique<GrammarTable>();
46
47		// Assemble the name->ID and ID->nonterminal name maps.
48	1	llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
49	1	llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
50
51	1	llvm::DenseSet<llvm::StringRef> UniqueAttributeValues;
52
53	444	for (uint16_t I = 0; I < NumTerminals; ++I)
54	443	SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
55	2.94k	auto Consider = [&](llvm::StringRef Name) {
56	2.94k	if (!SymbolIds.count(Name))
57	1.94k	UniqueNonterminals.insert(Name);
58	2.94k	};
59	884	for (const auto &Spec : Specs) {
60	884	Consider(Spec.Target);
61	2.06k	for (const RuleSpec::Element &Elt : Spec.Sequence) {
62	2.06k	Consider(Elt.Symbol);
63	2.06k	for (const auto& KV : Elt.Attributes)
64	53	UniqueAttributeValues.insert(KV.second);
65	2.06k	}
66	884	}
67	243	for (llvm::StringRef Name : UniqueNonterminals) {
68	243	T->Nonterminals.emplace_back();
69	243	T->Nonterminals.back().Name = Name.str();
70	243	}
71	1	assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
72	1	"Too many nonterminals to fit in SymbolID bits!");
73	0	llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L,
74	1.97k	const GrammarTable::Nonterminal &R) {
75	1.97k	return L.Name < R.Name;
76	1.97k	});
77		// Add an empty string for the corresponding sentinel unset attribute.
78	1	T->AttributeValues.push_back("");
79	1	UniqueAttributeValues.erase("");
80	1	for (llvm::StringRef Name : UniqueAttributeValues) {
81	1	T->AttributeValues.emplace_back();
82	1	T->AttributeValues.back() = Name.str();
83	1	}
84	1	llvm::sort(T->AttributeValues);
85	1	assert(T->AttributeValues.front() == "");
86
87		// Build name -> ID maps for nonterminals.
88	244	for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
89	243	SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
90
91		// Convert the rules.
92	1	T->Rules.reserve(Specs.size());
93	1	std::vector<SymbolID> Symbols;
94	2.94k	auto Lookup = [SymbolIds](llvm::StringRef Name) {
95	2.94k	auto It = SymbolIds.find(Name);
96	2.94k	assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
97	0	return It->second;
98	2.94k	};
99	884	for (const auto &Spec : Specs) {
100	884	assert(Spec.Sequence.size() <= Rule::MaxElements);
101	0	Symbols.clear();
102	884	for (const RuleSpec::Element &Elt : Spec.Sequence)
103	2.06k	Symbols.push_back(Lookup(Elt.Symbol));
104	884	T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
105	884	applyAttributes(Spec, *T, T->Rules.back());
106	884	}
107
108	1	assert(T->Rules.size() < (1 << RuleBits) &&
109	1	"Too many rules to fit in RuleID bits!");
110	0	const auto &SymbolOrder = getTopologicalOrder(T.get());
111	1	llvm::stable_sort(
112	15.1k	T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
113		// Sorted by the topological order of the nonterminal Target.
114	15.1k	return SymbolOrder[Left.Target] < SymbolOrder[Right.Target];
115	15.1k	});
116	244	for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
117	2.38k	auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) {
118	2.38k	return SymbolOrder[R.Target] < SymbolOrder[SID];
119	2.38k	});
120	243	RuleID Start = StartIt - T->Rules.begin();
121	243	RuleID End = Start;
122	1.12k	while (End < T->Rules.size() && T->Rules[End].Target == SID)
123	884	++End;
124	243	T->Nonterminals[SID].RuleRange = {Start, End};
125	243	}
126	1	Grammar G(std::move(T));
127	1	diagnoseGrammar(G);
128	1	return G;
129	1	}
130
131		// Gets topological order for nonterminal symbols.
132		//
133		// The topological order is defined as: if a single nonterminal A produces
134		// (or transitively) a nonterminal B (that said, there is a production rule
135		// B := A), then A is less than B.
136		//
137		// It returns the sort key for each symbol, the array is indexed by SymbolID.
138	1	std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
139	1	std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
140	884	for (const auto &Rule : T->Rules) {
141		// if A := B, A depends on B.
142	884	if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0]))
143	210	Dependencies.push_back({Rule.Target, Rule.Sequence[0]});
144	884	}
145	1	llvm::sort(Dependencies);
146	1	std::vector<SymbolID> Order;
147		// Each nonterminal state flows: NotVisited -> Visiting -> Visited.
148	1	enum State {
149	1	NotVisited,
150	1	Visiting,
151	1	Visited,
152	1	};
153	1	std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
154	453	std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
155	453	if (VisitStates[SID] == Visited)
156	210	return;
157	243	if (VisitStates[SID] == Visiting) {
158	0	Diagnostics.push_back(
159	0	llvm::formatv("The grammar contains a cycle involving symbol {0}",
160	0	T->Nonterminals[SID].Name));
161	0	return;
162	0	}
163	243	VisitStates[SID] = Visiting;
164	243	for (auto It = llvm::lower_bound(Dependencies,
165	243	std::pair<SymbolID, SymbolID>{SID, 0});
166	453	It != Dependencies.end() && It->first == SID; ++It)
167	210	DFS(It->second);
168	243	VisitStates[SID] = Visited;
169	243	Order.push_back(SID);
170	243	};
171	244	for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID)
172	243	DFS(ID);
173	1	std::vector<unsigned> Result(T->Nonterminals.size(), 0);
174	244	for (size_t I = 0; I < Order.size(); ++I)
175	243	Result[Order[I]] = I;
176	1	return Result;
177	1	}
178
179		private:
180		// Text representation of a BNF grammar rule.
181		struct RuleSpec {
182		llvm::StringRef Target;
183		struct Element {
184		llvm::StringRef Symbol; // Name of the symbol
185		// Attributes that are associated to the sequence symbol or rule.
186		std::vector<std::pair<llvm::StringRef/Key/, llvm::StringRef/Value/>>
187		Attributes;
188		};
189		std::vector<Element> Sequence;
190
191	0	std::string toString() const {
192	0	std::vector<llvm::StringRef> Body;
193	0	for (const auto &E : Sequence)
194	0	Body.push_back(E.Symbol);
195	0	return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " "));
196	0	}
197		};
198
199	1	std::vector<RuleSpec> parse(llvm::StringRef Lines) {
200	1	std::vector<RuleSpec> Specs;
201	778	for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
202	778	Line = Line.trim();
203		// Strip anything coming after the '#' (comment).
204	29.6k	Line = Line.take_while([](char C) { return C != '#'; });
205	778	if (Line.empty())
206	127	continue;
207	651	RuleSpec Rule;
208	651	if (parseLine(Line, Rule))
209	651	Specs.push_back(std::move(Rule));
210	651	}
211	1	return Specs;
212	1	}
213
214	651	bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
215	651	auto Parts = Line.split(":=");
216	651	if (Parts.first == Line) { // no separator in Line
217	0	Diagnostics.push_back(
218	0	llvm::formatv("Failed to parse '{0}': no separator :=", Line).str());
219	0	return false;
220	0	}
221
222	651	Out.Target = Parts.first.trim();
223	651	Out.Sequence.clear();
224	2.02k	for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
225	2.02k	Chunk = Chunk.trim();
226	2.02k	if (Chunk.empty())
227	651	continue; // skip empty
228	1.37k	if (Chunk.starts_with("[") && Chunk.ends_with("]")) {
229	42	if (Out.Sequence.empty())
230	0	continue;
231
232	42	parseAttributes(Chunk, Out.Sequence.back().Attributes);
233	42	continue;
234	42	}
235
236	1.32k	Out.Sequence.push_back({Chunk, /Attributes=/{}});
237	1.32k	}
238	651	return true;
239	651	}
240
241		bool parseAttributes(
242		llvm::StringRef Content,
243	42	std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) {
244	42	assert(Content.starts_with("[") && Content.ends_with("]"));
245	0	auto KV = Content.drop_front().drop_back().split('=');
246	42	Out.push_back({KV.first, KV.second.trim()});
247
248	42	return true;
249	42	}
250		// Apply the parsed extensions (stored in RuleSpec) to the grammar Rule.
251	884	void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) {
252	884	auto LookupExtensionID = [&T](llvm::StringRef Name) {
253	11	const auto It = llvm::partition_point(
254	22	T.AttributeValues, [&](llvm::StringRef X) { return X < Name; });
255	11	assert(It != T.AttributeValues.end() && *It == Name &&
256	11	"Didn't find the attribute in AttrValues!");
257	0	return It - T.AttributeValues.begin();
258	11	};
259	2.94k	for (unsigned I = 0; I < Spec.Sequence.size(); ++I) {
260	2.06k	for (const auto &KV : Spec.Sequence[I].Attributes) {
261	53	if (KV.first == "guard") {
262	42	R.Guarded = true;
263	42	} else if (KV.first == "recover") {
264	11	R.Recovery = LookupExtensionID(KV.second);
265	11	R.RecoveryIndex = I;
266	11	} else {
267	0	Diagnostics.push_back(
268	0	llvm::formatv("Unknown attribute '{0}'", KV.first).str());
269	0	}
270	53	}
271	2.06k	}
272	884	}
273
274		// Inlines all _opt symbols.
275		// For example, a rule E := id +_opt id, after elimination, we have two
276		// equivalent rules:
277		// 1) E := id + id
278		// 2) E := id id
279	1	std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
280	1	std::vector<RuleSpec> Results;
281	1	std::vector<RuleSpec::Element> Storage;
282	651	for (const auto &R : Input) {
283	651	eliminateOptionalTail(
284	884	R.Sequence, Storage, [&Results, &Storage, &R, this]() {
285	884	if (Storage.empty()) {
286	0	Diagnostics.push_back(
287	0	llvm::formatv("Rule '{0}' has a nullable RHS", R.toString()));
288	0	return;
289	0	}
290	884	Results.push_back({R.Target, Storage});
291	884	});
292	651	assert(Storage.empty());
293	651	}
294	1	return Results;
295	1	}
296		void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
297		std::vector<RuleSpec::Element> &Result,
298	2.49k	llvm::function_ref<void()> CB) {
299	2.49k	if (Elements.empty())
300	884	return CB();
301	1.60k	auto Front = Elements.front();
302	1.60k	if (!Front.Symbol.ends_with(OptSuffix)) {
303	1.37k	Result.push_back(std::move(Front));
304	1.37k	eliminateOptionalTail(Elements.drop_front(1), Result, CB);
305	1.37k	Result.pop_back();
306	1.37k	return;
307	1.37k	}
308		// Enumerate two options: skip the opt symbol, or inline the symbol.
309	233	eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
310	233	Front.Symbol = Front.Symbol.drop_back(OptSuffix.size()); // drop "_opt"
311	233	Result.push_back(std::move(Front));
312	233	eliminateOptionalTail(Elements.drop_front(1), Result, CB);
313	233	Result.pop_back();
314	233	}
315
316		// Diagnoses the grammar and emit warnings if any.
317	1	void diagnoseGrammar(const Grammar &G) {
318	1	const auto &T = G.table();
319	244	for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
320	243	auto Range = T.Nonterminals[SID].RuleRange;
321	243	if (Range.Start == Range.End)
322	0	Diagnostics.push_back(
323	0	llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID)));
324	243	llvm::StringRef NameRef = T.Nonterminals[SID].Name;
325	243	if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
326	0	Diagnostics.push_back(llvm::formatv(
327	0	"Token-like name {0} is used as a nonterminal", G.symbolName(SID)));
328	0	}
329	243	}
330	1	llvm::DenseSet<llvm::hash_code> VisitedRules;
331	885	for (RuleID RID = 0; RID < T.Rules.size(); ++RID) {
332	884	const auto &R = T.Rules[RID];
333	884	auto Code = llvm::hash_combine(
334	884	R.Target, llvm::hash_combine_range(R.seq().begin(), R.seq().end()));
335	884	auto [_, New] = VisitedRules.insert(Code);
336	884	if (!New)
337	0	Diagnostics.push_back(
338	0	llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID)));
339	884	}
340		// symbol-id -> used counts
341	1	std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
342	1	for (const Rule &R : T.Rules)
343	884	for (SymbolID SID : R.seq())
344	2.06k	if (isNonterminal(SID))
345	1.06k	++UseCounts[SID];
346	244	for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
347	243	if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
348	0	Diagnostics.push_back(
349	0	llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID)));
350	1	}
351		std::vector<std::string> &Diagnostics;
352		};
353		} // namespace
354
355		Grammar Grammar::parseBNF(llvm::StringRef BNF,
356	1	std::vector<std::string> &Diagnostics) {
357	1	Diagnostics.clear();
358	1	return GrammarBuilder(Diagnostics).build(BNF);
359	1	}
360
361		} // namespace pseudo
362		} // namespace clang