/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h

Source (jump to first uncovered line)
//===--- Grammar.h - grammar used by clang pseudoparser  ---------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  This file defines base structures for parsing & modeling a grammar for a
//  programming language:
//
//    # This is a fake C++ BNF grammar
//    _ := translation-unit
//    translation-unit := declaration-seq_opt
//    declaration-seq := declaration
//    declaration-seq := declaration-seq declaration
//
//  A grammar formally describes a language, and it is constructed by a set of
//  production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
//  nonterminal or terminal, identified by a SymbolID.
//
//  Annotations are supported in a syntax form of [key=value]. They specify
//  attributes which are associated with either a grammar symbol (on the
//  right-hand side of the symbol) or a grammar rule (at the end of the rule
//  body).
//  Attributes provide a way to inject custom code into the GLR parser. Each
//  unique attribute value creates an extension point (identified by ExtensionID
//  ), and an extension point corresponds to a piece of native code. For
//  example, C++ grammar has a rule:
//
//   compound_statement := { statement-seq [recover=Brackets] }
//
//  The `recover` attribute instructs the parser that we should perform error
//  recovery if parsing the statement-seq fails. The `Brackets` recovery
//  heuristic is implemented in CXX.cpp by binding the ExtensionID for the
//  `Recovery` value to a specific C++ function that finds the recovery point.
//
//  Notions about the BNF grammar:
//  - "_" is the start symbol of the augmented grammar;
//  - single-line comment is supported, starting with a #
//  - A rule describes how a nonterminal (left side of :=) is constructed, and
//    it is *per line* in the grammar file
//  - Terminals (also called tokens) correspond to the clang::TokenKind; they
//    are written in the grammar like "IDENTIFIER", "USING", "+"
//  - Nonterminals are specified with "lower-case" names in the grammar; they
//    shouldn't be nullable (has an empty sequence)
//  - optional symbols are supported (specified with a _opt suffix), and they
//    will be eliminated during the grammar parsing stage
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
#define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H

#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <optional>
#include <vector>

namespace clang {
namespace pseudo {
// A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar.
// nonterminal IDs are indexes into a table of nonterminal symbols.
// Terminal IDs correspond to the clang TokenKind enum.
using SymbolID = uint16_t;
// SymbolID is only 12 bits wide.
// There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals.
static constexpr uint16_t SymbolBits = 12;
static constexpr uint16_t NumTerminals = tok::NUM_TOKENS;
// SymbolIDs with the top bit set are tokens/terminals.
static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
// The terminals are always the clang tok::TokenKind (not all are used).
inline tok::TokenKind symbolToToken(SymbolID SID) {
  assert(isToken(SID));
  SID &= ~TokenFlag;
  assert(SID < NumTerminals);
  return static_cast<tok::TokenKind>(SID);
}
inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
  return TokenFlag | static_cast<SymbolID>(TK);
}

// An extension is a piece of native code specific to a grammar that modifies
// the behavior of annotated rules. One ExtensionID is assigned for each unique
// attribute value (all attributes share a namespace).
using ExtensionID = uint16_t;

// A RuleID uniquely identifies a production rule in a grammar.
// It is an index into a table of rules.
using RuleID = uint16_t;
// There are maximum 2^12 rules.
static constexpr unsigned RuleBits = 12;

// Represent a production rule in the grammar, e.g.
//   expression := a b c
//   ^Target       ^Sequence
struct Rule {
  Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);

  // We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens
  // long, however, we're stricter in order to reduce the size, we limit the max
  // length to 9 (this is the longest sequence in cxx grammar).
  static constexpr unsigned SizeBits = 4;
  static constexpr unsigned MaxElements = 9;
  static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
  static_assert(SizeBits + SymbolBits <= 16,
                "Must be able to store symbol ID + size efficiently");

  // 16 bits for target symbol and size of sequence:
  // SymbolID : 12 | Size : 4
  SymbolID Target : SymbolBits;
  uint8_t Size : SizeBits; // Size of the Sequence
  SymbolID Sequence[MaxElements];

  // A guarded rule has extra logic to determine whether the RHS is eligible.
  bool Guarded = false;

  // Specifies the index within Sequence eligible for error recovery.
  // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
  // should recover by finding the matching brace, and forcing stmt-seq to match
  // everything between braces.
  // For now, only a single strategy at a single point is possible.
  uint8_t RecoveryIndex = -1;
  ExtensionID Recovery = 0;

  llvm::ArrayRef<SymbolID> seq() const {
    return llvm::ArrayRef<SymbolID>(Sequence, Size);
  }
  friend bool operator==(const Rule &L, const Rule &R) {
    return L.Target == R.Target && L.seq() == R.seq() && L.Guarded == R.Guarded;
  }
};

struct GrammarTable;

// Grammar that describes a programming language, e.g. C++. It represents the
// contents of the specified grammar.
// It is a building block for constructing a table-based parser.
class Grammar {
public:
  Grammar() = default; // Creates an invalid dummy grammar.
  explicit Grammar(std::unique_ptr<GrammarTable>);

  // Parses grammar from a BNF file.
  // Diagnostics emitted during parsing are stored in Diags.
  static Grammar parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diags);

  // Returns the SymbolID of the symbol '_'.
  SymbolID underscore() const { return Underscore; };

  // Returns all rules of the given nonterminal symbol.
  llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
  const Rule &lookupRule(RuleID RID) const;

  // Gets symbol (terminal or nonterminal) name.
  // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator).
  llvm::StringRef symbolName(SymbolID) const;

  // Lookup the SymbolID of the nonterminal symbol by Name.
  std::optional<SymbolID> findNonterminal(llvm::StringRef Name) const;

  // Dumps the whole grammar.
  std::string dump() const;
  // Dumps a particular rule.
  std::string dumpRule(RuleID) const;
  // Dumps all rules of the given nonterminal symbol.
  std::string dumpRules(SymbolID) const;

  const GrammarTable &table() const { return *T; }

private:
  std::unique_ptr<GrammarTable> T;
  // The symbol ID of '_'. (In the LR literature, this is the start symbol of
  // the augmented grammar.)
  SymbolID Underscore;
};
// For each nonterminal X, computes the set of terminals that begin strings
// derived from X. (Known as FIRST sets in grammar-based parsers).
std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
// For each nonterminal X, computes the set of terminals that could immediately
// follow X. (Known as FOLLOW sets in grammar-based parsers).
std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);

// Storage for the underlying data of the Grammar.
// It can be constructed dynamically (from compiling BNF file) or statically
// (a compiled data-source).
struct GrammarTable {
  GrammarTable();

  struct Nonterminal {
    std::string Name;
    // Corresponding rules that construct the nonterminal, it is a [Start, End)
    // index range of the Rules table.
    struct {
      RuleID Start;
      RuleID End;
    } RuleRange;
  };

  // RuleID is an index into this table of rule definitions.
  //
  // Rules with the same target symbol (LHS) are grouped into a single range.
  // The relative order of different target symbols is *not* by SymbolID, but
  // rather a topological sort: if S := T then the rules producing T have lower
  // RuleIDs than rules producing S.
  // (This strange order simplifies the GLR parser: for a given token range, if
  // we reduce in increasing RuleID order then we need never backtrack --
  // prerequisite reductions are reached before dependent ones).
  std::vector<Rule> Rules;
  // A table of terminals (aka tokens). It corresponds to the clang::Token.
  // clang::tok::TokenKind is the index of the table.
  llvm::ArrayRef<std::string> Terminals;
  // A table of nonterminals, sorted by name.
  // SymbolID is the index of the table.
  std::vector<Nonterminal> Nonterminals;
  // A table of attribute values, sorted by name.
  // ExtensionID is the index of the table.
  std::vector<std::string> AttributeValues;
};

} // namespace pseudo
} // namespace clang

#endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H

Coverage Report

Created: 2024-01-17 10:31

Line	Count	Source (jump to first uncovered line)
1		//===--- Grammar.h - grammar used by clang pseudoparser ---------- C++--===//
2		//
3		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4		// See https://llvm.org/LICENSE.txt for license information.
5		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6		//
7		//===----------------------------------------------------------------------===//
8		//
9		// This file defines base structures for parsing & modeling a grammar for a
10		// programming language:
11		//
12		// # This is a fake C++ BNF grammar
13		// _ := translation-unit
14		// translation-unit := declaration-seq_opt
15		// declaration-seq := declaration
16		// declaration-seq := declaration-seq declaration
17		//
18		// A grammar formally describes a language, and it is constructed by a set of
19		// production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
20		// nonterminal or terminal, identified by a SymbolID.
21		//
22		// Annotations are supported in a syntax form of [key=value]. They specify
23		// attributes which are associated with either a grammar symbol (on the
24		// right-hand side of the symbol) or a grammar rule (at the end of the rule
25		// body).
26		// Attributes provide a way to inject custom code into the GLR parser. Each
27		// unique attribute value creates an extension point (identified by ExtensionID
28		// ), and an extension point corresponds to a piece of native code. For
29		// example, C++ grammar has a rule:
30		//
31		// compound_statement := { statement-seq [recover=Brackets] }
32		//
33		// The `recover` attribute instructs the parser that we should perform error
34		// recovery if parsing the statement-seq fails. The `Brackets` recovery
35		// heuristic is implemented in CXX.cpp by binding the ExtensionID for the
36		// `Recovery` value to a specific C++ function that finds the recovery point.
37		//
38		// Notions about the BNF grammar:
39		// - "_" is the start symbol of the augmented grammar;
40		// - single-line comment is supported, starting with a #
41		// - A rule describes how a nonterminal (left side of :=) is constructed, and
42		// it is per line in the grammar file
43		// - Terminals (also called tokens) correspond to the clang::TokenKind; they
44		// are written in the grammar like "IDENTIFIER", "USING", "+"
45		// - Nonterminals are specified with "lower-case" names in the grammar; they
46		// shouldn't be nullable (has an empty sequence)
47		// - optional symbols are supported (specified with a _opt suffix), and they
48		// will be eliminated during the grammar parsing stage
49		//
50		//===----------------------------------------------------------------------===//
51
52		#ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
53		#define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
54
55		#include "clang/Basic/TokenKinds.h"
56		#include "llvm/ADT/ArrayRef.h"
57		#include "llvm/ADT/DenseSet.h"
58		#include "llvm/ADT/StringRef.h"
59		#include "llvm/Support/raw_ostream.h"
60		#include <cstdint>
61		#include <optional>
62		#include <vector>
63
64		namespace clang {
65		namespace pseudo {
66		// A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar.
67		// nonterminal IDs are indexes into a table of nonterminal symbols.
68		// Terminal IDs correspond to the clang TokenKind enum.
69		using SymbolID = uint16_t;
70		// SymbolID is only 12 bits wide.
71		// There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals.
72		static constexpr uint16_t SymbolBits = 12;
73		static constexpr uint16_t NumTerminals = tok::NUM_TOKENS;
74		// SymbolIDs with the top bit set are tokens/terminals.
75		static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
76	529M	inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
77	340M	inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
78		// The terminals are always the clang tok::TokenKind (not all are used).
79	122M	inline tok::TokenKind symbolToToken(SymbolID SID) {
80	122M	assert(isToken(SID));
81	0	SID &= ~TokenFlag;
82	122M	assert(SID < NumTerminals);
83	0	return static_cast<tok::TokenKind>(SID);
84	122M	}
85	121M	inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
86	121M	return TokenFlag \| static_cast<SymbolID>(TK);
87	121M	}
88
89		// An extension is a piece of native code specific to a grammar that modifies
90		// the behavior of annotated rules. One ExtensionID is assigned for each unique
91		// attribute value (all attributes share a namespace).
92		using ExtensionID = uint16_t;
93
94		// A RuleID uniquely identifies a production rule in a grammar.
95		// It is an index into a table of rules.
96		using RuleID = uint16_t;
97		// There are maximum 2^12 rules.
98		static constexpr unsigned RuleBits = 12;
99
100		// Represent a production rule in the grammar, e.g.
101		// expression := a b c
102		// ^Target ^Sequence
103		struct Rule {
104		Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
105
106		// We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens
107		// long, however, we're stricter in order to reduce the size, we limit the max
108		// length to 9 (this is the longest sequence in cxx grammar).
109		static constexpr unsigned SizeBits = 4;
110		static constexpr unsigned MaxElements = 9;
111		static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
112		static_assert(SizeBits + SymbolBits <= 16,
113		"Must be able to store symbol ID + size efficiently");
114
115		// 16 bits for target symbol and size of sequence:
116		// SymbolID : 12 \| Size : 4
117		SymbolID Target : SymbolBits;
118		uint8_t Size : SizeBits; // Size of the Sequence
119		SymbolID Sequence[MaxElements];
120
121		// A guarded rule has extra logic to determine whether the RHS is eligible.
122		bool Guarded = false;
123
124		// Specifies the index within Sequence eligible for error recovery.
125		// Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
126		// should recover by finding the matching brace, and forcing stmt-seq to match
127		// everything between braces.
128		// For now, only a single strategy at a single point is possible.
129		uint8_t RecoveryIndex = -1;
130		ExtensionID Recovery = 0;
131
132	241k	llvm::ArrayRef<SymbolID> seq() const {
133	241k	return llvm::ArrayRef<SymbolID>(Sequence, Size);
134	241k	}
135	0	friend bool operator==(const Rule &L, const Rule &R) {
136	0	return L.Target == R.Target && L.seq() == R.seq() && L.Guarded == R.Guarded;
137	0	}
138		};
139
140		struct GrammarTable;
141
142		// Grammar that describes a programming language, e.g. C++. It represents the
143		// contents of the specified grammar.
144		// It is a building block for constructing a table-based parser.
145		class Grammar {
146		public:
147		Grammar() = default; // Creates an invalid dummy grammar.
148		explicit Grammar(std::unique_ptr<GrammarTable>);
149
150		// Parses grammar from a BNF file.
151		// Diagnostics emitted during parsing are stored in Diags.
152		static Grammar parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diags);
153
154		// Returns the SymbolID of the symbol '_'.
155	52.2k	SymbolID underscore() const { return Underscore; };
156
157		// Returns all rules of the given nonterminal symbol.
158		llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
159		const Rule &lookupRule(RuleID RID) const;
160
161		// Gets symbol (terminal or nonterminal) name.
162		// Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator).
163		llvm::StringRef symbolName(SymbolID) const;
164
165		// Lookup the SymbolID of the nonterminal symbol by Name.
166		std::optional<SymbolID> findNonterminal(llvm::StringRef Name) const;
167
168		// Dumps the whole grammar.
169		std::string dump() const;
170		// Dumps a particular rule.
171		std::string dumpRule(RuleID) const;
172		// Dumps all rules of the given nonterminal symbol.
173		std::string dumpRules(SymbolID) const;
174
175	27.0k	const GrammarTable &table() const { return *T; }
176
177		private:
178		std::unique_ptr<GrammarTable> T;
179		// The symbol ID of '_'. (In the LR literature, this is the start symbol of
180		// the augmented grammar.)
181		SymbolID Underscore;
182		};
183		// For each nonterminal X, computes the set of terminals that begin strings
184		// derived from X. (Known as FIRST sets in grammar-based parsers).
185		std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
186		// For each nonterminal X, computes the set of terminals that could immediately
187		// follow X. (Known as FOLLOW sets in grammar-based parsers).
188		std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
189
190		// Storage for the underlying data of the Grammar.
191		// It can be constructed dynamically (from compiling BNF file) or statically
192		// (a compiled data-source).
193		struct GrammarTable {
194		GrammarTable();
195
196		struct Nonterminal {
197		std::string Name;
198		// Corresponding rules that construct the nonterminal, it is a [Start, End)
199		// index range of the Rules table.
200		struct {
201		RuleID Start;
202		RuleID End;
203		} RuleRange;
204		};
205
206		// RuleID is an index into this table of rule definitions.
207		//
208		// Rules with the same target symbol (LHS) are grouped into a single range.
209		// The relative order of different target symbols is not by SymbolID, but
210		// rather a topological sort: if S := T then the rules producing T have lower
211		// RuleIDs than rules producing S.
212		// (This strange order simplifies the GLR parser: for a given token range, if
213		// we reduce in increasing RuleID order then we need never backtrack --
214		// prerequisite reductions are reached before dependent ones).
215		std::vector<Rule> Rules;
216		// A table of terminals (aka tokens). It corresponds to the clang::Token.
217		// clang::tok::TokenKind is the index of the table.
218		llvm::ArrayRef<std::string> Terminals;
219		// A table of nonterminals, sorted by name.
220		// SymbolID is the index of the table.
221		std::vector<Nonterminal> Nonterminals;
222		// A table of attribute values, sorted by name.
223		// ExtensionID is the index of the table.
224		std::vector<std::string> AttributeValues;
225		};
226
227		} // namespace pseudo
228		} // namespace clang
229
230		#endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H