/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- Grammar.h - grammar used by clang pseudoparser ---------*- C++-*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file defines base structures for parsing & modeling a grammar for a |
10 | | // programming language: |
11 | | // |
12 | | // # This is a fake C++ BNF grammar |
13 | | // _ := translation-unit |
14 | | // translation-unit := declaration-seq_opt |
15 | | // declaration-seq := declaration |
16 | | // declaration-seq := declaration-seq declaration |
17 | | // |
18 | | // A grammar formally describes a language, and it is constructed by a set of |
19 | | // production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either |
20 | | // nonterminal or terminal, identified by a SymbolID. |
21 | | // |
22 | | // Annotations are supported in a syntax form of [key=value]. They specify |
23 | | // attributes which are associated with either a grammar symbol (on the |
24 | | // right-hand side of the symbol) or a grammar rule (at the end of the rule |
25 | | // body). |
26 | | // Attributes provide a way to inject custom code into the GLR parser. Each |
27 | | // unique attribute value creates an extension point (identified by ExtensionID |
28 | | // ), and an extension point corresponds to a piece of native code. For |
29 | | // example, C++ grammar has a rule: |
30 | | // |
31 | | // compound_statement := { statement-seq [recover=Brackets] } |
32 | | // |
33 | | // The `recover` attribute instructs the parser that we should perform error |
34 | | // recovery if parsing the statement-seq fails. The `Brackets` recovery |
35 | | // heuristic is implemented in CXX.cpp by binding the ExtensionID for the |
36 | | // `Recovery` value to a specific C++ function that finds the recovery point. |
37 | | // |
38 | | // Notions about the BNF grammar: |
39 | | // - "_" is the start symbol of the augmented grammar; |
40 | | // - single-line comment is supported, starting with a # |
41 | | // - A rule describes how a nonterminal (left side of :=) is constructed, and |
42 | | // it is *per line* in the grammar file |
43 | | // - Terminals (also called tokens) correspond to the clang::TokenKind; they |
44 | | // are written in the grammar like "IDENTIFIER", "USING", "+" |
45 | | // - Nonterminals are specified with "lower-case" names in the grammar; they |
46 | | // shouldn't be nullable (has an empty sequence) |
47 | | // - optional symbols are supported (specified with a _opt suffix), and they |
48 | | // will be eliminated during the grammar parsing stage |
49 | | // |
50 | | //===----------------------------------------------------------------------===// |
51 | | |
52 | | #ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H |
53 | | #define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H |
54 | | |
55 | | #include "clang/Basic/TokenKinds.h" |
56 | | #include "llvm/ADT/ArrayRef.h" |
57 | | #include "llvm/ADT/DenseSet.h" |
58 | | #include "llvm/ADT/StringRef.h" |
59 | | #include "llvm/Support/raw_ostream.h" |
60 | | #include <cstdint> |
61 | | #include <optional> |
62 | | #include <vector> |
63 | | |
64 | | namespace clang { |
65 | | namespace pseudo { |
66 | | // A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar. |
67 | | // nonterminal IDs are indexes into a table of nonterminal symbols. |
68 | | // Terminal IDs correspond to the clang TokenKind enum. |
69 | | using SymbolID = uint16_t; |
70 | | // SymbolID is only 12 bits wide. |
71 | | // There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals. |
72 | | static constexpr uint16_t SymbolBits = 12; |
73 | | static constexpr uint16_t NumTerminals = tok::NUM_TOKENS; |
74 | | // SymbolIDs with the top bit set are tokens/terminals. |
75 | | static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1); |
76 | 529M | inline bool isToken(SymbolID ID) { return ID & TokenFlag; } |
77 | 340M | inline bool isNonterminal(SymbolID ID) { return !isToken(ID); } |
78 | | // The terminals are always the clang tok::TokenKind (not all are used). |
79 | 122M | inline tok::TokenKind symbolToToken(SymbolID SID) { |
80 | 122M | assert(isToken(SID)); |
81 | 0 | SID &= ~TokenFlag; |
82 | 122M | assert(SID < NumTerminals); |
83 | 0 | return static_cast<tok::TokenKind>(SID); |
84 | 122M | } |
85 | 121M | inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) { |
86 | 121M | return TokenFlag | static_cast<SymbolID>(TK); |
87 | 121M | } |
88 | | |
89 | | // An extension is a piece of native code specific to a grammar that modifies |
90 | | // the behavior of annotated rules. One ExtensionID is assigned for each unique |
91 | | // attribute value (all attributes share a namespace). |
92 | | using ExtensionID = uint16_t; |
93 | | |
94 | | // A RuleID uniquely identifies a production rule in a grammar. |
95 | | // It is an index into a table of rules. |
96 | | using RuleID = uint16_t; |
97 | | // There are maximum 2^12 rules. |
98 | | static constexpr unsigned RuleBits = 12; |
99 | | |
100 | | // Represent a production rule in the grammar, e.g. |
101 | | // expression := a b c |
102 | | // ^Target ^Sequence |
103 | | struct Rule { |
104 | | Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq); |
105 | | |
106 | | // We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens |
107 | | // long, however, we're stricter in order to reduce the size, we limit the max |
108 | | // length to 9 (this is the longest sequence in cxx grammar). |
109 | | static constexpr unsigned SizeBits = 4; |
110 | | static constexpr unsigned MaxElements = 9; |
111 | | static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit"); |
112 | | static_assert(SizeBits + SymbolBits <= 16, |
113 | | "Must be able to store symbol ID + size efficiently"); |
114 | | |
115 | | // 16 bits for target symbol and size of sequence: |
116 | | // SymbolID : 12 | Size : 4 |
117 | | SymbolID Target : SymbolBits; |
118 | | uint8_t Size : SizeBits; // Size of the Sequence |
119 | | SymbolID Sequence[MaxElements]; |
120 | | |
121 | | // A guarded rule has extra logic to determine whether the RHS is eligible. |
122 | | bool Guarded = false; |
123 | | |
124 | | // Specifies the index within Sequence eligible for error recovery. |
125 | | // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we |
126 | | // should recover by finding the matching brace, and forcing stmt-seq to match |
127 | | // everything between braces. |
128 | | // For now, only a single strategy at a single point is possible. |
129 | | uint8_t RecoveryIndex = -1; |
130 | | ExtensionID Recovery = 0; |
131 | | |
132 | 241k | llvm::ArrayRef<SymbolID> seq() const { |
133 | 241k | return llvm::ArrayRef<SymbolID>(Sequence, Size); |
134 | 241k | } |
135 | 0 | friend bool operator==(const Rule &L, const Rule &R) { |
136 | 0 | return L.Target == R.Target && L.seq() == R.seq() && L.Guarded == R.Guarded; |
137 | 0 | } |
138 | | }; |
139 | | |
140 | | struct GrammarTable; |
141 | | |
142 | | // Grammar that describes a programming language, e.g. C++. It represents the |
143 | | // contents of the specified grammar. |
144 | | // It is a building block for constructing a table-based parser. |
145 | | class Grammar { |
146 | | public: |
147 | | Grammar() = default; // Creates an invalid dummy grammar. |
148 | | explicit Grammar(std::unique_ptr<GrammarTable>); |
149 | | |
150 | | // Parses grammar from a BNF file. |
151 | | // Diagnostics emitted during parsing are stored in Diags. |
152 | | static Grammar parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diags); |
153 | | |
154 | | // Returns the SymbolID of the symbol '_'. |
155 | 52.2k | SymbolID underscore() const { return Underscore; }; |
156 | | |
157 | | // Returns all rules of the given nonterminal symbol. |
158 | | llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const; |
159 | | const Rule &lookupRule(RuleID RID) const; |
160 | | |
161 | | // Gets symbol (terminal or nonterminal) name. |
162 | | // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator). |
163 | | llvm::StringRef symbolName(SymbolID) const; |
164 | | |
165 | | // Lookup the SymbolID of the nonterminal symbol by Name. |
166 | | std::optional<SymbolID> findNonterminal(llvm::StringRef Name) const; |
167 | | |
168 | | // Dumps the whole grammar. |
169 | | std::string dump() const; |
170 | | // Dumps a particular rule. |
171 | | std::string dumpRule(RuleID) const; |
172 | | // Dumps all rules of the given nonterminal symbol. |
173 | | std::string dumpRules(SymbolID) const; |
174 | | |
175 | 27.0k | const GrammarTable &table() const { return *T; } |
176 | | |
177 | | private: |
178 | | std::unique_ptr<GrammarTable> T; |
179 | | // The symbol ID of '_'. (In the LR literature, this is the start symbol of |
180 | | // the augmented grammar.) |
181 | | SymbolID Underscore; |
182 | | }; |
183 | | // For each nonterminal X, computes the set of terminals that begin strings |
184 | | // derived from X. (Known as FIRST sets in grammar-based parsers). |
185 | | std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &); |
186 | | // For each nonterminal X, computes the set of terminals that could immediately |
187 | | // follow X. (Known as FOLLOW sets in grammar-based parsers). |
188 | | std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &); |
189 | | |
190 | | // Storage for the underlying data of the Grammar. |
191 | | // It can be constructed dynamically (from compiling BNF file) or statically |
192 | | // (a compiled data-source). |
193 | | struct GrammarTable { |
194 | | GrammarTable(); |
195 | | |
196 | | struct Nonterminal { |
197 | | std::string Name; |
198 | | // Corresponding rules that construct the nonterminal, it is a [Start, End) |
199 | | // index range of the Rules table. |
200 | | struct { |
201 | | RuleID Start; |
202 | | RuleID End; |
203 | | } RuleRange; |
204 | | }; |
205 | | |
206 | | // RuleID is an index into this table of rule definitions. |
207 | | // |
208 | | // Rules with the same target symbol (LHS) are grouped into a single range. |
209 | | // The relative order of different target symbols is *not* by SymbolID, but |
210 | | // rather a topological sort: if S := T then the rules producing T have lower |
211 | | // RuleIDs than rules producing S. |
212 | | // (This strange order simplifies the GLR parser: for a given token range, if |
213 | | // we reduce in increasing RuleID order then we need never backtrack -- |
214 | | // prerequisite reductions are reached before dependent ones). |
215 | | std::vector<Rule> Rules; |
216 | | // A table of terminals (aka tokens). It corresponds to the clang::Token. |
217 | | // clang::tok::TokenKind is the index of the table. |
218 | | llvm::ArrayRef<std::string> Terminals; |
219 | | // A table of nonterminals, sorted by name. |
220 | | // SymbolID is the index of the table. |
221 | | std::vector<Nonterminal> Nonterminals; |
222 | | // A table of attribute values, sorted by name. |
223 | | // ExtensionID is the index of the table. |
224 | | std::vector<std::string> AttributeValues; |
225 | | }; |
226 | | |
227 | | } // namespace pseudo |
228 | | } // namespace clang |
229 | | |
230 | | #endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H |