/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // A parse forest represents a set of possible parse trees efficiently, it is |
10 | | // produced by the GLR parser. |
11 | | // |
12 | | // Despite the name, its data structure is a tree-like DAG with a single root. |
13 | | // Multiple ways to parse the same tokens are presented as an ambiguous node |
14 | | // with all possible interpretations as children. |
15 | | // Common sub-parses are shared: if two interpretations both parse "1 + 1" as |
16 | | // "expr := expr + expr", they will share a Sequence node representing the expr. |
17 | | // |
18 | | //===----------------------------------------------------------------------===// |
19 | | |
20 | | #ifndef CLANG_PSEUDO_FOREST_H |
21 | | #define CLANG_PSEUDO_FOREST_H |
22 | | |
23 | | #include "clang-pseudo/Token.h" |
24 | | #include "clang-pseudo/grammar/Grammar.h" |
25 | | #include "llvm/ADT/ArrayRef.h" |
26 | | #include "llvm/ADT/STLExtras.h" |
27 | | #include "llvm/Support/Allocator.h" |
28 | | #include <cstdint> |
29 | | |
30 | | namespace clang { |
31 | | namespace pseudo { |
32 | | |
33 | | // A node represents ways to parse a sequence of tokens, it interprets a fixed |
34 | | // range of tokens as a fixed grammar symbol. |
35 | | // |
36 | | // There are different kinds of nodes, some nodes have "children" (stored in a |
37 | | // trailing array) and have pointers to them. "Children" has different semantics |
38 | | // depending on the node kinds. For an Ambiguous node, it means all |
39 | | // possible interpretations; for a Sequence node, it means each symbol on the |
40 | | // right hand side of the production rule. |
41 | | // |
42 | | // Since this is a node in a DAG, a node may have multiple parents. And a node |
43 | | // doesn't have parent pointers. |
44 | | class alignas(class ForestNode *) ForestNode { |
45 | | public: |
46 | | class RecursiveIterator; |
47 | | enum Kind { |
48 | | // A Terminal node is a single terminal symbol bound to a token. |
49 | | Terminal, |
50 | | // A Sequence node is a nonterminal symbol parsed from a grammar rule, |
51 | | // elements() are the parses of each symbol on the RHS of the rule. |
52 | | // If the rule is A := X Y Z, the node is for nonterminal A, and elements() |
53 | | // are [X, Y, Z]. |
54 | | Sequence, |
55 | | // An Ambiguous node exposes multiple ways to interpret the code as the |
56 | | // same symbol, alternatives() are all possible parses. |
57 | | Ambiguous, |
58 | | // An Opaque node is a placeholder. It asserts that tokens match a symbol, |
59 | | // without saying how. |
60 | | // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid |
61 | | // code). |
62 | | Opaque, |
63 | | }; |
64 | 389M | Kind kind() const { return K; } |
65 | | |
66 | 678M | SymbolID symbol() const { return Symbol; } |
67 | | |
68 | | // The start of the token range, it is a poistion within a token stream. |
69 | 217M | Token::Index startTokenIndex() const { return StartIndex; } |
70 | | |
71 | | // Returns the corresponding grammar rule. |
72 | | // REQUIRES: this is a Sequence node. |
73 | 83.3M | RuleID rule() const { |
74 | 83.3M | assert(kind() == Sequence); |
75 | 0 | return Data & ((1 << RuleBits) - 1); |
76 | 83.3M | } |
77 | | // Returns the parses of each element on the RHS of the rule. |
78 | | // REQUIRES: this is a Sequence node; |
79 | 18.0M | llvm::ArrayRef<const ForestNode *> elements() const { |
80 | 18.0M | assert(kind() == Sequence); |
81 | 0 | return children(Data >> RuleBits); |
82 | 18.0M | } |
83 | 0 | llvm::MutableArrayRef<ForestNode *> elements() { |
84 | 0 | assert(kind() == Sequence); |
85 | 0 | return children(Data >> RuleBits); |
86 | 0 | } |
87 | | |
88 | | // Returns all possible interpretations of the code. |
89 | | // REQUIRES: this is an Ambiguous node. |
90 | 6.22M | llvm::ArrayRef<const ForestNode *> alternatives() const { |
91 | 6.22M | assert(kind() == Ambiguous); |
92 | 0 | return children(Data); |
93 | 6.22M | } |
94 | 0 | llvm::MutableArrayRef<ForestNode *> alternatives() { |
95 | 0 | assert(kind() == Ambiguous); |
96 | 0 | return children(Data); |
97 | 0 | } |
98 | | |
99 | 17.4M | llvm::ArrayRef<const ForestNode *> children() const { |
100 | 17.4M | switch (kind()) { |
101 | 17.4M | case Sequence: |
102 | 17.4M | return elements(); |
103 | 0 | case Ambiguous: |
104 | 0 | return alternatives(); |
105 | 0 | case Terminal: |
106 | 0 | case Opaque: |
107 | 0 | return {}; |
108 | 17.4M | } |
109 | 0 | llvm_unreachable("Bad kind"); |
110 | 0 | } |
111 | | |
112 | | // Iteration over all nodes in the forest, including this. |
113 | | llvm::iterator_range<RecursiveIterator> descendants() const; |
114 | | |
115 | | std::string dump(const Grammar &) const; |
116 | | std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const; |
117 | | |
118 | | private: |
119 | | friend class ForestArena; |
120 | | |
121 | | ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data) |
122 | 143M | : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {} |
123 | | |
124 | | ForestNode(const ForestNode &) = delete; |
125 | | ForestNode &operator=(const ForestNode &) = delete; |
126 | | ForestNode(ForestNode &&) = delete; |
127 | | ForestNode &operator=(ForestNode &&) = delete; |
128 | | |
129 | | static uint16_t sequenceData(RuleID Rule, |
130 | 88.8M | llvm::ArrayRef<const ForestNode *> Elements) { |
131 | 88.8M | assert(Rule < (1 << RuleBits)); |
132 | 0 | assert(Elements.size() < (1 << (16 - RuleBits))); |
133 | 0 | return Rule | Elements.size() << RuleBits; |
134 | 88.8M | } |
135 | | static uint16_t |
136 | 1.49M | ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) { |
137 | 1.49M | return Alternatives.size(); |
138 | 1.49M | } |
139 | | |
140 | | // Retrieves the trailing array. |
141 | 24.2M | llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const { |
142 | 24.2M | return llvm::ArrayRef(reinterpret_cast<ForestNode *const *>(this + 1), Num); |
143 | 24.2M | } |
144 | 0 | llvm::MutableArrayRef<ForestNode *> children(uint16_t Num) { |
145 | 0 | return llvm::MutableArrayRef(reinterpret_cast<ForestNode **>(this + 1), |
146 | 0 | Num); |
147 | 0 | } |
148 | | |
149 | | Token::Index StartIndex; |
150 | | Kind K : 4; |
151 | | SymbolID Symbol : SymbolBits; |
152 | | // Sequence - child count : 4 | RuleID : RuleBits (12) |
153 | | // Ambiguous - child count : 16 |
154 | | // Terminal, Opaque - unused |
155 | | uint16_t Data; |
156 | | // An array of ForestNode* following the object. |
157 | | }; |
158 | | // ForestNode may not be destroyed (for BumpPtrAllocator). |
159 | | static_assert(std::is_trivially_destructible<ForestNode>()); |
160 | | |
161 | | // A memory arena for the parse forest. |
162 | | class ForestArena { |
163 | | public: |
164 | | llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code); |
165 | | ForestNode &createSequence(SymbolID SID, RuleID RID, |
166 | 88.8M | llvm::ArrayRef<const ForestNode *> Elements) { |
167 | 88.8M | assert(!Elements.empty()); |
168 | 0 | return create(ForestNode::Sequence, SID, |
169 | 88.8M | Elements.front()->startTokenIndex(), |
170 | 88.8M | ForestNode::sequenceData(RID, Elements), Elements); |
171 | 88.8M | } |
172 | | ForestNode &createAmbiguous(SymbolID SID, |
173 | 1.49M | llvm::ArrayRef<const ForestNode *> Alternatives) { |
174 | 1.49M | assert(!Alternatives.empty()); |
175 | 0 | assert(llvm::all_of(Alternatives, |
176 | 1.49M | [SID](const ForestNode *Alternative) { |
177 | 1.49M | return SID == Alternative->symbol(); |
178 | 1.49M | }) && |
179 | 1.49M | "Ambiguous alternatives must represent the same symbol!"); |
180 | 0 | return create(ForestNode::Ambiguous, SID, |
181 | 1.49M | Alternatives.front()->startTokenIndex(), |
182 | 1.49M | ForestNode::ambiguousData(Alternatives), Alternatives); |
183 | 1.49M | } |
184 | 9.53k | ForestNode &createOpaque(SymbolID SID, Token::Index Start) { |
185 | 9.53k | return create(ForestNode::Opaque, SID, Start, 0, {}); |
186 | 9.53k | } |
187 | | |
188 | 0 | ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) { |
189 | 0 | return create(ForestNode::Terminal, tokenSymbol(TK), Start, 0, {}); |
190 | 0 | } |
191 | | |
192 | 0 | size_t nodeCount() const { return NodeCount; } |
193 | 0 | size_t bytes() const { return Arena.getBytesAllocated() + sizeof(*this); } |
194 | | |
195 | | private: |
196 | | ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start, |
197 | | uint16_t Data, |
198 | 90.3M | llvm::ArrayRef<const ForestNode *> Elements) { |
199 | 90.3M | ++NodeCount; |
200 | 90.3M | ForestNode *New = new (Arena.Allocate( |
201 | 90.3M | sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *), |
202 | 90.3M | alignof(ForestNode))) ForestNode(K, SID, Start, Data); |
203 | 90.3M | if (!Elements.empty()) |
204 | 90.3M | llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1)); |
205 | 90.3M | return *New; |
206 | 90.3M | } |
207 | | |
208 | | llvm::BumpPtrAllocator Arena; |
209 | | uint32_t NodeCount = 0; |
210 | | }; |
211 | | |
212 | | class ForestNode::RecursiveIterator |
213 | | : public llvm::iterator_facade_base<ForestNode::RecursiveIterator, |
214 | | std::input_iterator_tag, |
215 | | const ForestNode> { |
216 | | llvm::DenseSet<const ForestNode *> Seen; |
217 | | struct StackFrame { |
218 | | const ForestNode *Parent; |
219 | | unsigned ChildIndex; |
220 | | }; |
221 | | std::vector<StackFrame> Stack; |
222 | | const ForestNode *Cur; |
223 | | |
224 | | public: |
225 | 0 | RecursiveIterator(const ForestNode *N = nullptr) : Cur(N) {} |
226 | | |
227 | 0 | const ForestNode &operator*() const { return *Cur; } |
228 | | void operator++(); |
229 | 0 | bool operator==(const RecursiveIterator &I) const { return Cur == I.Cur; } |
230 | 0 | bool operator!=(const RecursiveIterator &I) const { return !(*this == I); } |
231 | | }; |
232 | | |
233 | | } // namespace pseudo |
234 | | } // namespace clang |
235 | | |
236 | | #endif // CLANG_PSEUDO_FOREST_H |