Coverage Report

Created: 2024-01-17 10:31

/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
Line
Count
Source (jump to first uncovered line)
1
//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// A parse forest represents a set of possible parse trees efficiently, it is
10
// produced by the GLR parser.
11
//
12
// Despite the name, its data structure is a tree-like DAG with a single root.
13
// Multiple ways to parse the same tokens are presented as an ambiguous node
14
// with all possible interpretations as children.
15
// Common sub-parses are shared: if two interpretations both parse "1 + 1" as
16
// "expr := expr + expr", they will share a Sequence node representing the expr.
17
//
18
//===----------------------------------------------------------------------===//
19
20
#ifndef CLANG_PSEUDO_FOREST_H
21
#define CLANG_PSEUDO_FOREST_H
22
23
#include "clang-pseudo/Token.h"
24
#include "clang-pseudo/grammar/Grammar.h"
25
#include "llvm/ADT/ArrayRef.h"
26
#include "llvm/ADT/STLExtras.h"
27
#include "llvm/Support/Allocator.h"
28
#include <cstdint>
29
30
namespace clang {
31
namespace pseudo {
32
33
// A node represents ways to parse a sequence of tokens, it interprets a fixed
34
// range of tokens as a fixed grammar symbol.
35
//
36
// There are different kinds of nodes, some nodes have "children" (stored in a
37
// trailing array) and have pointers to them. "Children" has different semantics
38
// depending on the node kinds. For an Ambiguous node, it means all
39
// possible interpretations; for a Sequence node, it means each symbol on the
40
// right hand side of the production rule.
41
//
42
// Since this is a node in a DAG, a node may have multiple parents. And a node
43
// doesn't have parent pointers.
44
class alignas(class ForestNode *) ForestNode {
45
public:
46
  class RecursiveIterator;
47
  enum Kind {
48
    // A Terminal node is a single terminal symbol bound to a token.
49
    Terminal,
50
    // A Sequence node is a nonterminal symbol parsed from a grammar rule,
51
    // elements() are the parses of each symbol on the RHS of the rule.
52
    // If the rule is A := X Y Z, the node is for nonterminal A, and elements()
53
    // are [X, Y, Z].
54
    Sequence,
55
    // An Ambiguous node exposes multiple ways to interpret the code as the
56
    // same symbol, alternatives() are all possible parses.
57
    Ambiguous,
58
    // An Opaque node is a placeholder. It asserts that tokens match a symbol,
59
    // without saying how.
60
    // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid
61
    // code).
62
    Opaque,
63
  };
64
389M
  Kind kind() const { return K; }
65
66
678M
  SymbolID symbol() const { return Symbol; }
67
68
  // The start of the token range, it is a poistion within a token stream.
69
217M
  Token::Index startTokenIndex() const { return StartIndex; }
70
71
  // Returns the corresponding grammar rule.
72
  // REQUIRES: this is a Sequence node.
73
83.3M
  RuleID rule() const {
74
83.3M
    assert(kind() == Sequence);
75
0
    return Data & ((1 << RuleBits) - 1);
76
83.3M
  }
77
  // Returns the parses of each element on the RHS of the rule.
78
  // REQUIRES: this is a Sequence node;
79
18.0M
  llvm::ArrayRef<const ForestNode *> elements() const {
80
18.0M
    assert(kind() == Sequence);
81
0
    return children(Data >> RuleBits);
82
18.0M
  }
83
0
  llvm::MutableArrayRef<ForestNode *> elements() {
84
0
    assert(kind() == Sequence);
85
0
    return children(Data >> RuleBits);
86
0
  }
87
88
  // Returns all possible interpretations of the code.
89
  // REQUIRES: this is an Ambiguous node.
90
6.22M
  llvm::ArrayRef<const ForestNode *> alternatives() const {
91
6.22M
    assert(kind() == Ambiguous);
92
0
    return children(Data);
93
6.22M
  }
94
0
  llvm::MutableArrayRef<ForestNode *> alternatives() {
95
0
    assert(kind() == Ambiguous);
96
0
    return children(Data);
97
0
  }
98
99
17.4M
  llvm::ArrayRef<const ForestNode *> children() const {
100
17.4M
    switch (kind()) {
101
17.4M
    case Sequence:
102
17.4M
      return elements();
103
0
    case Ambiguous:
104
0
      return alternatives();
105
0
    case Terminal:
106
0
    case Opaque:
107
0
      return {};
108
17.4M
    }
109
0
    llvm_unreachable("Bad kind");
110
0
  }
111
112
  // Iteration over all nodes in the forest, including this.
113
  llvm::iterator_range<RecursiveIterator> descendants() const;
114
115
  std::string dump(const Grammar &) const;
116
  std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;
117
118
private:
119
  friend class ForestArena;
120
121
  ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data)
122
143M
      : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {}
123
124
  ForestNode(const ForestNode &) = delete;
125
  ForestNode &operator=(const ForestNode &) = delete;
126
  ForestNode(ForestNode &&) = delete;
127
  ForestNode &operator=(ForestNode &&) = delete;
128
129
  static uint16_t sequenceData(RuleID Rule,
130
88.8M
                               llvm::ArrayRef<const ForestNode *> Elements) {
131
88.8M
    assert(Rule < (1 << RuleBits));
132
0
    assert(Elements.size() < (1 << (16 - RuleBits)));
133
0
    return Rule | Elements.size() << RuleBits;
134
88.8M
  }
135
  static uint16_t
136
1.49M
  ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) {
137
1.49M
    return Alternatives.size();
138
1.49M
  }
139
140
  // Retrieves the trailing array.
141
24.2M
  llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
142
24.2M
    return llvm::ArrayRef(reinterpret_cast<ForestNode *const *>(this + 1), Num);
143
24.2M
  }
144
0
  llvm::MutableArrayRef<ForestNode *> children(uint16_t Num) {
145
0
    return llvm::MutableArrayRef(reinterpret_cast<ForestNode **>(this + 1),
146
0
                                 Num);
147
0
  }
148
149
  Token::Index StartIndex;
150
  Kind K : 4;
151
  SymbolID Symbol : SymbolBits;
152
  // Sequence - child count : 4 | RuleID : RuleBits (12)
153
  // Ambiguous - child count : 16
154
  // Terminal, Opaque - unused
155
  uint16_t Data;
156
  // An array of ForestNode* following the object.
157
};
158
// ForestNode may not be destroyed (for BumpPtrAllocator).
159
static_assert(std::is_trivially_destructible<ForestNode>());
160
161
// A memory arena for the parse forest.
162
class ForestArena {
163
public:
164
  llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code);
165
  ForestNode &createSequence(SymbolID SID, RuleID RID,
166
88.8M
                             llvm::ArrayRef<const ForestNode *> Elements) {
167
88.8M
    assert(!Elements.empty());
168
0
    return create(ForestNode::Sequence, SID,
169
88.8M
                  Elements.front()->startTokenIndex(),
170
88.8M
                  ForestNode::sequenceData(RID, Elements), Elements);
171
88.8M
  }
172
  ForestNode &createAmbiguous(SymbolID SID,
173
1.49M
                              llvm::ArrayRef<const ForestNode *> Alternatives) {
174
1.49M
    assert(!Alternatives.empty());
175
0
    assert(llvm::all_of(Alternatives,
176
1.49M
                        [SID](const ForestNode *Alternative) {
177
1.49M
                          return SID == Alternative->symbol();
178
1.49M
                        }) &&
179
1.49M
           "Ambiguous alternatives must represent the same symbol!");
180
0
    return create(ForestNode::Ambiguous, SID,
181
1.49M
                  Alternatives.front()->startTokenIndex(),
182
1.49M
                  ForestNode::ambiguousData(Alternatives), Alternatives);
183
1.49M
  }
184
9.53k
  ForestNode &createOpaque(SymbolID SID, Token::Index Start) {
185
9.53k
    return create(ForestNode::Opaque, SID, Start, 0, {});
186
9.53k
  }
187
188
0
  ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) {
189
0
    return create(ForestNode::Terminal, tokenSymbol(TK), Start, 0, {});
190
0
  }
191
192
0
  size_t nodeCount() const { return NodeCount; }
193
0
  size_t bytes() const { return Arena.getBytesAllocated() + sizeof(*this); }
194
195
private:
196
  ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
197
                     uint16_t Data,
198
90.3M
                     llvm::ArrayRef<const ForestNode *> Elements) {
199
90.3M
    ++NodeCount;
200
90.3M
    ForestNode *New = new (Arena.Allocate(
201
90.3M
        sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
202
90.3M
        alignof(ForestNode))) ForestNode(K, SID, Start, Data);
203
90.3M
    if (!Elements.empty())
204
90.3M
      llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1));
205
90.3M
    return *New;
206
90.3M
  }
207
208
  llvm::BumpPtrAllocator Arena;
209
  uint32_t NodeCount = 0;
210
};
211
212
class ForestNode::RecursiveIterator
213
    : public llvm::iterator_facade_base<ForestNode::RecursiveIterator,
214
                                        std::input_iterator_tag,
215
                                        const ForestNode> {
216
  llvm::DenseSet<const ForestNode *> Seen;
217
  struct StackFrame {
218
    const ForestNode *Parent;
219
    unsigned ChildIndex;
220
  };
221
  std::vector<StackFrame> Stack;
222
  const ForestNode *Cur;
223
224
public:
225
0
  RecursiveIterator(const ForestNode *N = nullptr) : Cur(N) {}
226
227
0
  const ForestNode &operator*() const { return *Cur; }
228
  void operator++();
229
0
  bool operator==(const RecursiveIterator &I) const { return Cur == I.Cur; }
230
0
  bool operator!=(const RecursiveIterator &I) const { return !(*this == I); }
231
};
232
233
} // namespace pseudo
234
} // namespace clang
235
236
#endif // CLANG_PSEUDO_FOREST_H