/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This implements a standard Generalized LR (GLR) parsing algorithm. |
10 | | // |
11 | | // The GLR parser behaves as a normal LR parser until it encounters a conflict. |
12 | | // To handle a conflict (where there are multiple actions could perform), the |
13 | | // parser will simulate nondeterminism by doing a breadth-first search |
14 | | // over all the possibilities. |
15 | | // |
16 | | // Basic mechanisims of the GLR parser: |
17 | | // - A number of processes are operated in parallel. |
18 | | // - Each process has its own parsing stack and behaves as a standard |
19 | | // determinism LR parser. |
20 | | // - When a process encounters a conflict, it will be fork (one for each |
21 | | // avaiable action). |
22 | | // - When a process encounters an error, it is abandoned. |
23 | | // - All process are synchronized by the lookahead token: they perfrom shift |
24 | | // action at the same time, which means some processes need wait until other |
25 | | // processes have performed all reduce actions. |
26 | | // |
27 | | //===----------------------------------------------------------------------===// |
28 | | |
29 | | #ifndef CLANG_PSEUDO_GLR_H |
30 | | #define CLANG_PSEUDO_GLR_H |
31 | | |
32 | | #include "clang-pseudo/Forest.h" |
33 | | #include "clang-pseudo/Language.h" |
34 | | #include "clang-pseudo/grammar/Grammar.h" |
35 | | #include "clang-pseudo/grammar/LRTable.h" |
36 | | #include "llvm/Support/Allocator.h" |
37 | | #include <vector> |
38 | | |
39 | | namespace clang { |
40 | | namespace pseudo { |
41 | | |
42 | | // A Graph-Structured Stack efficiently represents all parse stacks of a GLR |
43 | | // parser. |
44 | | // |
45 | | // Each node stores a parse state, the last parsed ForestNode, and the parent |
46 | | // node. There may be several heads (top of stack), and the parser operates by: |
47 | | // - shift: pushing terminal symbols on top of the stack |
48 | | // - reduce: replace N symbols on top of the stack with one nonterminal |
49 | | // |
50 | | // The structure is a DAG rather than a linear stack: |
51 | | // - GLR allows multiple actions (conflicts) on the same head, producing forks |
52 | | // where several nodes have the same parent |
53 | | // - The parser merges nodes with the same (state, ForestNode), producing joins |
54 | | // where one node has multiple parents |
55 | | // |
56 | | // The parser is responsible for creating nodes and keeping track of the set of |
57 | | // heads. The GSS class is mostly an arena for them. |
58 | | struct GSS { |
59 | | // A node represents a partial parse of the input up to some point. |
60 | | // |
61 | | // It is the equivalent of a frame in an LR parse stack. |
62 | | // Like such a frame, it has an LR parse state and a syntax-tree node |
63 | | // representing the last parsed symbol (a ForestNode in our case). |
64 | | // Unlike a regular LR stack frame, it may have multiple parents. |
65 | | // |
66 | | // Nodes are not exactly pushed and popped on the stack: pushing is just |
67 | | // allocating a new head node with a parent pointer to the old head. Popping |
68 | | // is just forgetting about a node and remembering its parent instead. |
69 | | struct alignas(struct Node *) Node { |
70 | | // LR state describing how parsing should continue from this head. |
71 | | LRTable::StateID State; |
72 | | // Used internally to track reachability during garbage collection. |
73 | | bool GCParity; |
74 | | // Have we already used this node for error recovery? (prevents loops) |
75 | | mutable bool Recovered = false; |
76 | | // Number of the parents of this node. |
77 | | // The parents hold previous parsed symbols, and may resume control after |
78 | | // this node is reduced. |
79 | | unsigned ParentCount; |
80 | | // The parse node for the last parsed symbol. |
81 | | // This symbol appears on the left of the dot in the parse state's items. |
82 | | // (In the literature, the node is attached to the *edge* to the parent). |
83 | | const ForestNode *Payload = nullptr; |
84 | | |
85 | 498M | llvm::ArrayRef<const Node *> parents() const { |
86 | 498M | return llvm::ArrayRef(reinterpret_cast<const Node *const *>(this + 1), |
87 | 498M | ParentCount); |
88 | 498M | }; |
89 | | // Parents are stored as a trailing array of Node*. |
90 | | }; |
91 | | |
92 | | // Allocates a new node in the graph. |
93 | | const Node *addNode(LRTable::StateID State, const ForestNode *Symbol, |
94 | | llvm::ArrayRef<const Node *> Parents); |
95 | | // Frees all nodes not reachable as ancestors of Roots, and returns the count. |
96 | | // Calling this periodically prevents steady memory growth of the GSS. |
97 | | unsigned gc(std::vector<const Node *> &&Roots); |
98 | | |
99 | 0 | size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); } |
100 | 0 | size_t nodesCreated() const { return NodesCreated; } |
101 | | |
102 | | private: |
103 | | // Nodes are recycled using freelists. |
104 | | // They are variable size, so use one free-list per distinct #parents. |
105 | | std::vector<std::vector<Node *>> FreeList; |
106 | | Node *allocate(unsigned Parents); |
107 | | void destroy(Node *N); |
108 | | // The list of nodes created and not destroyed - our candidates for gc(). |
109 | | std::vector<Node *> Alive; |
110 | | bool GCParity = false; // All nodes should match this, except during GC. |
111 | | |
112 | | llvm::BumpPtrAllocator Arena; |
113 | | unsigned NodesCreated = 0; |
114 | | }; |
115 | | llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &); |
116 | | |
117 | | // Parameters for the GLR parsing. |
118 | | struct ParseParams { |
119 | | // The token stream to parse. |
120 | | const TokenStream &Code; |
121 | | |
122 | | // Arena for data structure used by the GLR algorithm. |
123 | | ForestArena &Forest; // Storage for the output forest. |
124 | | GSS &GSStack; // Storage for parsing stacks. |
125 | | }; |
126 | | |
127 | | // Parses the given token stream as the start symbol with the GLR algorithm, |
128 | | // and returns a forest node of the start symbol. |
129 | | // |
130 | | // A rule `_ := StartSymbol` must exit for the chosen start symbol. |
131 | | // |
132 | | // If the parsing fails, we model it as an opaque node in the forest. |
133 | | ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol, |
134 | | const Language &Lang); |
135 | | |
136 | | // Shift a token onto all OldHeads, placing the results into NewHeads. |
137 | | // |
138 | | // Exposed for testing only. |
139 | | void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads, |
140 | | const ForestNode &NextTok, const ParseParams &Params, |
141 | | const Language &Lang, std::vector<const GSS::Node *> &NewHeads); |
142 | | // Applies available reductions on Heads, appending resulting heads to the list. |
143 | | // |
144 | | // Exposed for testing only. |
145 | | void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead, |
146 | | const ParseParams &Params, const Language &Lang); |
147 | | |
148 | | // Heuristically recover from a state where no further parsing is possible. |
149 | | // |
150 | | // OldHeads is the parse state at TokenIndex. |
151 | | // This function consumes zero or more tokens by advancing TokenIndex, |
152 | | // and places any recovery states created in NewHeads. |
153 | | // |
154 | | // On failure, NewHeads is empty and TokenIndex is unchanged. |
155 | | // |
156 | | // WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens, |
157 | | // there is a risk of the parser falling into an infinite loop, creating an |
158 | | // endless sequence of recovery nodes. |
159 | | // Generally it is safe for recovery to match 0 tokens against sequence symbols |
160 | | // like `statement-seq`, as the grammar won't permit another statement-seq |
161 | | // immediately afterwards. However recovery strategies for `statement` should |
162 | | // consume at least one token, as statements may be adjacent in the input. |
163 | | void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads, |
164 | | unsigned &TokenIndex, const ParseParams &Params, |
165 | | const Language &Lang, std::vector<const GSS::Node *> &NewHeads); |
166 | | |
167 | | } // namespace pseudo |
168 | | } // namespace clang |
169 | | |
170 | | #endif // CLANG_PSEUDO_GLR_H |