/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h

Source (jump to first uncovered line)
//===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a standard Generalized LR (GLR) parsing algorithm.
//
// The GLR parser behaves as a normal LR parser until it encounters a conflict.
// To handle a conflict (where there are multiple actions could perform), the
// parser will simulate nondeterminism by doing a breadth-first search
// over all the possibilities.
//
// Basic mechanisims of the GLR parser:
//  - A number of processes are operated in parallel.
//  - Each process has its own parsing stack and behaves as a standard
//    determinism LR parser.
//  - When a process encounters a conflict, it will be fork (one for each
//    avaiable action).
//  - When a process encounters an error, it is abandoned.
//  - All process are synchronized by the lookahead token: they perfrom shift
//    action at the same time, which means some processes need wait until other
//    processes have performed all reduce actions.
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_GLR_H
#define CLANG_PSEUDO_GLR_H

#include "clang-pseudo/Forest.h"
#include "clang-pseudo/Language.h"
#include "clang-pseudo/grammar/Grammar.h"
#include "clang-pseudo/grammar/LRTable.h"
#include "llvm/Support/Allocator.h"
#include <vector>

namespace clang {
namespace pseudo {

// A Graph-Structured Stack efficiently represents all parse stacks of a GLR
// parser.
//
// Each node stores a parse state, the last parsed ForestNode, and the parent
// node. There may be several heads (top of stack), and the parser operates by:
// - shift: pushing terminal symbols on top of the stack
// - reduce: replace N symbols on top of the stack with one nonterminal
//
// The structure is a DAG rather than a linear stack:
// - GLR allows multiple actions (conflicts) on the same head, producing forks
//   where several nodes have the same parent
// - The parser merges nodes with the same (state, ForestNode), producing joins
//   where one node has multiple parents
//
// The parser is responsible for creating nodes and keeping track of the set of
// heads. The GSS class is mostly an arena for them.
struct GSS {
  // A node represents a partial parse of the input up to some point.
  //
  // It is the equivalent of a frame in an LR parse stack.
  // Like such a frame, it has an LR parse state and a syntax-tree node
  // representing the last parsed symbol (a ForestNode in our case).
  // Unlike a regular LR stack frame, it may have multiple parents.
  //
  // Nodes are not exactly pushed and popped on the stack: pushing is just
  // allocating a new head node with a parent pointer to the old head. Popping
  // is just forgetting about a node and remembering its parent instead.
  struct alignas(struct Node *) Node {
    // LR state describing how parsing should continue from this head.
    LRTable::StateID State;
    // Used internally to track reachability during garbage collection.
    bool GCParity;
    // Have we already used this node for error recovery? (prevents loops)
    mutable bool Recovered = false;
    // Number of the parents of this node.
    // The parents hold previous parsed symbols, and may resume control after
    // this node is reduced.
    unsigned ParentCount;
    // The parse node for the last parsed symbol.
    // This symbol appears on the left of the dot in the parse state's items.
    // (In the literature, the node is attached to the *edge* to the parent).
    const ForestNode *Payload = nullptr;

    llvm::ArrayRef<const Node *> parents() const {
      return llvm::ArrayRef(reinterpret_cast<const Node *const *>(this + 1),
                            ParentCount);
    };
    // Parents are stored as a trailing array of Node*.
  };

  // Allocates a new node in the graph.
  const Node *addNode(LRTable::StateID State, const ForestNode *Symbol,
                      llvm::ArrayRef<const Node *> Parents);
  // Frees all nodes not reachable as ancestors of Roots, and returns the count.
  // Calling this periodically prevents steady memory growth of the GSS.
  unsigned gc(std::vector<const Node *> &&Roots);

  size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
  size_t nodesCreated() const { return NodesCreated; }

private:
  // Nodes are recycled using freelists.
  // They are variable size, so use one free-list per distinct #parents.
  std::vector<std::vector<Node *>> FreeList;
  Node *allocate(unsigned Parents);
  void destroy(Node *N);
  // The list of nodes created and not destroyed - our candidates for gc().
  std::vector<Node *> Alive;
  bool GCParity = false; // All nodes should match this, except during GC.

  llvm::BumpPtrAllocator Arena;
  unsigned NodesCreated = 0;
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &);

// Parameters for the GLR parsing.
struct ParseParams {
  // The token stream to parse.
  const TokenStream &Code;

  // Arena for data structure used by the GLR algorithm.
  ForestArena &Forest;  // Storage for the output forest.
  GSS &GSStack;         // Storage for parsing stacks.
};

// Parses the given token stream as the start symbol with the GLR algorithm,
// and returns a forest node of the start symbol.
//
// A rule `_ := StartSymbol` must exit for the chosen start symbol.
//
// If the parsing fails, we model it as an opaque node in the forest.
ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
                     const Language &Lang);

// Shift a token onto all OldHeads, placing the results into NewHeads.
//
// Exposed for testing only.
void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
              const ForestNode &NextTok, const ParseParams &Params,
              const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
// Applies available reductions on Heads, appending resulting heads to the list.
//
// Exposed for testing only.
void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
               const ParseParams &Params, const Language &Lang);

// Heuristically recover from a state where no further parsing is possible.
//
// OldHeads is the parse state at TokenIndex.
// This function consumes zero or more tokens by advancing TokenIndex,
// and places any recovery states created in NewHeads.
//
// On failure, NewHeads is empty and TokenIndex is unchanged.
//
// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
// there is a risk of the parser falling into an infinite loop, creating an
// endless sequence of recovery nodes.
// Generally it is safe for recovery to match 0 tokens against sequence symbols
// like `statement-seq`, as the grammar won't permit another statement-seq
// immediately afterwards. However recovery strategies for `statement` should
// consume at least one token, as statements may be adjacent in the input.
void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
                unsigned &TokenIndex, const ParseParams &Params,
                const Language &Lang, std::vector<const GSS::Node *> &NewHeads);

} // namespace pseudo
} // namespace clang

#endif // CLANG_PSEUDO_GLR_H

Line	Count	Source (jump to first uncovered line)
1		//===--- GLR.h - Implement a GLR parsing algorithm ---------------- C++--===//
2		//
3		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4		// See https://llvm.org/LICENSE.txt for license information.
5		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6		//
7		//===----------------------------------------------------------------------===//
8		//
9		// This implements a standard Generalized LR (GLR) parsing algorithm.
10		//
11		// The GLR parser behaves as a normal LR parser until it encounters a conflict.
12		// To handle a conflict (where there are multiple actions could perform), the
13		// parser will simulate nondeterminism by doing a breadth-first search
14		// over all the possibilities.
15		//
16		// Basic mechanisims of the GLR parser:
17		// - A number of processes are operated in parallel.
18		// - Each process has its own parsing stack and behaves as a standard
19		// determinism LR parser.
20		// - When a process encounters a conflict, it will be fork (one for each
21		// avaiable action).
22		// - When a process encounters an error, it is abandoned.
23		// - All process are synchronized by the lookahead token: they perfrom shift
24		// action at the same time, which means some processes need wait until other
25		// processes have performed all reduce actions.
26		//
27		//===----------------------------------------------------------------------===//
28
29		#ifndef CLANG_PSEUDO_GLR_H
30		#define CLANG_PSEUDO_GLR_H
31
32		#include "clang-pseudo/Forest.h"
33		#include "clang-pseudo/Language.h"
34		#include "clang-pseudo/grammar/Grammar.h"
35		#include "clang-pseudo/grammar/LRTable.h"
36		#include "llvm/Support/Allocator.h"
37		#include <vector>
38
39		namespace clang {
40		namespace pseudo {
41
42		// A Graph-Structured Stack efficiently represents all parse stacks of a GLR
43		// parser.
44		//
45		// Each node stores a parse state, the last parsed ForestNode, and the parent
46		// node. There may be several heads (top of stack), and the parser operates by:
47		// - shift: pushing terminal symbols on top of the stack
48		// - reduce: replace N symbols on top of the stack with one nonterminal
49		//
50		// The structure is a DAG rather than a linear stack:
51		// - GLR allows multiple actions (conflicts) on the same head, producing forks
52		// where several nodes have the same parent
53		// - The parser merges nodes with the same (state, ForestNode), producing joins
54		// where one node has multiple parents
55		//
56		// The parser is responsible for creating nodes and keeping track of the set of
57		// heads. The GSS class is mostly an arena for them.
58		struct GSS {
59		// A node represents a partial parse of the input up to some point.
60		//
61		// It is the equivalent of a frame in an LR parse stack.
62		// Like such a frame, it has an LR parse state and a syntax-tree node
63		// representing the last parsed symbol (a ForestNode in our case).
64		// Unlike a regular LR stack frame, it may have multiple parents.
65		//
66		// Nodes are not exactly pushed and popped on the stack: pushing is just
67		// allocating a new head node with a parent pointer to the old head. Popping
68		// is just forgetting about a node and remembering its parent instead.
69		struct alignas(struct Node *) Node {
70		// LR state describing how parsing should continue from this head.
71		LRTable::StateID State;
72		// Used internally to track reachability during garbage collection.
73		bool GCParity;
74		// Have we already used this node for error recovery? (prevents loops)
75		mutable bool Recovered = false;
76		// Number of the parents of this node.
77		// The parents hold previous parsed symbols, and may resume control after
78		// this node is reduced.
79		unsigned ParentCount;
80		// The parse node for the last parsed symbol.
81		// This symbol appears on the left of the dot in the parse state's items.
82		// (In the literature, the node is attached to the edge to the parent).
83		const ForestNode *Payload = nullptr;
84
85	498M	llvm::ArrayRef<const Node *> parents() const {
86	498M	return llvm::ArrayRef(reinterpret_cast<const Node const >(this + 1),
87	498M	ParentCount);
88	498M	};
89		// Parents are stored as a trailing array of Node*.
90		};
91
92		// Allocates a new node in the graph.
93		const Node addNode(LRTable::StateID State, const ForestNode Symbol,
94		llvm::ArrayRef<const Node *> Parents);
95		// Frees all nodes not reachable as ancestors of Roots, and returns the count.
96		// Calling this periodically prevents steady memory growth of the GSS.
97		unsigned gc(std::vector<const Node *> &&Roots);
98
99	0	size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
100	0	size_t nodesCreated() const { return NodesCreated; }
101
102		private:
103		// Nodes are recycled using freelists.
104		// They are variable size, so use one free-list per distinct #parents.
105		std::vector<std::vector<Node *>> FreeList;
106		Node *allocate(unsigned Parents);
107		void destroy(Node *N);
108		// The list of nodes created and not destroyed - our candidates for gc().
109		std::vector<Node *> Alive;
110		bool GCParity = false; // All nodes should match this, except during GC.
111
112		llvm::BumpPtrAllocator Arena;
113		unsigned NodesCreated = 0;
114		};
115		llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &);
116
117		// Parameters for the GLR parsing.
118		struct ParseParams {
119		// The token stream to parse.
120		const TokenStream &Code;
121
122		// Arena for data structure used by the GLR algorithm.
123		ForestArena &Forest; // Storage for the output forest.
124		GSS &GSStack; // Storage for parsing stacks.
125		};
126
127		// Parses the given token stream as the start symbol with the GLR algorithm,
128		// and returns a forest node of the start symbol.
129		//
130		// A rule `_ := StartSymbol` must exit for the chosen start symbol.
131		//
132		// If the parsing fails, we model it as an opaque node in the forest.
133		ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
134		const Language &Lang);
135
136		// Shift a token onto all OldHeads, placing the results into NewHeads.
137		//
138		// Exposed for testing only.
139		void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
140		const ForestNode &NextTok, const ParseParams &Params,
141		const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
142		// Applies available reductions on Heads, appending resulting heads to the list.
143		//
144		// Exposed for testing only.
145		void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
146		const ParseParams &Params, const Language &Lang);
147
148		// Heuristically recover from a state where no further parsing is possible.
149		//
150		// OldHeads is the parse state at TokenIndex.
151		// This function consumes zero or more tokens by advancing TokenIndex,
152		// and places any recovery states created in NewHeads.
153		//
154		// On failure, NewHeads is empty and TokenIndex is unchanged.
155		//
156		// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
157		// there is a risk of the parser falling into an infinite loop, creating an
158		// endless sequence of recovery nodes.
159		// Generally it is safe for recovery to match 0 tokens against sequence symbols
160		// like `statement-seq`, as the grammar won't permit another statement-seq
161		// immediately afterwards. However recovery strategies for `statement` should
162		// consume at least one token, as statements may be adjacent in the input.
163		void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
164		unsigned &TokenIndex, const ParseParams &Params,
165		const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
166
167		} // namespace pseudo
168		} // namespace clang
169
170		#endif // CLANG_PSEUDO_GLR_H

Coverage Report

Created: 2024-01-17 10:31