/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h

Source (jump to first uncovered line)
//===--- LRGraph.h - Build an LR automaton  ------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  LR parsers are bottom-up parsers -- they scan the input from left to right,
//  and collect the right-hand side of a production rule (called handle) on top
//  of the stack, then replace (reduce) the handle with the nonterminal defined
//  by the production rule.
//
//  This file defines LRGraph, a deterministic handle-finding finite-state
//  automaton, which is a key component in LR parsers to recognize any of
//  handles in the grammar efficiently. We build the LR table (ACTION and GOTO
//  Table) based on the LRGraph.
//
//  LRGraph can be constructed for any context-free grammars.
//  Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but
//  interpretation of the FSA is nondeterministic -- we might in a state where
//  we can continue searching an handle and identify a handle (called
//  shift/reduce conflicts), or identify more than one handle (callled
//  reduce/reduce conflicts).
//
//  LRGraph is a common model for all variants of LR automatons, from the most
//  basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead
//  in making decisions.
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
#define CLANG_PSEUDO_GRAMMAR_LRGRAPH_H

#include "clang-pseudo/grammar/Grammar.h"
#include "llvm/ADT/Hashing.h"
#include <vector>

namespace clang {
namespace pseudo {

// An LR item -- a grammar rule with a dot at some position of the body.
// e.g. a production rule A := X Y yields 3 items:
//   A := . X Y
//   A := X . Y
//   A := X Y .
// An item indicates how much of a production rule has been recognized at a
// position (described by dot), for example, A := X . Y indicates that we have
// recognized the X part from the input, and we hope next to see the input
// derivable from Y.
class Item {
public:
  static Item start(RuleID ID, const Grammar &G) {
    Item I;
    I.RID = ID;
    I.RuleLength = G.lookupRule(ID).Size;
    return I;
  }
  static Item sentinel(RuleID ID) {
    Item I;
    I.RID = ID;
    return I;
  }

  RuleID rule() const { return RID; }
  uint8_t dot() const { return DotPos; }

  bool hasNext() const { return DotPos < RuleLength; }
  SymbolID next(const Grammar &G) const {
    assert(hasNext());
    return G.lookupRule(RID).Sequence[DotPos];
  }

  Item advance() const {
    assert(hasNext());
    Item I = *this;
    ++I.DotPos;
    return I;
  }

  std::string dump(const Grammar &G) const;

  bool operator==(const Item &I) const {
    return DotPos == I.DotPos && RID == I.RID;
  }
  bool operator<(const Item &I) const {
    return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos);
  }
  friend llvm::hash_code hash_value(const Item &I) {
    return llvm::hash_combine(I.RID, I.DotPos);
  }

private:
  RuleID RID = 0;
  uint8_t DotPos = 0;
  uint8_t RuleLength = 0; // the length of rule body.
};

// A state represents a node in the LR automaton graph. It is an item set, which
// contains all possible rules that the LR parser may be parsing in that state.
//
// Conceptually, If we knew in advance what we're parsing, at any point we're
// partway through parsing a production, sitting on a stack of partially parsed
// productions. But because we don't know, there could be *several* productions
// we're partway through. The set of possibilities is the parser state, and we
// precompute all the transitions between these states.
struct State {
  // A full set of items (including non-kernel items) representing the state,
  // in a canonical order (see SortByNextSymbol in the cpp file).
  std::vector<Item> Items;

  std::string dump(const Grammar &G, unsigned Indent = 0) const;
};

// LRGraph is a deterministic finite state automaton for LR parsing.
//
// Intuitively, an LR automaton is a transition graph. The graph has a
// collection of nodes, called States. Each state corresponds to a particular
// item set, which represents a condition that could occur during the process of
// parsing a production. Edges are directed from one state to another. Each edge
// is labeled by a grammar symbol (terminal or nonterminal).
//
// LRGraph is used to construct the LR parsing table which is a core
// data-structure driving the LR parser.
class LRGraph {
public:
  // StateID is the index in States table.
  using StateID = uint16_t;

  // Constructs an LR(0) automaton.
  static LRGraph buildLR0(const Grammar &);

  // An edge in the LR graph, it represents a transition in the LR automaton.
  // If the parser is at state Src, with a lookahead Label, then it
  // transits to state Dst.
  struct Edge {
    StateID Src, Dst;
    SymbolID Label;
  };

  // A possible error recovery: choose to match some tokens against a symbol.
  //
  // e.g. a state that contains
  //   stmt := { . stmt-seq [recover=braces] }
  // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
  struct Recovery {
    StateID Src; // The state we are in when encountering the error.
    ExtensionID Strategy;      // Heuristic choosing the tokens to match.
    SymbolID Result;           // The symbol that is produced.
  };

  llvm::ArrayRef<State> states() const { return States; }
  llvm::ArrayRef<Edge> edges() const { return Edges; }
  llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
  llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
    return StartStates;
  }

  std::string dumpForTests(const Grammar &) const;

private:
  LRGraph(std::vector<State> States, std::vector<Edge> Edges,
          std::vector<Recovery> Recoveries,
          std::vector<std::pair<SymbolID, StateID>> StartStates)
      : States(std::move(States)), Edges(std::move(Edges)),
        Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
  }

  std::vector<State> States;
  std::vector<Edge> Edges;
  std::vector<Recovery> Recoveries;
  std::vector<std::pair<SymbolID, StateID>> StartStates;
};

} // namespace pseudo
} // namespace clang

namespace llvm {
// Support clang::pseudo::Item as DenseMap keys.
template <> struct DenseMapInfo<clang::pseudo::Item> {
  static inline clang::pseudo::Item getEmptyKey() {
    return clang::pseudo::Item::sentinel(-1);
  }
  static inline clang::pseudo::Item getTombstoneKey() {
    return clang::pseudo::Item::sentinel(-2);
  }
  static unsigned getHashValue(const clang::pseudo::Item &I) {
    return hash_value(I);
  }
  static bool isEqual(const clang::pseudo::Item &LHS,
                      const clang::pseudo::Item &RHS) {
    return LHS == RHS;
  }
};
} // namespace llvm

#endif // CLANG_PSEUDO_GRAMMAR_LRGRAPH_H

Line	Count	Source (jump to first uncovered line)
1		//===--- LRGraph.h - Build an LR automaton ------------------- C++--===//
2		//
3		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4		// See https://llvm.org/LICENSE.txt for license information.
5		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6		//
7		//===----------------------------------------------------------------------===//
8		//
9		// LR parsers are bottom-up parsers -- they scan the input from left to right,
10		// and collect the right-hand side of a production rule (called handle) on top
11		// of the stack, then replace (reduce) the handle with the nonterminal defined
12		// by the production rule.
13		//
14		// This file defines LRGraph, a deterministic handle-finding finite-state
15		// automaton, which is a key component in LR parsers to recognize any of
16		// handles in the grammar efficiently. We build the LR table (ACTION and GOTO
17		// Table) based on the LRGraph.
18		//
19		// LRGraph can be constructed for any context-free grammars.
20		// Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but
21		// interpretation of the FSA is nondeterministic -- we might in a state where
22		// we can continue searching an handle and identify a handle (called
23		// shift/reduce conflicts), or identify more than one handle (callled
24		// reduce/reduce conflicts).
25		//
26		// LRGraph is a common model for all variants of LR automatons, from the most
27		// basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead
28		// in making decisions.
29		//===----------------------------------------------------------------------===//
30
31		#ifndef CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
32		#define CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
33
34		#include "clang-pseudo/grammar/Grammar.h"
35		#include "llvm/ADT/Hashing.h"
36		#include <vector>
37
38		namespace clang {
39		namespace pseudo {
40
41		// An LR item -- a grammar rule with a dot at some position of the body.
42		// e.g. a production rule A := X Y yields 3 items:
43		// A := . X Y
44		// A := X . Y
45		// A := X Y .
46		// An item indicates how much of a production rule has been recognized at a
47		// position (described by dot), for example, A := X . Y indicates that we have
48		// recognized the X part from the input, and we hope next to see the input
49		// derivable from Y.
50		class Item {
51		public:
52	120k	static Item start(RuleID ID, const Grammar &G) {
53	120k	Item I;
54	120k	I.RID = ID;
55	120k	I.RuleLength = G.lookupRule(ID).Size;
56	120k	return I;
57	120k	}
58	491k	static Item sentinel(RuleID ID) {
59	491k	Item I;
60	491k	I.RID = ID;
61	491k	return I;
62	491k	}
63
64	105k	RuleID rule() const { return RID; }
65	52.2k	uint8_t dot() const { return DotPos; }
66
67	3.26M	bool hasNext() const { return DotPos < RuleLength; }
68	1.96M	SymbolID next(const Grammar &G) const {
69	1.96M	assert(hasNext());
70	0	return G.lookupRule(RID).Sequence[DotPos];
71	1.96M	}
72
73	51.2k	Item advance() const {
74	51.2k	assert(hasNext());
75	0	Item I = *this;
76	51.2k	++I.DotPos;
77	51.2k	return I;
78	51.2k	}
79
80		std::string dump(const Grammar &G) const;
81
82	1.47M	bool operator==(const Item &I) const {
83	1.47M	return DotPos == I.DotPos && RID == I.RID;
84	1.47M	}
85	98.1k	bool operator<(const Item &I) const {
86	98.1k	return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos);
87	98.1k	}
88	242k	friend llvm::hash_code hash_value(const Item &I) {
89	242k	return llvm::hash_combine(I.RID, I.DotPos);
90	242k	}
91
92		private:
93		RuleID RID = 0;
94		uint8_t DotPos = 0;
95		uint8_t RuleLength = 0; // the length of rule body.
96		};
97
98		// A state represents a node in the LR automaton graph. It is an item set, which
99		// contains all possible rules that the LR parser may be parsing in that state.
100		//
101		// Conceptually, If we knew in advance what we're parsing, at any point we're
102		// partway through parsing a production, sitting on a stack of partially parsed
103		// productions. But because we don't know, there could be several productions
104		// we're partway through. The set of possibilities is the parser state, and we
105		// precompute all the transitions between these states.
106		struct State {
107		// A full set of items (including non-kernel items) representing the state,
108		// in a canonical order (see SortByNextSymbol in the cpp file).
109		std::vector<Item> Items;
110
111		std::string dump(const Grammar &G, unsigned Indent = 0) const;
112		};
113
114		// LRGraph is a deterministic finite state automaton for LR parsing.
115		//
116		// Intuitively, an LR automaton is a transition graph. The graph has a
117		// collection of nodes, called States. Each state corresponds to a particular
118		// item set, which represents a condition that could occur during the process of
119		// parsing a production. Edges are directed from one state to another. Each edge
120		// is labeled by a grammar symbol (terminal or nonterminal).
121		//
122		// LRGraph is used to construct the LR parsing table which is a core
123		// data-structure driving the LR parser.
124		class LRGraph {
125		public:
126		// StateID is the index in States table.
127		using StateID = uint16_t;
128
129		// Constructs an LR(0) automaton.
130		static LRGraph buildLR0(const Grammar &);
131
132		// An edge in the LR graph, it represents a transition in the LR automaton.
133		// If the parser is at state Src, with a lookahead Label, then it
134		// transits to state Dst.
135		struct Edge {
136		StateID Src, Dst;
137		SymbolID Label;
138		};
139
140		// A possible error recovery: choose to match some tokens against a symbol.
141		//
142		// e.g. a state that contains
143		// stmt := { . stmt-seq [recover=braces] }
144		// has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
145		struct Recovery {
146		StateID Src; // The state we are in when encountering the error.
147		ExtensionID Strategy; // Heuristic choosing the tokens to match.
148		SymbolID Result; // The symbol that is produced.
149		};
150
151	2.95k	llvm::ArrayRef<State> states() const { return States; }
152	1	llvm::ArrayRef<Edge> edges() const { return Edges; }
153	1	llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
154	1	llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
155	1	return StartStates;
156	1	}
157
158		std::string dumpForTests(const Grammar &) const;
159
160		private:
161		LRGraph(std::vector<State> States, std::vector<Edge> Edges,
162		std::vector<Recovery> Recoveries,
163		std::vector<std::pair<SymbolID, StateID>> StartStates)
164		: States(std::move(States)), Edges(std::move(Edges)),
165	1	Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
166	1	}
167
168		std::vector<State> States;
169		std::vector<Edge> Edges;
170		std::vector<Recovery> Recoveries;
171		std::vector<std::pair<SymbolID, StateID>> StartStates;
172		};
173
174		} // namespace pseudo
175		} // namespace clang
176
177		namespace llvm {
178		// Support clang::pseudo::Item as DenseMap keys.
179		template <> struct DenseMapInfo<clang::pseudo::Item> {
180	274k	static inline clang::pseudo::Item getEmptyKey() {
181	274k	return clang::pseudo::Item::sentinel(-1);
182	274k	}
183	217k	static inline clang::pseudo::Item getTombstoneKey() {
184	217k	return clang::pseudo::Item::sentinel(-2);
185	217k	}
186	185k	static unsigned getHashValue(const clang::pseudo::Item &I) {
187	185k	return hash_value(I);
188	185k	}
189		static bool isEqual(const clang::pseudo::Item &LHS,
190	1.35M	const clang::pseudo::Item &RHS) {
191	1.35M	return LHS == RHS;
192	1.35M	}
193		};
194		} // namespace llvm
195
196		#endif // CLANG_PSEUDO_GRAMMAR_LRGRAPH_H

Coverage Report

Created: 2024-01-17 10:31