Coverage Report

Created: 2024-01-17 10:31

/src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
Line
Count
Source (jump to first uncovered line)
1
//===--- LRGraph.h - Build an LR automaton  ------------------*- C++-*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
//  LR parsers are bottom-up parsers -- they scan the input from left to right,
10
//  and collect the right-hand side of a production rule (called handle) on top
11
//  of the stack, then replace (reduce) the handle with the nonterminal defined
12
//  by the production rule.
13
//
14
//  This file defines LRGraph, a deterministic handle-finding finite-state
15
//  automaton, which is a key component in LR parsers to recognize any of
16
//  handles in the grammar efficiently. We build the LR table (ACTION and GOTO
17
//  Table) based on the LRGraph.
18
//
19
//  LRGraph can be constructed for any context-free grammars.
20
//  Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but
21
//  interpretation of the FSA is nondeterministic -- we might in a state where
22
//  we can continue searching an handle and identify a handle (called
23
//  shift/reduce conflicts), or identify more than one handle (callled
24
//  reduce/reduce conflicts).
25
//
26
//  LRGraph is a common model for all variants of LR automatons, from the most
27
//  basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead
28
//  in making decisions.
29
//===----------------------------------------------------------------------===//
30
31
#ifndef CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
32
#define CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
33
34
#include "clang-pseudo/grammar/Grammar.h"
35
#include "llvm/ADT/Hashing.h"
36
#include <vector>
37
38
namespace clang {
39
namespace pseudo {
40
41
// An LR item -- a grammar rule with a dot at some position of the body.
42
// e.g. a production rule A := X Y yields 3 items:
43
//   A := . X Y
44
//   A := X . Y
45
//   A := X Y .
46
// An item indicates how much of a production rule has been recognized at a
47
// position (described by dot), for example, A := X . Y indicates that we have
48
// recognized the X part from the input, and we hope next to see the input
49
// derivable from Y.
50
class Item {
51
public:
52
120k
  static Item start(RuleID ID, const Grammar &G) {
53
120k
    Item I;
54
120k
    I.RID = ID;
55
120k
    I.RuleLength = G.lookupRule(ID).Size;
56
120k
    return I;
57
120k
  }
58
491k
  static Item sentinel(RuleID ID) {
59
491k
    Item I;
60
491k
    I.RID = ID;
61
491k
    return I;
62
491k
  }
63
64
105k
  RuleID rule() const { return RID; }
65
52.2k
  uint8_t dot() const { return DotPos; }
66
67
3.26M
  bool hasNext() const { return DotPos < RuleLength; }
68
1.96M
  SymbolID next(const Grammar &G) const {
69
1.96M
    assert(hasNext());
70
0
    return G.lookupRule(RID).Sequence[DotPos];
71
1.96M
  }
72
73
51.2k
  Item advance() const {
74
51.2k
    assert(hasNext());
75
0
    Item I = *this;
76
51.2k
    ++I.DotPos;
77
51.2k
    return I;
78
51.2k
  }
79
80
  std::string dump(const Grammar &G) const;
81
82
1.47M
  bool operator==(const Item &I) const {
83
1.47M
    return DotPos == I.DotPos && RID == I.RID;
84
1.47M
  }
85
98.1k
  bool operator<(const Item &I) const {
86
98.1k
    return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos);
87
98.1k
  }
88
242k
  friend llvm::hash_code hash_value(const Item &I) {
89
242k
    return llvm::hash_combine(I.RID, I.DotPos);
90
242k
  }
91
92
private:
93
  RuleID RID = 0;
94
  uint8_t DotPos = 0;
95
  uint8_t RuleLength = 0; // the length of rule body.
96
};
97
98
// A state represents a node in the LR automaton graph. It is an item set, which
99
// contains all possible rules that the LR parser may be parsing in that state.
100
//
101
// Conceptually, If we knew in advance what we're parsing, at any point we're
102
// partway through parsing a production, sitting on a stack of partially parsed
103
// productions. But because we don't know, there could be *several* productions
104
// we're partway through. The set of possibilities is the parser state, and we
105
// precompute all the transitions between these states.
106
struct State {
107
  // A full set of items (including non-kernel items) representing the state,
108
  // in a canonical order (see SortByNextSymbol in the cpp file).
109
  std::vector<Item> Items;
110
111
  std::string dump(const Grammar &G, unsigned Indent = 0) const;
112
};
113
114
// LRGraph is a deterministic finite state automaton for LR parsing.
115
//
116
// Intuitively, an LR automaton is a transition graph. The graph has a
117
// collection of nodes, called States. Each state corresponds to a particular
118
// item set, which represents a condition that could occur during the process of
119
// parsing a production. Edges are directed from one state to another. Each edge
120
// is labeled by a grammar symbol (terminal or nonterminal).
121
//
122
// LRGraph is used to construct the LR parsing table which is a core
123
// data-structure driving the LR parser.
124
class LRGraph {
125
public:
126
  // StateID is the index in States table.
127
  using StateID = uint16_t;
128
129
  // Constructs an LR(0) automaton.
130
  static LRGraph buildLR0(const Grammar &);
131
132
  // An edge in the LR graph, it represents a transition in the LR automaton.
133
  // If the parser is at state Src, with a lookahead Label, then it
134
  // transits to state Dst.
135
  struct Edge {
136
    StateID Src, Dst;
137
    SymbolID Label;
138
  };
139
140
  // A possible error recovery: choose to match some tokens against a symbol.
141
  //
142
  // e.g. a state that contains
143
  //   stmt := { . stmt-seq [recover=braces] }
144
  // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
145
  struct Recovery {
146
    StateID Src; // The state we are in when encountering the error.
147
    ExtensionID Strategy;      // Heuristic choosing the tokens to match.
148
    SymbolID Result;           // The symbol that is produced.
149
  };
150
151
2.95k
  llvm::ArrayRef<State> states() const { return States; }
152
1
  llvm::ArrayRef<Edge> edges() const { return Edges; }
153
1
  llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
154
1
  llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
155
1
    return StartStates;
156
1
  }
157
158
  std::string dumpForTests(const Grammar &) const;
159
160
private:
161
  LRGraph(std::vector<State> States, std::vector<Edge> Edges,
162
          std::vector<Recovery> Recoveries,
163
          std::vector<std::pair<SymbolID, StateID>> StartStates)
164
      : States(std::move(States)), Edges(std::move(Edges)),
165
1
        Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
166
1
  }
167
168
  std::vector<State> States;
169
  std::vector<Edge> Edges;
170
  std::vector<Recovery> Recoveries;
171
  std::vector<std::pair<SymbolID, StateID>> StartStates;
172
};
173
174
} // namespace pseudo
175
} // namespace clang
176
177
namespace llvm {
178
// Support clang::pseudo::Item as DenseMap keys.
179
template <> struct DenseMapInfo<clang::pseudo::Item> {
180
274k
  static inline clang::pseudo::Item getEmptyKey() {
181
274k
    return clang::pseudo::Item::sentinel(-1);
182
274k
  }
183
217k
  static inline clang::pseudo::Item getTombstoneKey() {
184
217k
    return clang::pseudo::Item::sentinel(-2);
185
217k
  }
186
185k
  static unsigned getHashValue(const clang::pseudo::Item &I) {
187
185k
    return hash_value(I);
188
185k
  }
189
  static bool isEqual(const clang::pseudo::Item &LHS,
190
1.35M
                      const clang::pseudo::Item &RHS) {
191
1.35M
    return LHS == RHS;
192
1.35M
  }
193
};
194
} // namespace llvm
195
196
#endif // CLANG_PSEUDO_GRAMMAR_LRGRAPH_H