Coverage Report

Created: 2024-01-17 10:31

/src/llvm-project/clang-tools-extra/pseudo/lib/GLR.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- GLR.cpp   -----------------------------------------------*- C++-*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
9
#include "clang-pseudo/GLR.h"
10
#include "clang-pseudo/Language.h"
11
#include "clang-pseudo/grammar/Grammar.h"
12
#include "clang-pseudo/grammar/LRTable.h"
13
#include "clang/Basic/TokenKinds.h"
14
#include "llvm/ADT/ArrayRef.h"
15
#include "llvm/ADT/STLExtras.h"
16
#include "llvm/ADT/ScopeExit.h"
17
#include "llvm/ADT/StringExtras.h"
18
#include "llvm/Support/Debug.h"
19
#include "llvm/Support/FormatVariadic.h"
20
#include <algorithm>
21
#include <memory>
22
#include <optional>
23
#include <queue>
24
25
#define DEBUG_TYPE "GLR.cpp"
26
27
namespace clang {
28
namespace pseudo {
29
namespace {
30
31
Token::Index findRecoveryEndpoint(ExtensionID Strategy, Token::Index Begin,
32
                                  const TokenStream &Tokens,
33
389k
                                  const Language &Lang) {
34
389k
  assert(Strategy != 0);
35
389k
  if (auto S = Lang.RecoveryStrategies.lookup(Strategy))
36
389k
    return S(Begin, Tokens);
37
0
  return Token::Invalid;
38
389k
}
39
40
} // namespace
41
42
void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
43
                unsigned &TokenIndex, const ParseParams &Params,
44
                const Language &Lang,
45
9.53k
                std::vector<const GSS::Node *> &NewHeads) {
46
9.53k
  LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
47
  // Describes a possibility to recover by forcibly interpreting a range of
48
  // tokens around the cursor as a nonterminal that we expected to see.
49
9.53k
  struct PlaceholderRecovery {
50
    // The token prior to the nonterminal which is being recovered.
51
    // This starts of the region we're skipping, so higher Position is better.
52
9.53k
    Token::Index Position;
53
    // The nonterminal which will be created in order to recover.
54
9.53k
    SymbolID Symbol;
55
    // The heuristic used to choose the bounds of the nonterminal to recover.
56
9.53k
    ExtensionID Strategy;
57
58
    // The GSS head where we are expecting the recovered nonterminal.
59
9.53k
    const GSS::Node *RecoveryNode;
60
    // Payload of nodes on the way back from the OldHead to the recovery node.
61
    // These represent the partial parse that is being discarded.
62
    // They should become the children of the opaque recovery node.
63
    // FIXME: internal structure of opaque nodes is not implemented.
64
    //
65
    // There may be multiple paths leading to the same recovery node, we choose
66
    // one arbitrarily.
67
9.53k
    std::vector<const ForestNode *> DiscardedParse;
68
9.53k
  };
69
9.53k
  std::vector<PlaceholderRecovery> Options;
70
71
  // Find recovery options by walking up the stack.
72
  //
73
  // This is similar to exception handling: we walk up the "frames" of nested
74
  // rules being parsed until we find one that has a "handler" which allows us
75
  // to determine the node bounds without parsing it.
76
  //
77
  // Unfortunately there's a significant difference: the stack contains both
78
  // "upward" nodes (ancestor parses) and "leftward" ones.
79
  // e.g. when parsing `{ if (1) ? }` as compound-stmt, the stack contains:
80
  //   stmt := IF ( expr ) . stmt      - current state, we should recover here!
81
  //   stmt := IF ( expr . ) stmt      - (left, no recovery here)
82
  //   stmt := IF ( . expr ) stmt      - left, we should NOT recover here!
83
  //   stmt := IF . ( expr ) stmt      - (left, no recovery here)
84
  //   stmt-seq := . stmt              - up, we might recover here
85
  //   compound-stmt := { . stmt-seq } - up, we should recover here!
86
  //
87
  // It's not obvious how to avoid collecting "leftward" recovery options.
88
  // I think the distinction is ill-defined after merging items into states.
89
  // For now, we have to take this into account when defining recovery rules.
90
  // (e.g. in the expr recovery above, stay inside the parentheses).
91
  // FIXME: find a more satisfying way to avoid such false recovery.
92
  // FIXME: Add a test for spurious recovery once tests can define strategies.
93
9.53k
  std::vector<const ForestNode *> Path;
94
9.53k
  llvm::DenseSet<const GSS::Node *> Seen;
95
32.5M
  auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) {
96
32.5M
    if (!Seen.insert(N).second)
97
22.3M
      return;
98
10.2M
    if (!N->Recovered) { // Don't recover the same way twice!
99
10.2M
      for (auto Strategy : Lang.Table.getRecovery(N->State)) {
100
389k
        Options.push_back(PlaceholderRecovery{
101
389k
            NextTok,
102
389k
            Strategy.Result,
103
389k
            Strategy.Strategy,
104
389k
            N,
105
389k
            Path,
106
389k
        });
107
389k
        LLVM_DEBUG(llvm::dbgs()
108
389k
                   << "Option: recover " << Lang.G.symbolName(Strategy.Result)
109
389k
                   << " at token " << NextTok << "\n");
110
389k
      }
111
10.2M
    }
112
10.2M
    Path.push_back(N->Payload);
113
10.2M
    for (const GSS::Node *Parent : N->parents())
114
31.4M
      WalkUp(Parent, N->Payload->startTokenIndex(), WalkUp);
115
10.2M
    Path.pop_back();
116
10.2M
  };
117
9.53k
  for (auto *N : OldHeads)
118
1.10M
    WalkUp(N, TokenIndex, WalkUp);
119
120
  // Now we select the option(s) we will use to recover.
121
  //
122
  // We prefer options starting further right, as these discard less code
123
  // (e.g. we prefer to recover inner scopes rather than outer ones).
124
  // The options also need to agree on an endpoint, so the parser has a
125
  // consistent position afterwards.
126
  //
127
  // So conceptually we're sorting by the tuple (start, end), though we avoid
128
  // computing `end` for options that can't be winners.
129
130
  // Consider options starting further right first.
131
  // Don't drop the others yet though, we may still use them if preferred fails.
132
2.86M
  llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
133
2.86M
    return L.Position > R.Position;
134
2.86M
  });
135
136
  // We may find multiple winners, but they will have the same range.
137
9.53k
  std::optional<Token::Range> RecoveryRange;
138
9.53k
  std::vector<const PlaceholderRecovery *> BestOptions;
139
389k
  for (const PlaceholderRecovery &Option : Options) {
140
    // If this starts further left than options we've already found, then
141
    // we'll never find anything better. Skip computing End for the rest.
142
389k
    if (RecoveryRange && Option.Position < RecoveryRange->Begin)
143
0
      break;
144
145
389k
    auto End = findRecoveryEndpoint(Option.Strategy, Option.Position,
146
389k
                                    Params.Code, Lang);
147
    // Recovery may not take the parse backwards.
148
389k
    if (End == Token::Invalid || End < TokenIndex)
149
389k
      continue;
150
0
    if (RecoveryRange) {
151
      // If this is worse than our previous options, ignore it.
152
0
      if (RecoveryRange->End < End)
153
0
        continue;
154
      // If this is an improvement over our previous options, then drop them.
155
0
      if (RecoveryRange->End > End)
156
0
        BestOptions.clear();
157
0
    }
158
    // Create recovery nodes and heads for them in the GSS. These may be
159
    // discarded if a better recovery is later found, but this path isn't hot.
160
0
    RecoveryRange = {Option.Position, End};
161
0
    BestOptions.push_back(&Option);
162
0
  }
163
164
9.53k
  if (BestOptions.empty()) {
165
9.53k
    LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
166
9.53k
                            << " strategies\n");
167
9.53k
    return;
168
9.53k
  }
169
170
  // We've settled on a set of recovery options, so create their nodes and
171
  // advance the cursor.
172
0
  LLVM_DEBUG({
173
0
    llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
174
0
    for (const auto *Option : BestOptions)
175
0
      llvm::dbgs() << " " << Lang.G.symbolName(Option->Symbol);
176
0
    llvm::dbgs() << "\n";
177
0
  });
178
  // FIXME: in general, we might have the same Option->Symbol multiple times,
179
  // and we risk creating redundant Forest and GSS nodes.
180
  // We also may inadvertently set up the next glrReduce to create a sequence
181
  // node duplicating an opaque node that we're creating here.
182
  // There are various options, including simply breaking ties between options.
183
  // For now it's obscure enough to ignore.
184
0
  for (const PlaceholderRecovery *Option : BestOptions) {
185
0
    Option->RecoveryNode->Recovered = true;
186
0
    const ForestNode &Placeholder =
187
0
        Params.Forest.createOpaque(Option->Symbol, RecoveryRange->Begin);
188
0
    LRTable::StateID OldState = Option->RecoveryNode->State;
189
0
    LRTable::StateID NewState =
190
0
        isToken(Option->Symbol)
191
0
            ? *Lang.Table.getShiftState(OldState, Option->Symbol)
192
0
            : *Lang.Table.getGoToState(OldState, Option->Symbol);
193
0
    const GSS::Node *NewHead =
194
0
        Params.GSStack.addNode(NewState, &Placeholder, {Option->RecoveryNode});
195
0
    NewHeads.push_back(NewHead);
196
0
  }
197
0
  TokenIndex = RecoveryRange->End;
198
0
}
199
200
using StateID = LRTable::StateID;
201
202
0
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) {
203
0
  std::vector<std::string> ParentStates;
204
0
  for (const auto *Parent : N.parents())
205
0
    ParentStates.push_back(llvm::formatv("{0}", Parent->State));
206
0
  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {3}", N.State,
207
0
                      N.Payload ? N.Payload->symbol() : 0,
208
0
                      llvm::join(ParentStates, ", "));
209
0
  return OS;
210
0
}
211
212
// Apply all pending shift actions.
213
// In theory, LR parsing doesn't have shift/shift conflicts on a single head.
214
// But we may have multiple active heads, and each head has a shift action.
215
//
216
// We merge the stack -- if multiple heads will reach the same state after
217
// shifting a token, we shift only once by combining these heads.
218
//
219
// E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4:
220
//   0---1---2
221
//       └---3
222
// After the shift action, the GSS is:
223
//   0---1---2---4
224
//       └---3---┘
225
void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
226
              const ForestNode &NewTok, const ParseParams &Params,
227
3.30M
              const Language &Lang, std::vector<const GSS::Node *> &NewHeads) {
228
3.30M
  assert(NewTok.kind() == ForestNode::Terminal);
229
3.30M
  LLVM_DEBUG(llvm::dbgs() << llvm::formatv("  Shift {0} ({1} active heads):\n",
230
3.30M
                                           Lang.G.symbolName(NewTok.symbol()),
231
3.30M
                                           OldHeads.size()));
232
233
  // We group pending shifts by their target state so we can merge them.
234
3.30M
  llvm::SmallVector<std::pair<StateID, const GSS::Node *>, 8> Shifts;
235
3.30M
  for (const auto *H : OldHeads)
236
60.6M
    if (auto S = Lang.Table.getShiftState(H->State, NewTok.symbol()))
237
22.2M
      Shifts.push_back({*S, H});
238
3.30M
  llvm::stable_sort(Shifts, llvm::less_first{});
239
240
3.30M
  auto Rest = llvm::ArrayRef(Shifts);
241
3.30M
  llvm::SmallVector<const GSS::Node *> Parents;
242
7.57M
  while (!Rest.empty()) {
243
    // Collect the batch of PendingShift that have compatible shift states.
244
    // Their heads become TempParents, the parents of the new GSS node.
245
4.26M
    StateID NextState = Rest.front().first;
246
247
4.26M
    Parents.clear();
248
23.2M
    for (const auto &Base : Rest) {
249
23.2M
      if (Base.first != NextState)
250
970k
        break;
251
22.2M
      Parents.push_back(Base.second);
252
22.2M
    }
253
4.26M
    Rest = Rest.drop_front(Parents.size());
254
255
4.26M
    LLVM_DEBUG(llvm::dbgs() << llvm::formatv("    --> S{0} ({1} heads)\n",
256
4.26M
                                             NextState, Parents.size()));
257
4.26M
    NewHeads.push_back(Params.GSStack.addNode(NextState, &NewTok, Parents));
258
4.26M
  }
259
3.30M
}
260
261
namespace {
262
// A KeyedQueue yields pairs of keys and values in order of the keys.
263
template <typename Key, typename Value>
264
using KeyedQueue =
265
    std::priority_queue<std::pair<Key, Value>,
266
                        std::vector<std::pair<Key, Value>>, llvm::less_first>;
267
268
81.5M
template <typename T> void sortAndUnique(std::vector<T> &Vec) {
269
81.5M
  llvm::sort(Vec);
270
81.5M
  Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end());
271
81.5M
}
GLR.cpp:void clang::pseudo::(anonymous namespace)::sortAndUnique<std::__1::pair<unsigned short, clang::pseudo::(anonymous namespace)::GLRReduce::SequenceRef> >(std::__1::vector<std::__1::pair<unsigned short, clang::pseudo::(anonymous namespace)::GLRReduce::SequenceRef>, std::__1::allocator<std::__1::pair<unsigned short, clang::pseudo::(anonymous namespace)::GLRReduce::SequenceRef> > >&)
Line
Count
Source
268
40.7M
template <typename T> void sortAndUnique(std::vector<T> &Vec) {
269
40.7M
  llvm::sort(Vec);
270
40.7M
  Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end());
271
40.7M
}
GLR.cpp:void clang::pseudo::(anonymous namespace)::sortAndUnique<std::__1::pair<unsigned short, clang::pseudo::GSS::Node const*> >(std::__1::vector<std::__1::pair<unsigned short, clang::pseudo::GSS::Node const*>, std::__1::allocator<std::__1::pair<unsigned short, clang::pseudo::GSS::Node const*> > >&)
Line
Count
Source
268
40.7M
template <typename T> void sortAndUnique(std::vector<T> &Vec) {
269
40.7M
  llvm::sort(Vec);
270
40.7M
  Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end());
271
40.7M
}
272
273
// Perform reduces until no more are possible.
274
//
275
// Generally this means walking up from the heads gathering ForestNodes that
276
// will match the RHS of the rule we're reducing into a sequence ForestNode,
277
// and ending up at a base node.
278
// Then we push a new GSS node onto that base, taking care to:
279
//  - pack alternative sequence ForestNodes into an ambiguous ForestNode.
280
//  - use the same GSS node for multiple heads if the parse state matches.
281
//
282
// Examples of reduction:
283
//   Before (simple):
284
//     0--1(expr)--2(semi)
285
//   After reducing 2 by `stmt := expr semi`:
286
//     0--3(stmt)                // 3 is goto(0, stmt)
287
//
288
//   Before (splitting due to R/R conflict):
289
//     0--1(IDENTIFIER)
290
//   After reducing 1 by `class-name := IDENTIFIER` & `enum-name := IDENTIFIER`:
291
//     0--2(class-name)          // 2 is goto(0, class-name)
292
//     └--3(enum-name)           // 3 is goto(0, enum-name)
293
//
294
//   Before (splitting due to multiple bases):
295
//     0--2(class-name)--4(STAR)
296
//     └--3(enum-name)---┘
297
//   After reducing 4 by `ptr-operator := STAR`:
298
//     0--2(class-name)--5(ptr-operator)    // 5 is goto(2, ptr-operator)
299
//     └--3(enum-name)---6(ptr-operator)    // 6 is goto(3, ptr-operator)
300
//
301
//   Before (joining due to same goto state, multiple bases):
302
//     0--1(cv-qualifier)--3(class-name)
303
//     └--2(cv-qualifier)--4(enum-name)
304
//   After reducing 3 by `type-name := class-name` and
305
//                  4 by `type-name := enum-name`:
306
//     0--1(cv-qualifier)--5(type-name)  // 5 is goto(1, type-name) and
307
//     └--2(cv-qualifier)--┘             //      goto(2, type-name)
308
//
309
//   Before (joining due to same goto state, the same base):
310
//     0--1(class-name)--3(STAR)
311
//     └--2(enum-name)--4(STAR)
312
//   After reducing 3 by `pointer := class-name STAR` and
313
//                  2 by`enum-name := class-name STAR`:
314
//     0--5(pointer)       // 5 is goto(0, pointer)
315
//
316
// (This is a functor rather than a function to allow it to reuse scratch
317
// storage across calls).
318
class GLRReduce {
319
  const ParseParams &Params;
320
  const Language& Lang;
321
  // There are two interacting complications:
322
  // 1.  Performing one reduce can unlock new reduces on the newly-created head.
323
  // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes).
324
  //     This means we must have unlocked all the reduces that contribute to it.
325
  // 2b. Similarly, the new GSS nodes must be complete (have all parents).
326
  //
327
  // We define a "family" of reduces as those that produce the same symbol and
328
  // cover the same range of tokens. These are exactly the set of reductions
329
  // whose sequence nodes would be covered by the same ambiguous node.
330
  // We wish to process a whole family at a time (to satisfy complication 2),
331
  // and can address complication 1 by carefully ordering the families:
332
  // - Process families covering fewer tokens first.
333
  //   A reduce can't depend on a longer reduce!
334
  // - For equal token ranges: if S := T, process T families before S families.
335
  //   Parsing T can't depend on an equal-length S, as the grammar is acyclic.
336
  //
337
  // This isn't quite enough: we don't know the token length of the reduction
338
  // until we walk up the stack to perform the pop.
339
  // So we perform the pop part upfront, and place the push specification on
340
  // priority queues such that we can retrieve a family at a time.
341
342
  // A reduction family is characterized by its token range and symbol produced.
343
  // It is used as a key in the priority queues to group pushes by family.
344
  struct Family {
345
    // The start of the token range of the reduce.
346
    Token::Index Start;
347
    SymbolID Symbol;
348
    // Rule must produce Symbol and can otherwise be arbitrary.
349
    // RuleIDs have the topological order based on the acyclic grammar.
350
    // FIXME: should SymbolIDs be so ordered instead?
351
    RuleID Rule;
352
353
304M
    bool operator==(const Family &Other) const {
354
304M
      return Start == Other.Start && Symbol == Other.Symbol;
355
304M
    }
356
    // The larger Family is the one that should be processed first.
357
1.52G
    bool operator<(const Family &Other) const {
358
1.52G
      if (Start != Other.Start)
359
1.19G
        return Start < Other.Start;
360
331M
      if (Symbol != Other.Symbol)
361
105M
        return Rule > Other.Rule;
362
226M
      assert(*this == Other);
363
0
      return false;
364
331M
    }
365
  };
366
367
  // A sequence is the ForestNode payloads of the GSS nodes we are reducing.
368
  using Sequence = llvm::SmallVector<const ForestNode *, Rule::MaxElements>;
369
  // Like ArrayRef<const ForestNode*>, but with the missing operator<.
370
  // (Sequences are big to move by value as the collections gets rearranged).
371
  struct SequenceRef {
372
83.1M
    SequenceRef(const Sequence &S) : S(S) {}
373
    llvm::ArrayRef<const ForestNode *> S;
374
40.8M
    friend bool operator==(SequenceRef A, SequenceRef B) { return A.S == B.S; }
375
269M
    friend bool operator<(const SequenceRef &A, const SequenceRef &B) {
376
269M
      return std::lexicographical_compare(A.S.begin(), A.S.end(), B.S.begin(),
377
269M
                                          B.S.end());
378
269M
    }
379
  };
380
  // Underlying storage for sequences pointed to by stored SequenceRefs.
381
  std::deque<Sequence> SequenceStorage;
382
  // We don't actually destroy the sequences between calls, to reuse storage.
383
  // Everything SequenceStorage[ >=SequenceStorageCount ] is reusable scratch.
384
  unsigned SequenceStorageCount;
385
386
  // Halfway through a reduction (after the pop, before the push), we have
387
  // collected nodes for the RHS of a rule, and reached a base node.
388
  // They specify a sequence ForestNode we may build (but we dedup first).
389
  // (The RuleID is not stored here, but rather in the Family).
390
  struct PushSpec {
391
    // The last node popped before pushing. Its parent is the reduction base(s).
392
    // (Base is more fundamental, but this is cheaper to store).
393
    const GSS::Node* LastPop = nullptr;
394
    Sequence *Seq = nullptr;
395
  };
396
  KeyedQueue<Family, PushSpec> Sequences; // FIXME: rename => PendingPushes?
397
398
  // We treat Heads as a queue of Pop operations still to be performed.
399
  // PoppedHeads is our position within it.
400
  std::vector<const GSS::Node *> *Heads;
401
  unsigned NextPopHead;
402
  SymbolID Lookahead;
403
404
  Sequence TempSequence;
405
public:
406
  GLRReduce(const ParseParams &Params, const Language &Lang)
407
9.56k
      : Params(Params), Lang(Lang) {}
408
409
  // Reduce Heads, resulting in new nodes that are appended to Heads.
410
  // The "consumed" nodes are not removed!
411
  // Only reduce rules compatible with the Lookahead are applied, though
412
  // tokenSymbol(tok::unknown) will match any rule.
413
3.30M
  void operator()(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead) {
414
3.30M
    assert(isToken(Lookahead));
415
416
0
    NextPopHead = 0;
417
3.30M
    this->Heads = &Heads;
418
3.30M
    this->Lookahead = Lookahead;
419
3.30M
    assert(Sequences.empty());
420
0
    SequenceStorageCount = 0;
421
422
3.30M
    popPending();
423
44.0M
    while (!Sequences.empty()) {
424
40.7M
      pushNext();
425
40.7M
      popPending();
426
40.7M
    }
427
3.30M
  }
428
429
private:
430
  bool canReduce(const Rule &R, RuleID RID,
431
104M
                 llvm::ArrayRef<const ForestNode *> RHS) const {
432
104M
    if (!R.Guarded)
433
94.6M
      return true;
434
9.42M
    if (auto Guard = Lang.Guards.lookup(RID))
435
9.42M
      return Guard({RHS, Params.Code, Lookahead});
436
0
    LLVM_DEBUG(llvm::dbgs()
437
0
               << llvm::formatv("missing guard implementation for rule {0}\n",
438
0
                                Lang.G.dumpRule(RID)));
439
0
    return true;
440
9.42M
  }
441
  // pop walks up the parent chain(s) for a reduction from Head by to Rule.
442
  // Once we reach the end, record the bases and sequences.
443
33.0M
  void pop(const GSS::Node *Head, RuleID RID, const Rule &Rule) {
444
33.0M
    LLVM_DEBUG(llvm::dbgs() << "  Pop " << Lang.G.dumpRule(RID) << "\n");
445
33.0M
    Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID};
446
33.0M
    TempSequence.resize_for_overwrite(Rule.Size);
447
101M
    auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) {
448
101M
      TempSequence[Rule.Size - 1 - I] = N->Payload;
449
101M
      if (I + 1 == Rule.Size) {
450
91.2M
        F.Start = TempSequence.front()->startTokenIndex();
451
91.2M
        LLVM_DEBUG({
452
91.2M
          for (const auto *B : N->parents())
453
91.2M
            llvm::dbgs() << "    --> base at S" << B->State << "\n";
454
91.2M
        });
455
91.2M
        if (!canReduce(Rule, RID, TempSequence))
456
8.07M
          return;
457
        // Copy the chain to stable storage so it can be enqueued.
458
83.1M
        if (SequenceStorageCount == SequenceStorage.size())
459
8.37M
          SequenceStorage.emplace_back();
460
83.1M
        SequenceStorage[SequenceStorageCount] = TempSequence;
461
83.1M
        Sequence *Seq = &SequenceStorage[SequenceStorageCount++];
462
463
83.1M
        Sequences.emplace(F, PushSpec{N, Seq});
464
83.1M
        return;
465
91.2M
      }
466
10.6M
      for (const GSS::Node *Parent : N->parents())
467
68.7M
        DFS(Parent, I + 1, DFS);
468
10.6M
    };
469
33.0M
    DFS(Head, 0, DFS);
470
33.0M
  }
471
472
  // popPending pops every available reduction.
473
44.0M
  void popPending() {
474
105M
    for (; NextPopHead < Heads->size(); ++NextPopHead) {
475
      // In trivial cases, we perform the complete reduce here!
476
61.7M
      if (popAndPushTrivial())
477
16.1M
        continue;
478
45.5M
      for (RuleID RID :
479
46.2M
           Lang.Table.getReduceRules((*Heads)[NextPopHead]->State)) {
480
46.2M
        const auto &Rule = Lang.G.lookupRule(RID);
481
46.2M
        if (Lang.Table.canFollow(Rule.Target, Lookahead))
482
33.0M
          pop((*Heads)[NextPopHead], RID, Rule);
483
46.2M
      }
484
45.5M
    }
485
44.0M
  }
486
487
  // Storage reused by each call to pushNext.
488
  std::vector<std::pair</*Goto*/ StateID, const GSS::Node *>> FamilyBases;
489
  std::vector<std::pair<RuleID, SequenceRef>> FamilySequences;
490
  std::vector<const GSS::Node *> Parents;
491
  std::vector<const ForestNode *> SequenceNodes;
492
493
  // Process one push family, forming a forest node.
494
  // This produces new GSS heads which may enable more pops.
495
40.7M
  void pushNext() {
496
40.7M
    assert(!Sequences.empty());
497
0
    Family F = Sequences.top().first;
498
499
40.7M
    LLVM_DEBUG(llvm::dbgs() << "  Push " << Lang.G.symbolName(F.Symbol)
500
40.7M
                            << " from token " << F.Start << "\n");
501
502
    // Grab the sequences and bases for this family.
503
    // We don't care which rule yielded each base. If Family.Symbol is S, the
504
    // base includes an item X := ... • S ... and since the grammar is
505
    // context-free, *all* parses of S are valid here.
506
40.7M
    FamilySequences.clear();
507
40.7M
    FamilyBases.clear();
508
83.1M
    do {
509
83.1M
      const PushSpec &Push = Sequences.top().second;
510
83.1M
      FamilySequences.emplace_back(Sequences.top().first.Rule, *Push.Seq);
511
263M
      for (const GSS::Node *Base : Push.LastPop->parents()) {
512
263M
        auto NextState = Lang.Table.getGoToState(Base->State, F.Symbol);
513
263M
        assert(NextState.has_value() && "goto must succeed after reduce!");
514
0
        FamilyBases.emplace_back(*NextState, Base);
515
263M
      }
516
517
83.1M
      Sequences.pop();
518
83.1M
    } while (!Sequences.empty() && Sequences.top().first == F);
519
    // Build a forest node for each unique sequence.
520
40.7M
    sortAndUnique(FamilySequences);
521
40.7M
    SequenceNodes.clear();
522
40.7M
    for (const auto &SequenceSpec : FamilySequences)
523
75.9M
      SequenceNodes.push_back(&Params.Forest.createSequence(
524
75.9M
          F.Symbol, SequenceSpec.first, SequenceSpec.second.S));
525
    // Wrap in an ambiguous node if needed.
526
40.7M
    const ForestNode *Parsed =
527
40.7M
        SequenceNodes.size() == 1
528
40.7M
            ? SequenceNodes.front()
529
40.7M
            : &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes);
530
40.7M
    LLVM_DEBUG(llvm::dbgs() << "    --> " << Parsed->dump(Lang.G) << "\n");
531
532
    // Bases for this family, deduplicate them, and group by the goTo State.
533
40.7M
    sortAndUnique(FamilyBases);
534
    // Create a GSS node for each unique goto state.
535
40.7M
    llvm::ArrayRef<decltype(FamilyBases)::value_type> BasesLeft = FamilyBases;
536
85.3M
    while (!BasesLeft.empty()) {
537
44.5M
      StateID NextState = BasesLeft.front().first;
538
44.5M
      Parents.clear();
539
143M
      for (const auto &Base : BasesLeft) {
540
143M
        if (Base.first != NextState)
541
3.82M
          break;
542
139M
        Parents.push_back(Base.second);
543
139M
      }
544
44.5M
      BasesLeft = BasesLeft.drop_front(Parents.size());
545
44.5M
      Heads->push_back(Params.GSStack.addNode(NextState, Parsed, Parents));
546
44.5M
    }
547
40.7M
  }
548
549
  // In general we split a reduce into a pop/push, so concurrently-available
550
  // reductions can run in the correct order. The data structures are expensive.
551
  //
552
  // When only one reduction is possible at a time, we can skip this:
553
  // we pop and immediately push, as an LR parser (as opposed to GLR) would.
554
  // This is valid whenever there's only one concurrent PushSpec.
555
  //
556
  // This function handles a trivial but common subset of these cases:
557
  //  - there must be no pending pushes, and only one poppable head
558
  //  - the head must have only one reduction rule
559
  //  - the reduction path must be a straight line (no multiple parents)
560
  // (Roughly this means there's no local ambiguity, so the LR algorithm works).
561
  //
562
  // Returns true if we successfully consumed the next unpopped head.
563
61.7M
  bool popAndPushTrivial() {
564
61.7M
    if (!Sequences.empty() || Heads->size() != NextPopHead + 1)
565
40.7M
      return false;
566
20.9M
    const GSS::Node *Head = Heads->back();
567
20.9M
    std::optional<RuleID> RID;
568
20.9M
    for (RuleID R : Lang.Table.getReduceRules(Head->State)) {
569
20.2M
      if (RID.has_value())
570
1.51M
        return false;
571
18.7M
      RID = R;
572
18.7M
    }
573
19.4M
    if (!RID)
574
2.22M
      return true; // no reductions available, but we've processed the head!
575
17.2M
    const auto &Rule = Lang.G.lookupRule(*RID);
576
17.2M
    if (!Lang.Table.canFollow(Rule.Target, Lookahead))
577
1.08M
      return true; // reduction is not available
578
16.1M
    const GSS::Node *Base = Head;
579
16.1M
    TempSequence.resize_for_overwrite(Rule.Size);
580
30.5M
    for (unsigned I = 0; I < Rule.Size; ++I) {
581
17.7M
      if (Base->parents().size() != 1)
582
3.28M
        return false;
583
14.4M
      TempSequence[Rule.Size - 1 - I] = Base->Payload;
584
14.4M
      Base = Base->parents().front();
585
14.4M
    }
586
12.8M
    if (!canReduce(Rule, *RID, TempSequence))
587
899
      return true; // reduction is not available
588
12.8M
    const ForestNode *Parsed =
589
12.8M
        &Params.Forest.createSequence(Rule.Target, *RID, TempSequence);
590
12.8M
    auto NextState = Lang.Table.getGoToState(Base->State, Rule.Target);
591
12.8M
    assert(NextState.has_value() && "goto must succeed after reduce!");
592
0
    Heads->push_back(Params.GSStack.addNode(*NextState, Parsed, {Base}));
593
12.8M
    LLVM_DEBUG(llvm::dbgs()
594
12.8M
               << "  Reduce (trivial) " << Lang.G.dumpRule(*RID) << "\n"
595
12.8M
               << "    --> S" << Heads->back()->State << "\n");
596
12.8M
    return true;
597
12.8M
  }
598
};
599
600
} // namespace
601
602
ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
603
9.56k
                     const Language &Lang) {
604
9.56k
  GLRReduce Reduce(Params, Lang);
605
9.56k
  assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal");
606
0
  llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Params.Code);
607
9.56k
  auto &GSS = Params.GSStack;
608
609
9.56k
  StateID StartState = Lang.Table.getStartState(StartSymbol);
610
  // Heads correspond to the parse of tokens [0, I), NextHeads to [0, I+1).
611
9.56k
  std::vector<const GSS::Node *> Heads = {GSS.addNode(/*State=*/StartState,
612
9.56k
                                                      /*ForestNode=*/nullptr,
613
9.56k
                                                      {})};
614
  // Invariant: Heads is partitioned by source: {shifted | reduced}.
615
  // HeadsPartition is the index of the first head formed by reduction.
616
  // We use this to discard and recreate the reduced heads during recovery.
617
9.56k
  unsigned HeadsPartition = Heads.size();
618
9.56k
  std::vector<const GSS::Node *> NextHeads;
619
3.29M
  auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable {
620
3.29M
    assert(NextHeads.empty() && "Running GC at the wrong time!");
621
3.29M
    if (++I != 20) // Run periodically to balance CPU and memory usage.
622
3.13M
      return;
623
163k
    I = 0;
624
625
    // We need to copy the list: Roots is consumed by the GC.
626
163k
    Roots = Heads;
627
163k
    GSS.gc(std::move(Roots));
628
163k
  };
629
  // Each iteration fully processes a single token.
630
3.30M
  for (unsigned I = 0; I < Terminals.size();) {
631
3.30M
    LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
632
3.30M
                   "Next token {0} (id={1})\n",
633
3.30M
                  Lang.G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
634
    // Consume the token.
635
3.30M
    glrShift(Heads, Terminals[I], Params, Lang, NextHeads);
636
637
    // If we weren't able to consume the token, try to skip over some tokens
638
    // so we can keep parsing.
639
3.30M
    if (NextHeads.empty()) {
640
      // The reduction in the previous round was constrained by lookahead.
641
      // On valid code this only rejects dead ends, but on broken code we should
642
      // consider all possibilities.
643
      //
644
      // We discard all heads formed by reduction, and recreate them without
645
      // this constraint. This may duplicate some nodes, but it's rare.
646
9.53k
      LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. "
647
9.53k
                                 "Re-reducing without lookahead.\n");
648
9.53k
      Heads.resize(HeadsPartition);
649
9.53k
      Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown));
650
651
9.53k
      glrRecover(Heads, I, Params, Lang, NextHeads);
652
9.53k
      if (NextHeads.empty())
653
        // FIXME: Ensure the `_ := start-symbol` rules have a fallback
654
        // error-recovery strategy attached. Then this condition can't happen.
655
9.53k
        return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
656
9.53k
    } else
657
3.29M
      ++I;
658
659
    // Form nonterminals containing the token we just consumed.
660
3.29M
    SymbolID Lookahead =
661
3.29M
        I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
662
3.29M
    HeadsPartition = NextHeads.size();
663
3.29M
    Reduce(NextHeads, Lookahead);
664
    // Prepare for the next token.
665
3.29M
    std::swap(Heads, NextHeads);
666
3.29M
    NextHeads.clear();
667
3.29M
    MaybeGC();
668
3.29M
  }
669
35
  LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
670
671
  // The parse was successful if in state `_ := start-symbol EOF .`
672
  // The GSS parent has `_ := start-symbol . EOF`; its payload is the parse.
673
35
  auto AfterStart = Lang.Table.getGoToState(StartState, StartSymbol);
674
35
  assert(AfterStart.has_value() && "goto must succeed after start symbol!");
675
0
  auto Accept = Lang.Table.getShiftState(*AfterStart, tokenSymbol(tok::eof));
676
35
  assert(Accept.has_value() && "shift EOF must succeed!");
677
35
  auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
678
35
    const ForestNode *Result = nullptr;
679
35
    for (const auto *Head : Heads) {
680
35
      if (Head->State == *Accept) {
681
35
        assert(Head->Payload->symbol() == tokenSymbol(tok::eof));
682
0
        assert(Result == nullptr && "multiple results!");
683
0
        Result = Head->parents().front()->Payload;
684
35
        assert(Result->symbol() == StartSymbol);
685
35
      }
686
35
    }
687
35
    return Result;
688
35
  };
689
35
  if (auto *Result = SearchForAccept(Heads))
690
35
    return *const_cast<ForestNode *>(Result); // Safe: we created all nodes.
691
  // We failed to parse the input, returning an opaque forest node for recovery.
692
  // FIXME: as above, we can add fallback error handling so this is impossible.
693
0
  return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
694
35
}
695
696
void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
697
0
               const ParseParams &Params, const Language &Lang) {
698
  // Create a new GLRReduce each time for tests, performance doesn't matter.
699
0
  GLRReduce{Params, Lang}(Heads, Lookahead);
700
0
}
701
702
const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
703
61.7M
                              llvm::ArrayRef<const Node *> Parents) {
704
61.7M
  Node *Result = new (allocate(Parents.size())) Node();
705
61.7M
  Result->State = State;
706
61.7M
  Result->GCParity = GCParity;
707
61.7M
  Result->ParentCount = Parents.size();
708
61.7M
  Alive.push_back(Result);
709
61.7M
  ++NodesCreated;
710
61.7M
  Result->Payload = Symbol;
711
61.7M
  if (!Parents.empty())
712
61.7M
    llvm::copy(Parents, reinterpret_cast<const Node **>(Result + 1));
713
61.7M
  return Result;
714
61.7M
}
715
716
61.7M
GSS::Node *GSS::allocate(unsigned Parents) {
717
61.7M
  if (FreeList.size() <= Parents)
718
67.8k
    FreeList.resize(Parents + 1);
719
61.7M
  auto &SizedList = FreeList[Parents];
720
61.7M
  if (!SizedList.empty()) {
721
44.1M
    auto *Result = SizedList.back();
722
44.1M
    SizedList.pop_back();
723
44.1M
    return Result;
724
44.1M
  }
725
17.5M
  return static_cast<Node *>(
726
17.5M
      Arena.Allocate(sizeof(Node) + Parents * sizeof(Node *), alignof(Node)));
727
61.7M
}
728
729
46.1M
void GSS::destroy(Node *N) {
730
46.1M
  unsigned ParentCount = N->ParentCount;
731
46.1M
  N->~Node();
732
46.1M
  assert(FreeList.size() > ParentCount && "established on construction!");
733
0
  FreeList[ParentCount].push_back(N);
734
46.1M
}
735
736
163k
unsigned GSS::gc(std::vector<const Node *> &&Queue) {
737
163k
#ifndef NDEBUG
738
770M
  auto ParityMatches = [&](const Node *N) { return N->GCParity == GCParity; };
739
163k
  assert("Before GC" && llvm::all_of(Alive, ParityMatches));
740
0
  auto Deferred = llvm::make_scope_exit(
741
163k
      [&] { assert("After GC" && llvm::all_of(Alive, ParityMatches)); });
742
163k
  assert(llvm::all_of(
743
163k
      Queue, [&](const Node *R) { return llvm::is_contained(Alive, R); }));
744
0
#endif
745
0
  unsigned InitialCount = Alive.size();
746
747
  // Mark
748
163k
  GCParity = !GCParity;
749
730M
  while (!Queue.empty()) {
750
730M
    Node *N = const_cast<Node *>(Queue.back()); // Safe: we created these nodes.
751
730M
    Queue.pop_back();
752
730M
    if (N->GCParity != GCParity) { // Not seen yet
753
362M
      N->GCParity = GCParity;      // Mark as seen
754
362M
      for (const Node *P : N->parents()) // And walk parents
755
727M
        Queue.push_back(P);
756
362M
    }
757
730M
  }
758
  // Sweep
759
408M
  llvm::erase_if(Alive, [&](Node *N) {
760
408M
    if (N->GCParity == GCParity) // Walk reached this node.
761
362M
      return false;
762
46.1M
    destroy(N);
763
46.1M
    return true;
764
408M
  });
765
766
163k
  LLVM_DEBUG(llvm::dbgs() << "GC pruned " << (InitialCount - Alive.size())
767
163k
                          << "/" << InitialCount << " GSS nodes\n");
768
163k
  return InitialCount - Alive.size();
769
163k
}
770
771
} // namespace pseudo
772
} // namespace clang