Coverage Report

Created: 2024-01-17 10:31

/src/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This Pass handles loop interchange transform.
10
// This pass interchanges loops to provide a more cache-friendly memory access
11
// patterns.
12
//
13
//===----------------------------------------------------------------------===//
14
15
#include "llvm/Transforms/Scalar/LoopInterchange.h"
16
#include "llvm/ADT/STLExtras.h"
17
#include "llvm/ADT/SmallVector.h"
18
#include "llvm/ADT/Statistic.h"
19
#include "llvm/ADT/StringRef.h"
20
#include "llvm/Analysis/DependenceAnalysis.h"
21
#include "llvm/Analysis/LoopCacheAnalysis.h"
22
#include "llvm/Analysis/LoopInfo.h"
23
#include "llvm/Analysis/LoopNestAnalysis.h"
24
#include "llvm/Analysis/LoopPass.h"
25
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
26
#include "llvm/Analysis/ScalarEvolution.h"
27
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
28
#include "llvm/IR/BasicBlock.h"
29
#include "llvm/IR/Constants.h"
30
#include "llvm/IR/DiagnosticInfo.h"
31
#include "llvm/IR/Dominators.h"
32
#include "llvm/IR/Function.h"
33
#include "llvm/IR/InstrTypes.h"
34
#include "llvm/IR/Instruction.h"
35
#include "llvm/IR/Instructions.h"
36
#include "llvm/IR/User.h"
37
#include "llvm/IR/Value.h"
38
#include "llvm/Support/Casting.h"
39
#include "llvm/Support/CommandLine.h"
40
#include "llvm/Support/Debug.h"
41
#include "llvm/Support/ErrorHandling.h"
42
#include "llvm/Support/raw_ostream.h"
43
#include "llvm/Transforms/Scalar/LoopPassManager.h"
44
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
45
#include "llvm/Transforms/Utils/LoopUtils.h"
46
#include <cassert>
47
#include <utility>
48
#include <vector>
49
50
using namespace llvm;
51
52
0
#define DEBUG_TYPE "loop-interchange"
53
54
STATISTIC(LoopsInterchanged, "Number of loops interchanged");
55
56
static cl::opt<int> LoopInterchangeCostThreshold(
57
    "loop-interchange-threshold", cl::init(0), cl::Hidden,
58
    cl::desc("Interchange if you gain more than this number"));
59
60
namespace {
61
62
using LoopVector = SmallVector<Loop *, 8>;
63
64
// TODO: Check if we can use a sparse matrix here.
65
using CharMatrix = std::vector<std::vector<char>>;
66
67
} // end anonymous namespace
68
69
// Maximum number of dependencies that can be handled in the dependency matrix.
70
static const unsigned MaxMemInstrCount = 100;
71
72
// Maximum loop depth supported.
73
static const unsigned MaxLoopNestDepth = 10;
74
75
#ifdef DUMP_DEP_MATRICIES
76
static void printDepMatrix(CharMatrix &DepMatrix) {
77
  for (auto &Row : DepMatrix) {
78
    for (auto D : Row)
79
      LLVM_DEBUG(dbgs() << D << " ");
80
    LLVM_DEBUG(dbgs() << "\n");
81
  }
82
}
83
#endif
84
85
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
86
                                     Loop *L, DependenceInfo *DI,
87
0
                                     ScalarEvolution *SE) {
88
0
  using ValueVector = SmallVector<Value *, 16>;
89
90
0
  ValueVector MemInstr;
91
92
  // For each block.
93
0
  for (BasicBlock *BB : L->blocks()) {
94
    // Scan the BB and collect legal loads and stores.
95
0
    for (Instruction &I : *BB) {
96
0
      if (!isa<Instruction>(I))
97
0
        return false;
98
0
      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
99
0
        if (!Ld->isSimple())
100
0
          return false;
101
0
        MemInstr.push_back(&I);
102
0
      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
103
0
        if (!St->isSimple())
104
0
          return false;
105
0
        MemInstr.push_back(&I);
106
0
      }
107
0
    }
108
0
  }
109
110
0
  LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
111
0
                    << " Loads and Stores to analyze\n");
112
113
0
  ValueVector::iterator I, IE, J, JE;
114
115
0
  for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
116
0
    for (J = I, JE = MemInstr.end(); J != JE; ++J) {
117
0
      std::vector<char> Dep;
118
0
      Instruction *Src = cast<Instruction>(*I);
119
0
      Instruction *Dst = cast<Instruction>(*J);
120
      // Ignore Input dependencies.
121
0
      if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
122
0
        continue;
123
      // Track Output, Flow, and Anti dependencies.
124
0
      if (auto D = DI->depends(Src, Dst, true)) {
125
0
        assert(D->isOrdered() && "Expected an output, flow or anti dep.");
126
        // If the direction vector is negative, normalize it to
127
        // make it non-negative.
128
0
        if (D->normalize(SE))
129
0
          LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
130
0
        LLVM_DEBUG(StringRef DepType =
131
0
                       D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
132
0
                   dbgs() << "Found " << DepType
133
0
                          << " dependency between Src and Dst\n"
134
0
                          << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
135
0
        unsigned Levels = D->getLevels();
136
0
        char Direction;
137
0
        for (unsigned II = 1; II <= Levels; ++II) {
138
0
          if (D->isScalar(II)) {
139
0
            Direction = 'S';
140
0
            Dep.push_back(Direction);
141
0
          } else {
142
0
            unsigned Dir = D->getDirection(II);
143
0
            if (Dir == Dependence::DVEntry::LT ||
144
0
                Dir == Dependence::DVEntry::LE)
145
0
              Direction = '<';
146
0
            else if (Dir == Dependence::DVEntry::GT ||
147
0
                     Dir == Dependence::DVEntry::GE)
148
0
              Direction = '>';
149
0
            else if (Dir == Dependence::DVEntry::EQ)
150
0
              Direction = '=';
151
0
            else
152
0
              Direction = '*';
153
0
            Dep.push_back(Direction);
154
0
          }
155
0
        }
156
0
        while (Dep.size() != Level) {
157
0
          Dep.push_back('I');
158
0
        }
159
160
0
        DepMatrix.push_back(Dep);
161
0
        if (DepMatrix.size() > MaxMemInstrCount) {
162
0
          LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
163
0
                            << " dependencies inside loop\n");
164
0
          return false;
165
0
        }
166
0
      }
167
0
    }
168
0
  }
169
170
0
  return true;
171
0
}
172
173
// A loop is moved from index 'from' to an index 'to'. Update the Dependence
174
// matrix by exchanging the two columns.
175
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
176
0
                                    unsigned ToIndx) {
177
0
  for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I)
178
0
    std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]);
179
0
}
180
181
// After interchanging, check if the direction vector is valid.
182
// [Theorem] A permutation of the loops in a perfect nest is legal if and only
183
// if the direction matrix, after the same permutation is applied to its
184
// columns, has no ">" direction as the leftmost non-"=" direction in any row.
185
0
static bool isLexicographicallyPositive(std::vector<char> &DV) {
186
0
  for (unsigned char Direction : DV) {
187
0
    if (Direction == '<')
188
0
      return true;
189
0
    if (Direction == '>' || Direction == '*')
190
0
      return false;
191
0
  }
192
0
  return true;
193
0
}
194
195
// Checks if it is legal to interchange 2 loops.
196
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
197
                                      unsigned InnerLoopId,
198
0
                                      unsigned OuterLoopId) {
199
0
  unsigned NumRows = DepMatrix.size();
200
0
  std::vector<char> Cur;
201
  // For each row check if it is valid to interchange.
202
0
  for (unsigned Row = 0; Row < NumRows; ++Row) {
203
    // Create temporary DepVector check its lexicographical order
204
    // before and after swapping OuterLoop vs InnerLoop
205
0
    Cur = DepMatrix[Row];
206
0
    if (!isLexicographicallyPositive(Cur))
207
0
      return false;
208
0
    std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
209
0
    if (!isLexicographicallyPositive(Cur))
210
0
      return false;
211
0
  }
212
0
  return true;
213
0
}
214
215
0
static void populateWorklist(Loop &L, LoopVector &LoopList) {
216
0
  LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
217
0
                    << L.getHeader()->getParent()->getName() << " Loop: %"
218
0
                    << L.getHeader()->getName() << '\n');
219
0
  assert(LoopList.empty() && "LoopList should initially be empty!");
220
0
  Loop *CurrentLoop = &L;
221
0
  const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
222
0
  while (!Vec->empty()) {
223
0
    // The current loop has multiple subloops in it hence it is not tightly
224
0
    // nested.
225
0
    // Discard all loops above it added into Worklist.
226
0
    if (Vec->size() != 1) {
227
0
      LoopList = {};
228
0
      return;
229
0
    }
230
0
231
0
    LoopList.push_back(CurrentLoop);
232
0
    CurrentLoop = Vec->front();
233
0
    Vec = &CurrentLoop->getSubLoops();
234
0
  }
235
0
  LoopList.push_back(CurrentLoop);
236
0
}
237
238
namespace {
239
240
/// LoopInterchangeLegality checks if it is legal to interchange the loop.
241
class LoopInterchangeLegality {
242
public:
243
  LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
244
                          OptimizationRemarkEmitter *ORE)
245
0
      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
246
247
  /// Check if the loops can be interchanged.
248
  bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
249
                           CharMatrix &DepMatrix);
250
251
  /// Discover induction PHIs in the header of \p L. Induction
252
  /// PHIs are added to \p Inductions.
253
  bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
254
255
  /// Check if the loop structure is understood. We do not handle triangular
256
  /// loops for now.
257
  bool isLoopStructureUnderstood();
258
259
  bool currentLimitations();
260
261
0
  const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
262
0
    return OuterInnerReductions;
263
0
  }
264
265
0
  const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const {
266
0
    return InnerLoopInductions;
267
0
  }
268
269
private:
270
  bool tightlyNested(Loop *Outer, Loop *Inner);
271
  bool containsUnsafeInstructions(BasicBlock *BB);
272
273
  /// Discover induction and reduction PHIs in the header of \p L. Induction
274
  /// PHIs are added to \p Inductions, reductions are added to
275
  /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
276
  /// to be passed as \p InnerLoop.
277
  bool findInductionAndReductions(Loop *L,
278
                                  SmallVector<PHINode *, 8> &Inductions,
279
                                  Loop *InnerLoop);
280
281
  Loop *OuterLoop;
282
  Loop *InnerLoop;
283
284
  ScalarEvolution *SE;
285
286
  /// Interface to emit optimization remarks.
287
  OptimizationRemarkEmitter *ORE;
288
289
  /// Set of reduction PHIs taking part of a reduction across the inner and
290
  /// outer loop.
291
  SmallPtrSet<PHINode *, 4> OuterInnerReductions;
292
293
  /// Set of inner loop induction PHIs
294
  SmallVector<PHINode *, 8> InnerLoopInductions;
295
};
296
297
/// LoopInterchangeProfitability checks if it is profitable to interchange the
298
/// loop.
299
class LoopInterchangeProfitability {
300
public:
301
  LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
302
                               OptimizationRemarkEmitter *ORE)
303
0
      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
304
305
  /// Check if the loop interchange is profitable.
306
  bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
307
                    unsigned InnerLoopId, unsigned OuterLoopId,
308
                    CharMatrix &DepMatrix,
309
                    const DenseMap<const Loop *, unsigned> &CostMap,
310
                    std::unique_ptr<CacheCost> &CC);
311
312
private:
313
  int getInstrOrderCost();
314
  std::optional<bool> isProfitablePerLoopCacheAnalysis(
315
      const DenseMap<const Loop *, unsigned> &CostMap,
316
      std::unique_ptr<CacheCost> &CC);
317
  std::optional<bool> isProfitablePerInstrOrderCost();
318
  std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
319
                                                   unsigned OuterLoopId,
320
                                                   CharMatrix &DepMatrix);
321
  Loop *OuterLoop;
322
  Loop *InnerLoop;
323
324
  /// Scev analysis.
325
  ScalarEvolution *SE;
326
327
  /// Interface to emit optimization remarks.
328
  OptimizationRemarkEmitter *ORE;
329
};
330
331
/// LoopInterchangeTransform interchanges the loop.
332
class LoopInterchangeTransform {
333
public:
334
  LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
335
                           LoopInfo *LI, DominatorTree *DT,
336
                           const LoopInterchangeLegality &LIL)
337
0
      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
338
339
  /// Interchange OuterLoop and InnerLoop.
340
  bool transform();
341
  void restructureLoops(Loop *NewInner, Loop *NewOuter,
342
                        BasicBlock *OrigInnerPreHeader,
343
                        BasicBlock *OrigOuterPreHeader);
344
  void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
345
346
private:
347
  bool adjustLoopLinks();
348
  bool adjustLoopBranches();
349
350
  Loop *OuterLoop;
351
  Loop *InnerLoop;
352
353
  /// Scev analysis.
354
  ScalarEvolution *SE;
355
356
  LoopInfo *LI;
357
  DominatorTree *DT;
358
359
  const LoopInterchangeLegality &LIL;
360
};
361
362
struct LoopInterchange {
363
  ScalarEvolution *SE = nullptr;
364
  LoopInfo *LI = nullptr;
365
  DependenceInfo *DI = nullptr;
366
  DominatorTree *DT = nullptr;
367
  std::unique_ptr<CacheCost> CC = nullptr;
368
369
  /// Interface to emit optimization remarks.
370
  OptimizationRemarkEmitter *ORE;
371
372
  LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
373
                  DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
374
                  OptimizationRemarkEmitter *ORE)
375
0
      : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
376
377
0
  bool run(Loop *L) {
378
0
    if (L->getParentLoop())
379
0
      return false;
380
0
    SmallVector<Loop *, 8> LoopList;
381
0
    populateWorklist(*L, LoopList);
382
0
    return processLoopList(LoopList);
383
0
  }
384
385
0
  bool run(LoopNest &LN) {
386
0
    SmallVector<Loop *, 8> LoopList(LN.getLoops().begin(), LN.getLoops().end());
387
0
    for (unsigned I = 1; I < LoopList.size(); ++I)
388
0
      if (LoopList[I]->getParentLoop() != LoopList[I - 1])
389
0
        return false;
390
0
    return processLoopList(LoopList);
391
0
  }
392
393
0
  bool isComputableLoopNest(ArrayRef<Loop *> LoopList) {
394
0
    for (Loop *L : LoopList) {
395
0
      const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
396
0
      if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
397
0
        LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
398
0
        return false;
399
0
      }
400
0
      if (L->getNumBackEdges() != 1) {
401
0
        LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
402
0
        return false;
403
0
      }
404
0
      if (!L->getExitingBlock()) {
405
0
        LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
406
0
        return false;
407
0
      }
408
0
    }
409
0
    return true;
410
0
  }
411
412
0
  unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
413
    // TODO: Add a better heuristic to select the loop to be interchanged based
414
    // on the dependence matrix. Currently we select the innermost loop.
415
0
    return LoopList.size() - 1;
416
0
  }
417
418
0
  bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
419
0
    bool Changed = false;
420
0
    unsigned LoopNestDepth = LoopList.size();
421
0
    if (LoopNestDepth < 2) {
422
0
      LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
423
0
      return false;
424
0
    }
425
0
    if (LoopNestDepth > MaxLoopNestDepth) {
426
0
      LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
427
0
                        << MaxLoopNestDepth << "\n");
428
0
      return false;
429
0
    }
430
0
    if (!isComputableLoopNest(LoopList)) {
431
0
      LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
432
0
      return false;
433
0
    }
434
435
0
    LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
436
0
                      << "\n");
437
438
0
    CharMatrix DependencyMatrix;
439
0
    Loop *OuterMostLoop = *(LoopList.begin());
440
0
    if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
441
0
                                  OuterMostLoop, DI, SE)) {
442
0
      LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
443
0
      return false;
444
0
    }
445
#ifdef DUMP_DEP_MATRICIES
446
    LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
447
    printDepMatrix(DependencyMatrix);
448
#endif
449
450
    // Get the Outermost loop exit.
451
0
    BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
452
0
    if (!LoopNestExit) {
453
0
      LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
454
0
      return false;
455
0
    }
456
457
0
    unsigned SelecLoopId = selectLoopForInterchange(LoopList);
458
    // Obtain the loop vector returned from loop cache analysis beforehand,
459
    // and put each <Loop, index> pair into a map for constant time query
460
    // later. Indices in loop vector reprsent the optimal order of the
461
    // corresponding loop, e.g., given a loopnest with depth N, index 0
462
    // indicates the loop should be placed as the outermost loop and index N
463
    // indicates the loop should be placed as the innermost loop.
464
    //
465
    // For the old pass manager CacheCost would be null.
466
0
    DenseMap<const Loop *, unsigned> CostMap;
467
0
    if (CC != nullptr) {
468
0
      const auto &LoopCosts = CC->getLoopCosts();
469
0
      for (unsigned i = 0; i < LoopCosts.size(); i++) {
470
0
        CostMap[LoopCosts[i].first] = i;
471
0
      }
472
0
    }
473
    // We try to achieve the globally optimal memory access for the loopnest,
474
    // and do interchange based on a bubble-sort fasion. We start from
475
    // the innermost loop, move it outwards to the best possible position
476
    // and repeat this process.
477
0
    for (unsigned j = SelecLoopId; j > 0; j--) {
478
0
      bool ChangedPerIter = false;
479
0
      for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
480
0
        bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
481
0
                                        DependencyMatrix, CostMap);
482
0
        if (!Interchanged)
483
0
          continue;
484
        // Loops interchanged, update LoopList accordingly.
485
0
        std::swap(LoopList[i - 1], LoopList[i]);
486
        // Update the DependencyMatrix
487
0
        interChangeDependencies(DependencyMatrix, i, i - 1);
488
#ifdef DUMP_DEP_MATRICIES
489
        LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
490
        printDepMatrix(DependencyMatrix);
491
#endif
492
0
        ChangedPerIter |= Interchanged;
493
0
        Changed |= Interchanged;
494
0
      }
495
      // Early abort if there was no interchange during an entire round of
496
      // moving loops outwards.
497
0
      if (!ChangedPerIter)
498
0
        break;
499
0
    }
500
0
    return Changed;
501
0
  }
502
503
  bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
504
                   unsigned OuterLoopId,
505
                   std::vector<std::vector<char>> &DependencyMatrix,
506
0
                   const DenseMap<const Loop *, unsigned> &CostMap) {
507
0
    LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
508
0
                      << " and OuterLoopId = " << OuterLoopId << "\n");
509
0
    LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
510
0
    if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
511
0
      LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
512
0
      return false;
513
0
    }
514
0
    LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
515
0
    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
516
0
    if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
517
0
                          DependencyMatrix, CostMap, CC)) {
518
0
      LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
519
0
      return false;
520
0
    }
521
522
0
    ORE->emit([&]() {
523
0
      return OptimizationRemark(DEBUG_TYPE, "Interchanged",
524
0
                                InnerLoop->getStartLoc(),
525
0
                                InnerLoop->getHeader())
526
0
             << "Loop interchanged with enclosing loop.";
527
0
    });
528
529
0
    LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
530
0
    LIT.transform();
531
0
    LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
532
0
    LoopsInterchanged++;
533
534
0
    llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
535
0
    return true;
536
0
  }
537
};
538
539
} // end anonymous namespace
540
541
0
bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
542
0
  return any_of(*BB, [](const Instruction &I) {
543
0
    return I.mayHaveSideEffects() || I.mayReadFromMemory();
544
0
  });
545
0
}
546
547
0
bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
548
0
  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
549
0
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
550
0
  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
551
552
0
  LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
553
554
  // A perfectly nested loop will not have any branch in between the outer and
555
  // inner block i.e. outer header will branch to either inner preheader and
556
  // outerloop latch.
557
0
  BranchInst *OuterLoopHeaderBI =
558
0
      dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
559
0
  if (!OuterLoopHeaderBI)
560
0
    return false;
561
562
0
  for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
563
0
    if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
564
0
        Succ != OuterLoopLatch)
565
0
      return false;
566
567
0
  LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
568
  // We do not have any basic block in between now make sure the outer header
569
  // and outer loop latch doesn't contain any unsafe instructions.
570
0
  if (containsUnsafeInstructions(OuterLoopHeader) ||
571
0
      containsUnsafeInstructions(OuterLoopLatch))
572
0
    return false;
573
574
  // Also make sure the inner loop preheader does not contain any unsafe
575
  // instructions. Note that all instructions in the preheader will be moved to
576
  // the outer loop header when interchanging.
577
0
  if (InnerLoopPreHeader != OuterLoopHeader &&
578
0
      containsUnsafeInstructions(InnerLoopPreHeader))
579
0
    return false;
580
581
0
  BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
582
  // Ensure the inner loop exit block flows to the outer loop latch possibly
583
  // through empty blocks.
584
0
  const BasicBlock &SuccInner =
585
0
      LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
586
0
  if (&SuccInner != OuterLoopLatch) {
587
0
    LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
588
0
                      << " does not lead to the outer loop latch.\n";);
589
0
    return false;
590
0
  }
591
  // The inner loop exit block does flow to the outer loop latch and not some
592
  // other BBs, now make sure it contains safe instructions, since it will be
593
  // moved into the (new) inner loop after interchange.
594
0
  if (containsUnsafeInstructions(InnerLoopExit))
595
0
    return false;
596
597
0
  LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
598
  // We have a perfect loop nest.
599
0
  return true;
600
0
}
601
602
0
bool LoopInterchangeLegality::isLoopStructureUnderstood() {
603
0
  BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
604
0
  for (PHINode *InnerInduction : InnerLoopInductions) {
605
0
    unsigned Num = InnerInduction->getNumOperands();
606
0
    for (unsigned i = 0; i < Num; ++i) {
607
0
      Value *Val = InnerInduction->getOperand(i);
608
0
      if (isa<Constant>(Val))
609
0
        continue;
610
0
      Instruction *I = dyn_cast<Instruction>(Val);
611
0
      if (!I)
612
0
        return false;
613
      // TODO: Handle triangular loops.
614
      // e.g. for(int i=0;i<N;i++)
615
      //        for(int j=i;j<N;j++)
616
0
      unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
617
0
      if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
618
0
              InnerLoopPreheader &&
619
0
          !OuterLoop->isLoopInvariant(I)) {
620
0
        return false;
621
0
      }
622
0
    }
623
0
  }
624
625
  // TODO: Handle triangular loops of another form.
626
  // e.g. for(int i=0;i<N;i++)
627
  //        for(int j=0;j<i;j++)
628
  // or,
629
  //      for(int i=0;i<N;i++)
630
  //        for(int j=0;j*i<N;j++)
631
0
  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
632
0
  BranchInst *InnerLoopLatchBI =
633
0
      dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
634
0
  if (!InnerLoopLatchBI->isConditional())
635
0
    return false;
636
0
  if (CmpInst *InnerLoopCmp =
637
0
          dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
638
0
    Value *Op0 = InnerLoopCmp->getOperand(0);
639
0
    Value *Op1 = InnerLoopCmp->getOperand(1);
640
641
    // LHS and RHS of the inner loop exit condition, e.g.,
642
    // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
643
0
    Value *Left = nullptr;
644
0
    Value *Right = nullptr;
645
646
    // Check if V only involves inner loop induction variable.
647
    // Return true if V is InnerInduction, or a cast from
648
    // InnerInduction, or a binary operator that involves
649
    // InnerInduction and a constant.
650
0
    std::function<bool(Value *)> IsPathToInnerIndVar;
651
0
    IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
652
0
      if (llvm::is_contained(InnerLoopInductions, V))
653
0
        return true;
654
0
      if (isa<Constant>(V))
655
0
        return true;
656
0
      const Instruction *I = dyn_cast<Instruction>(V);
657
0
      if (!I)
658
0
        return false;
659
0
      if (isa<CastInst>(I))
660
0
        return IsPathToInnerIndVar(I->getOperand(0));
661
0
      if (isa<BinaryOperator>(I))
662
0
        return IsPathToInnerIndVar(I->getOperand(0)) &&
663
0
               IsPathToInnerIndVar(I->getOperand(1));
664
0
      return false;
665
0
    };
666
667
    // In case of multiple inner loop indvars, it is okay if LHS and RHS
668
    // are both inner indvar related variables.
669
0
    if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
670
0
      return true;
671
672
    // Otherwise we check if the cmp instruction compares an inner indvar
673
    // related variable (Left) with a outer loop invariant (Right).
674
0
    if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
675
0
      Left = Op0;
676
0
      Right = Op1;
677
0
    } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
678
0
      Left = Op1;
679
0
      Right = Op0;
680
0
    }
681
682
0
    if (Left == nullptr)
683
0
      return false;
684
685
0
    const SCEV *S = SE->getSCEV(Right);
686
0
    if (!SE->isLoopInvariant(S, OuterLoop))
687
0
      return false;
688
0
  }
689
690
0
  return true;
691
0
}
692
693
// If SV is a LCSSA PHI node with a single incoming value, return the incoming
694
// value.
695
0
static Value *followLCSSA(Value *SV) {
696
0
  PHINode *PHI = dyn_cast<PHINode>(SV);
697
0
  if (!PHI)
698
0
    return SV;
699
700
0
  if (PHI->getNumIncomingValues() != 1)
701
0
    return SV;
702
0
  return followLCSSA(PHI->getIncomingValue(0));
703
0
}
704
705
// Check V's users to see if it is involved in a reduction in L.
706
0
static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
707
  // Reduction variables cannot be constants.
708
0
  if (isa<Constant>(V))
709
0
    return nullptr;
710
711
0
  for (Value *User : V->users()) {
712
0
    if (PHINode *PHI = dyn_cast<PHINode>(User)) {
713
0
      if (PHI->getNumIncomingValues() == 1)
714
0
        continue;
715
0
      RecurrenceDescriptor RD;
716
0
      if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
717
        // Detect floating point reduction only when it can be reordered.
718
0
        if (RD.getExactFPMathInst() != nullptr)
719
0
          return nullptr;
720
0
        return PHI;
721
0
      }
722
0
      return nullptr;
723
0
    }
724
0
  }
725
726
0
  return nullptr;
727
0
}
728
729
bool LoopInterchangeLegality::findInductionAndReductions(
730
0
    Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
731
0
  if (!L->getLoopLatch() || !L->getLoopPredecessor())
732
0
    return false;
733
0
  for (PHINode &PHI : L->getHeader()->phis()) {
734
0
    InductionDescriptor ID;
735
0
    if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
736
0
      Inductions.push_back(&PHI);
737
0
    else {
738
      // PHIs in inner loops need to be part of a reduction in the outer loop,
739
      // discovered when checking the PHIs of the outer loop earlier.
740
0
      if (!InnerLoop) {
741
0
        if (!OuterInnerReductions.count(&PHI)) {
742
0
          LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
743
0
                               "across the outer loop.\n");
744
0
          return false;
745
0
        }
746
0
      } else {
747
0
        assert(PHI.getNumIncomingValues() == 2 &&
748
0
               "Phis in loop header should have exactly 2 incoming values");
749
        // Check if we have a PHI node in the outer loop that has a reduction
750
        // result from the inner loop as an incoming value.
751
0
        Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
752
0
        PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
753
0
        if (!InnerRedPhi ||
754
0
            !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
755
0
          LLVM_DEBUG(
756
0
              dbgs()
757
0
              << "Failed to recognize PHI as an induction or reduction.\n");
758
0
          return false;
759
0
        }
760
0
        OuterInnerReductions.insert(&PHI);
761
0
        OuterInnerReductions.insert(InnerRedPhi);
762
0
      }
763
0
    }
764
0
  }
765
0
  return true;
766
0
}
767
768
// This function indicates the current limitations in the transform as a result
769
// of which we do not proceed.
770
0
bool LoopInterchangeLegality::currentLimitations() {
771
0
  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
772
773
  // transform currently expects the loop latches to also be the exiting
774
  // blocks.
775
0
  if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
776
0
      OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
777
0
      !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
778
0
      !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
779
0
    LLVM_DEBUG(
780
0
        dbgs() << "Loops where the latch is not the exiting block are not"
781
0
               << " supported currently.\n");
782
0
    ORE->emit([&]() {
783
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
784
0
                                      OuterLoop->getStartLoc(),
785
0
                                      OuterLoop->getHeader())
786
0
             << "Loops where the latch is not the exiting block cannot be"
787
0
                " interchange currently.";
788
0
    });
789
0
    return true;
790
0
  }
791
792
0
  SmallVector<PHINode *, 8> Inductions;
793
0
  if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
794
0
    LLVM_DEBUG(
795
0
        dbgs() << "Only outer loops with induction or reduction PHI nodes "
796
0
               << "are supported currently.\n");
797
0
    ORE->emit([&]() {
798
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
799
0
                                      OuterLoop->getStartLoc(),
800
0
                                      OuterLoop->getHeader())
801
0
             << "Only outer loops with induction or reduction PHI nodes can be"
802
0
                " interchanged currently.";
803
0
    });
804
0
    return true;
805
0
  }
806
807
0
  Inductions.clear();
808
  // For multi-level loop nests, make sure that all phi nodes for inner loops
809
  // at all levels can be recognized as a induction or reduction phi. Bail out
810
  // if a phi node at a certain nesting level cannot be properly recognized.
811
0
  Loop *CurLevelLoop = OuterLoop;
812
0
  while (!CurLevelLoop->getSubLoops().empty()) {
813
    // We already made sure that the loop nest is tightly nested.
814
0
    CurLevelLoop = CurLevelLoop->getSubLoops().front();
815
0
    if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
816
0
      LLVM_DEBUG(
817
0
          dbgs() << "Only inner loops with induction or reduction PHI nodes "
818
0
                << "are supported currently.\n");
819
0
      ORE->emit([&]() {
820
0
        return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
821
0
                                        CurLevelLoop->getStartLoc(),
822
0
                                        CurLevelLoop->getHeader())
823
0
              << "Only inner loops with induction or reduction PHI nodes can be"
824
0
                  " interchange currently.";
825
0
      });
826
0
      return true;
827
0
    }
828
0
  }
829
830
  // TODO: Triangular loops are not handled for now.
831
0
  if (!isLoopStructureUnderstood()) {
832
0
    LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
833
0
    ORE->emit([&]() {
834
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
835
0
                                      InnerLoop->getStartLoc(),
836
0
                                      InnerLoop->getHeader())
837
0
             << "Inner loop structure not understood currently.";
838
0
    });
839
0
    return true;
840
0
  }
841
842
0
  return false;
843
0
}
844
845
bool LoopInterchangeLegality::findInductions(
846
0
    Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
847
0
  for (PHINode &PHI : L->getHeader()->phis()) {
848
0
    InductionDescriptor ID;
849
0
    if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
850
0
      Inductions.push_back(&PHI);
851
0
  }
852
0
  return !Inductions.empty();
853
0
}
854
855
// We currently only support LCSSA PHI nodes in the inner loop exit, if their
856
// users are either reduction PHIs or PHIs outside the outer loop (which means
857
// the we are only interested in the final value after the loop).
858
static bool
859
areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
860
0
                              SmallPtrSetImpl<PHINode *> &Reductions) {
861
0
  BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
862
0
  for (PHINode &PHI : InnerExit->phis()) {
863
    // Reduction lcssa phi will have only 1 incoming block that from loop latch.
864
0
    if (PHI.getNumIncomingValues() > 1)
865
0
      return false;
866
0
    if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
867
0
          PHINode *PN = dyn_cast<PHINode>(U);
868
0
          return !PN ||
869
0
                 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
870
0
        })) {
871
0
      return false;
872
0
    }
873
0
  }
874
0
  return true;
875
0
}
876
877
// We currently support LCSSA PHI nodes in the outer loop exit, if their
878
// incoming values do not come from the outer loop latch or if the
879
// outer loop latch has a single predecessor. In that case, the value will
880
// be available if both the inner and outer loop conditions are true, which
881
// will still be true after interchanging. If we have multiple predecessor,
882
// that may not be the case, e.g. because the outer loop latch may be executed
883
// if the inner loop is not executed.
884
0
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
885
0
  BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
886
0
  for (PHINode &PHI : LoopNestExit->phis()) {
887
0
    for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
888
0
      Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
889
0
      if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
890
0
        continue;
891
892
      // The incoming value is defined in the outer loop latch. Currently we
893
      // only support that in case the outer loop latch has a single predecessor.
894
      // This guarantees that the outer loop latch is executed if and only if
895
      // the inner loop is executed (because tightlyNested() guarantees that the
896
      // outer loop header only branches to the inner loop or the outer loop
897
      // latch).
898
      // FIXME: We could weaken this logic and allow multiple predecessors,
899
      //        if the values are produced outside the loop latch. We would need
900
      //        additional logic to update the PHI nodes in the exit block as
901
      //        well.
902
0
      if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
903
0
        return false;
904
0
    }
905
0
  }
906
0
  return true;
907
0
}
908
909
// In case of multi-level nested loops, it may occur that lcssa phis exist in
910
// the latch of InnerLoop, i.e., when defs of the incoming values are further
911
// inside the loopnest. Sometimes those incoming values are not available
912
// after interchange, since the original inner latch will become the new outer
913
// latch which may have predecessor paths that do not include those incoming
914
// values.
915
// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
916
// multi-level loop nests.
917
0
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
918
0
  if (InnerLoop->getSubLoops().empty())
919
0
    return true;
920
  // If the original outer latch has only one predecessor, then values defined
921
  // further inside the looploop, e.g., in the innermost loop, will be available
922
  // at the new outer latch after interchange.
923
0
  if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
924
0
    return true;
925
926
  // The outer latch has more than one predecessors, i.e., the inner
927
  // exit and the inner header.
928
  // PHI nodes in the inner latch are lcssa phis where the incoming values
929
  // are defined further inside the loopnest. Check if those phis are used
930
  // in the original inner latch. If that is the case then bail out since
931
  // those incoming values may not be available at the new outer latch.
932
0
  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
933
0
  for (PHINode &PHI : InnerLoopLatch->phis()) {
934
0
    for (auto *U : PHI.users()) {
935
0
      Instruction *UI = cast<Instruction>(U);
936
0
      if (InnerLoopLatch == UI->getParent())
937
0
        return false;
938
0
    }
939
0
  }
940
0
  return true;
941
0
}
942
943
bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
944
                                                  unsigned OuterLoopId,
945
0
                                                  CharMatrix &DepMatrix) {
946
0
  if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
947
0
    LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
948
0
                      << " and OuterLoopId = " << OuterLoopId
949
0
                      << " due to dependence\n");
950
0
    ORE->emit([&]() {
951
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
952
0
                                      InnerLoop->getStartLoc(),
953
0
                                      InnerLoop->getHeader())
954
0
             << "Cannot interchange loops due to dependences.";
955
0
    });
956
0
    return false;
957
0
  }
958
  // Check if outer and inner loop contain legal instructions only.
959
0
  for (auto *BB : OuterLoop->blocks())
960
0
    for (Instruction &I : BB->instructionsWithoutDebug())
961
0
      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
962
        // readnone functions do not prevent interchanging.
963
0
        if (CI->onlyWritesMemory())
964
0
          continue;
965
0
        LLVM_DEBUG(
966
0
            dbgs() << "Loops with call instructions cannot be interchanged "
967
0
                   << "safely.");
968
0
        ORE->emit([&]() {
969
0
          return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
970
0
                                          CI->getDebugLoc(),
971
0
                                          CI->getParent())
972
0
                 << "Cannot interchange loops due to call instruction.";
973
0
        });
974
975
0
        return false;
976
0
      }
977
978
0
  if (!findInductions(InnerLoop, InnerLoopInductions)) {
979
0
    LLVM_DEBUG(dbgs() << "Cound not find inner loop induction variables.\n");
980
0
    return false;
981
0
  }
982
983
0
  if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
984
0
    LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
985
0
    ORE->emit([&]() {
986
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
987
0
                                      InnerLoop->getStartLoc(),
988
0
                                      InnerLoop->getHeader())
989
0
             << "Cannot interchange loops because unsupported PHI nodes found "
990
0
                "in inner loop latch.";
991
0
    });
992
0
    return false;
993
0
  }
994
995
  // TODO: The loops could not be interchanged due to current limitations in the
996
  // transform module.
997
0
  if (currentLimitations()) {
998
0
    LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
999
0
    return false;
1000
0
  }
1001
1002
  // Check if the loops are tightly nested.
1003
0
  if (!tightlyNested(OuterLoop, InnerLoop)) {
1004
0
    LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1005
0
    ORE->emit([&]() {
1006
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1007
0
                                      InnerLoop->getStartLoc(),
1008
0
                                      InnerLoop->getHeader())
1009
0
             << "Cannot interchange loops because they are not tightly "
1010
0
                "nested.";
1011
0
    });
1012
0
    return false;
1013
0
  }
1014
1015
0
  if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
1016
0
                                     OuterInnerReductions)) {
1017
0
    LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1018
0
    ORE->emit([&]() {
1019
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1020
0
                                      InnerLoop->getStartLoc(),
1021
0
                                      InnerLoop->getHeader())
1022
0
             << "Found unsupported PHI node in loop exit.";
1023
0
    });
1024
0
    return false;
1025
0
  }
1026
1027
0
  if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1028
0
    LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1029
0
    ORE->emit([&]() {
1030
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1031
0
                                      OuterLoop->getStartLoc(),
1032
0
                                      OuterLoop->getHeader())
1033
0
             << "Found unsupported PHI node in loop exit.";
1034
0
    });
1035
0
    return false;
1036
0
  }
1037
1038
0
  return true;
1039
0
}
1040
1041
0
int LoopInterchangeProfitability::getInstrOrderCost() {
1042
0
  unsigned GoodOrder, BadOrder;
1043
0
  BadOrder = GoodOrder = 0;
1044
0
  for (BasicBlock *BB : InnerLoop->blocks()) {
1045
0
    for (Instruction &Ins : *BB) {
1046
0
      if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1047
0
        unsigned NumOp = GEP->getNumOperands();
1048
0
        bool FoundInnerInduction = false;
1049
0
        bool FoundOuterInduction = false;
1050
0
        for (unsigned i = 0; i < NumOp; ++i) {
1051
          // Skip operands that are not SCEV-able.
1052
0
          if (!SE->isSCEVable(GEP->getOperand(i)->getType()))
1053
0
            continue;
1054
1055
0
          const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1056
0
          const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1057
0
          if (!AR)
1058
0
            continue;
1059
1060
          // If we find the inner induction after an outer induction e.g.
1061
          // for(int i=0;i<N;i++)
1062
          //   for(int j=0;j<N;j++)
1063
          //     A[i][j] = A[i-1][j-1]+k;
1064
          // then it is a good order.
1065
0
          if (AR->getLoop() == InnerLoop) {
1066
            // We found an InnerLoop induction after OuterLoop induction. It is
1067
            // a good order.
1068
0
            FoundInnerInduction = true;
1069
0
            if (FoundOuterInduction) {
1070
0
              GoodOrder++;
1071
0
              break;
1072
0
            }
1073
0
          }
1074
          // If we find the outer induction after an inner induction e.g.
1075
          // for(int i=0;i<N;i++)
1076
          //   for(int j=0;j<N;j++)
1077
          //     A[j][i] = A[j-1][i-1]+k;
1078
          // then it is a bad order.
1079
0
          if (AR->getLoop() == OuterLoop) {
1080
            // We found an OuterLoop induction after InnerLoop induction. It is
1081
            // a bad order.
1082
0
            FoundOuterInduction = true;
1083
0
            if (FoundInnerInduction) {
1084
0
              BadOrder++;
1085
0
              break;
1086
0
            }
1087
0
          }
1088
0
        }
1089
0
      }
1090
0
    }
1091
0
  }
1092
0
  return GoodOrder - BadOrder;
1093
0
}
1094
1095
std::optional<bool>
1096
LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1097
    const DenseMap<const Loop *, unsigned> &CostMap,
1098
0
    std::unique_ptr<CacheCost> &CC) {
1099
  // This is the new cost model returned from loop cache analysis.
1100
  // A smaller index means the loop should be placed an outer loop, and vice
1101
  // versa.
1102
0
  if (CostMap.contains(InnerLoop) && CostMap.contains(OuterLoop)) {
1103
0
    unsigned InnerIndex = 0, OuterIndex = 0;
1104
0
    InnerIndex = CostMap.find(InnerLoop)->second;
1105
0
    OuterIndex = CostMap.find(OuterLoop)->second;
1106
0
    LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1107
0
                      << ", OuterIndex = " << OuterIndex << "\n");
1108
0
    if (InnerIndex < OuterIndex)
1109
0
      return std::optional<bool>(true);
1110
0
    assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1111
0
                                       "numbers to each loop");
1112
0
    if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1113
0
      return std::nullopt;
1114
0
    return std::optional<bool>(false);
1115
0
  }
1116
0
  return std::nullopt;
1117
0
}
1118
1119
std::optional<bool>
1120
0
LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1121
  // Legacy cost model: this is rough cost estimation algorithm. It counts the
1122
  // good and bad order of induction variables in the instruction and allows
1123
  // reordering if number of bad orders is more than good.
1124
0
  int Cost = getInstrOrderCost();
1125
0
  LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1126
0
  if (Cost < 0 && Cost < LoopInterchangeCostThreshold)
1127
0
    return std::optional<bool>(true);
1128
1129
0
  return std::nullopt;
1130
0
}
1131
1132
std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1133
0
    unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1134
0
  for (auto &Row : DepMatrix) {
1135
    // If the inner loop is loop independent or doesn't carry any dependency
1136
    // it is not profitable to move this to outer position, since we are
1137
    // likely able to do inner loop vectorization already.
1138
0
    if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')
1139
0
      return std::optional<bool>(false);
1140
1141
    // If the outer loop is not loop independent it is not profitable to move
1142
    // this to inner position, since doing so would not enable inner loop
1143
    // parallelism.
1144
0
    if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=')
1145
0
      return std::optional<bool>(false);
1146
0
  }
1147
  // If inner loop has dependence and outer loop is loop independent then it
1148
  // is/ profitable to interchange to enable inner loop parallelism.
1149
  // If there are no dependences, interchanging will not improve anything.
1150
0
  return std::optional<bool>(!DepMatrix.empty());
1151
0
}
1152
1153
bool LoopInterchangeProfitability::isProfitable(
1154
    const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1155
    unsigned OuterLoopId, CharMatrix &DepMatrix,
1156
    const DenseMap<const Loop *, unsigned> &CostMap,
1157
0
    std::unique_ptr<CacheCost> &CC) {
1158
  // isProfitable() is structured to avoid endless loop interchange.
1159
  // If loop cache analysis could decide the profitability then,
1160
  // profitability check will stop and return the analysis result.
1161
  // If cache analysis failed to analyze the loopnest (e.g.,
1162
  // due to delinearization issues) then only check whether it is
1163
  // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
1164
  // analysis the profitability then only, isProfitableForVectorization
1165
  // will decide.
1166
0
  std::optional<bool> shouldInterchange =
1167
0
      isProfitablePerLoopCacheAnalysis(CostMap, CC);
1168
0
  if (!shouldInterchange.has_value()) {
1169
0
    shouldInterchange = isProfitablePerInstrOrderCost();
1170
0
    if (!shouldInterchange.has_value())
1171
0
      shouldInterchange =
1172
0
          isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1173
0
  }
1174
0
  if (!shouldInterchange.has_value()) {
1175
0
    ORE->emit([&]() {
1176
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1177
0
                                      InnerLoop->getStartLoc(),
1178
0
                                      InnerLoop->getHeader())
1179
0
             << "Insufficient information to calculate the cost of loop for "
1180
0
                "interchange.";
1181
0
    });
1182
0
    return false;
1183
0
  } else if (!shouldInterchange.value()) {
1184
0
    ORE->emit([&]() {
1185
0
      return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1186
0
                                      InnerLoop->getStartLoc(),
1187
0
                                      InnerLoop->getHeader())
1188
0
             << "Interchanging loops is not considered to improve cache "
1189
0
                "locality nor vectorization.";
1190
0
    });
1191
0
    return false;
1192
0
  }
1193
0
  return true;
1194
0
}
1195
1196
void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1197
0
                                               Loop *InnerLoop) {
1198
0
  for (Loop *L : *OuterLoop)
1199
0
    if (L == InnerLoop) {
1200
0
      OuterLoop->removeChildLoop(L);
1201
0
      return;
1202
0
    }
1203
0
  llvm_unreachable("Couldn't find loop");
1204
0
}
1205
1206
/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1207
/// new inner and outer loop after interchanging: NewInner is the original
1208
/// outer loop and NewOuter is the original inner loop.
1209
///
1210
/// Before interchanging, we have the following structure
1211
/// Outer preheader
1212
//  Outer header
1213
//    Inner preheader
1214
//    Inner header
1215
//      Inner body
1216
//      Inner latch
1217
//   outer bbs
1218
//   Outer latch
1219
//
1220
// After interchanging:
1221
// Inner preheader
1222
// Inner header
1223
//   Outer preheader
1224
//   Outer header
1225
//     Inner body
1226
//     outer bbs
1227
//     Outer latch
1228
//   Inner latch
1229
void LoopInterchangeTransform::restructureLoops(
1230
    Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1231
0
    BasicBlock *OrigOuterPreHeader) {
1232
0
  Loop *OuterLoopParent = OuterLoop->getParentLoop();
1233
  // The original inner loop preheader moves from the new inner loop to
1234
  // the parent loop, if there is one.
1235
0
  NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1236
0
  LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1237
1238
  // Switch the loop levels.
1239
0
  if (OuterLoopParent) {
1240
    // Remove the loop from its parent loop.
1241
0
    removeChildLoop(OuterLoopParent, NewInner);
1242
0
    removeChildLoop(NewInner, NewOuter);
1243
0
    OuterLoopParent->addChildLoop(NewOuter);
1244
0
  } else {
1245
0
    removeChildLoop(NewInner, NewOuter);
1246
0
    LI->changeTopLevelLoop(NewInner, NewOuter);
1247
0
  }
1248
0
  while (!NewOuter->isInnermost())
1249
0
    NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1250
0
  NewOuter->addChildLoop(NewInner);
1251
1252
  // BBs from the original inner loop.
1253
0
  SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1254
1255
  // Add BBs from the original outer loop to the original inner loop (excluding
1256
  // BBs already in inner loop)
1257
0
  for (BasicBlock *BB : NewInner->blocks())
1258
0
    if (LI->getLoopFor(BB) == NewInner)
1259
0
      NewOuter->addBlockEntry(BB);
1260
1261
  // Now remove inner loop header and latch from the new inner loop and move
1262
  // other BBs (the loop body) to the new inner loop.
1263
0
  BasicBlock *OuterHeader = NewOuter->getHeader();
1264
0
  BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1265
0
  for (BasicBlock *BB : OrigInnerBBs) {
1266
    // Nothing will change for BBs in child loops.
1267
0
    if (LI->getLoopFor(BB) != NewOuter)
1268
0
      continue;
1269
    // Remove the new outer loop header and latch from the new inner loop.
1270
0
    if (BB == OuterHeader || BB == OuterLatch)
1271
0
      NewInner->removeBlockFromLoop(BB);
1272
0
    else
1273
0
      LI->changeLoopFor(BB, NewInner);
1274
0
  }
1275
1276
  // The preheader of the original outer loop becomes part of the new
1277
  // outer loop.
1278
0
  NewOuter->addBlockEntry(OrigOuterPreHeader);
1279
0
  LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1280
1281
  // Tell SE that we move the loops around.
1282
0
  SE->forgetLoop(NewOuter);
1283
0
}
1284
1285
0
bool LoopInterchangeTransform::transform() {
1286
0
  bool Transformed = false;
1287
1288
0
  if (InnerLoop->getSubLoops().empty()) {
1289
0
    BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1290
0
    LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1291
0
    auto &InductionPHIs = LIL.getInnerLoopInductions();
1292
0
    if (InductionPHIs.empty()) {
1293
0
      LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1294
0
      return false;
1295
0
    }
1296
1297
0
    SmallVector<Instruction *, 8> InnerIndexVarList;
1298
0
    for (PHINode *CurInductionPHI : InductionPHIs) {
1299
0
      if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1300
0
        InnerIndexVarList.push_back(
1301
0
            dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
1302
0
      else
1303
0
        InnerIndexVarList.push_back(
1304
0
            dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
1305
0
    }
1306
1307
    // Create a new latch block for the inner loop. We split at the
1308
    // current latch's terminator and then move the condition and all
1309
    // operands that are not either loop-invariant or the induction PHI into the
1310
    // new latch block.
1311
0
    BasicBlock *NewLatch =
1312
0
        SplitBlock(InnerLoop->getLoopLatch(),
1313
0
                   InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1314
1315
0
    SmallSetVector<Instruction *, 4> WorkList;
1316
0
    unsigned i = 0;
1317
0
    auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1318
0
      for (; i < WorkList.size(); i++) {
1319
        // Duplicate instruction and move it the new latch. Update uses that
1320
        // have been moved.
1321
0
        Instruction *NewI = WorkList[i]->clone();
1322
0
        NewI->insertBefore(NewLatch->getFirstNonPHI());
1323
0
        assert(!NewI->mayHaveSideEffects() &&
1324
0
               "Moving instructions with side-effects may change behavior of "
1325
0
               "the loop nest!");
1326
0
        for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
1327
0
          Instruction *UserI = cast<Instruction>(U.getUser());
1328
0
          if (!InnerLoop->contains(UserI->getParent()) ||
1329
0
              UserI->getParent() == NewLatch ||
1330
0
              llvm::is_contained(InductionPHIs, UserI))
1331
0
            U.set(NewI);
1332
0
        }
1333
        // Add operands of moved instruction to the worklist, except if they are
1334
        // outside the inner loop or are the induction PHI.
1335
0
        for (Value *Op : WorkList[i]->operands()) {
1336
0
          Instruction *OpI = dyn_cast<Instruction>(Op);
1337
0
          if (!OpI ||
1338
0
              this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
1339
0
              llvm::is_contained(InductionPHIs, OpI))
1340
0
            continue;
1341
0
          WorkList.insert(OpI);
1342
0
        }
1343
0
      }
1344
0
    };
1345
1346
    // FIXME: Should we interchange when we have a constant condition?
1347
0
    Instruction *CondI = dyn_cast<Instruction>(
1348
0
        cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
1349
0
            ->getCondition());
1350
0
    if (CondI)
1351
0
      WorkList.insert(CondI);
1352
0
    MoveInstructions();
1353
0
    for (Instruction *InnerIndexVar : InnerIndexVarList)
1354
0
      WorkList.insert(cast<Instruction>(InnerIndexVar));
1355
0
    MoveInstructions();
1356
0
  }
1357
1358
  // Ensure the inner loop phi nodes have a separate basic block.
1359
0
  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1360
0
  if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) {
1361
0
    SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1362
0
    LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1363
0
  }
1364
1365
  // Instructions in the original inner loop preheader may depend on values
1366
  // defined in the outer loop header. Move them there, because the original
1367
  // inner loop preheader will become the entry into the interchanged loop nest.
1368
  // Currently we move all instructions and rely on LICM to move invariant
1369
  // instructions outside the loop nest.
1370
0
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1371
0
  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1372
0
  if (InnerLoopPreHeader != OuterLoopHeader) {
1373
0
    SmallPtrSet<Instruction *, 4> NeedsMoving;
1374
0
    for (Instruction &I :
1375
0
         make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
1376
0
                                         std::prev(InnerLoopPreHeader->end()))))
1377
0
      I.moveBeforePreserving(OuterLoopHeader->getTerminator());
1378
0
  }
1379
1380
0
  Transformed |= adjustLoopLinks();
1381
0
  if (!Transformed) {
1382
0
    LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1383
0
    return false;
1384
0
  }
1385
1386
0
  return true;
1387
0
}
1388
1389
/// \brief Move all instructions except the terminator from FromBB right before
1390
/// InsertBefore
1391
0
static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1392
0
  BasicBlock *ToBB = InsertBefore->getParent();
1393
1394
0
  ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
1395
0
               FromBB->getTerminator()->getIterator());
1396
0
}
1397
1398
/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
1399
0
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
1400
  // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
1401
  // from BB1 afterwards.
1402
0
  auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
1403
0
  SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
1404
0
  for (Instruction *I : TempInstrs)
1405
0
    I->removeFromParent();
1406
1407
  // Move instructions from BB2 to BB1.
1408
0
  moveBBContents(BB2, BB1->getTerminator());
1409
1410
  // Move instructions from TempInstrs to BB2.
1411
0
  for (Instruction *I : TempInstrs)
1412
0
    I->insertBefore(BB2->getTerminator());
1413
0
}
1414
1415
// Update BI to jump to NewBB instead of OldBB. Records updates to the
1416
// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
1417
// \p OldBB  is exactly once in BI's successor list.
1418
static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1419
                            BasicBlock *NewBB,
1420
                            std::vector<DominatorTree::UpdateType> &DTUpdates,
1421
0
                            bool MustUpdateOnce = true) {
1422
0
  assert((!MustUpdateOnce ||
1423
0
          llvm::count_if(successors(BI),
1424
0
                         [OldBB](BasicBlock *BB) {
1425
0
                           return BB == OldBB;
1426
0
                         }) == 1) && "BI must jump to OldBB exactly once.");
1427
0
  bool Changed = false;
1428
0
  for (Use &Op : BI->operands())
1429
0
    if (Op == OldBB) {
1430
0
      Op.set(NewBB);
1431
0
      Changed = true;
1432
0
    }
1433
1434
0
  if (Changed) {
1435
0
    DTUpdates.push_back(
1436
0
        {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1437
0
    DTUpdates.push_back(
1438
0
        {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1439
0
  }
1440
0
  assert(Changed && "Expected a successor to be updated");
1441
0
}
1442
1443
// Move Lcssa PHIs to the right place.
1444
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
1445
                          BasicBlock *InnerLatch, BasicBlock *OuterHeader,
1446
                          BasicBlock *OuterLatch, BasicBlock *OuterExit,
1447
0
                          Loop *InnerLoop, LoopInfo *LI) {
1448
1449
  // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1450
  // defined either in the header or latch. Those blocks will become header and
1451
  // latch of the new outer loop, and the only possible users can PHI nodes
1452
  // in the exit block of the loop nest or the outer loop header (reduction
1453
  // PHIs, in that case, the incoming value must be defined in the inner loop
1454
  // header). We can just substitute the user with the incoming value and remove
1455
  // the PHI.
1456
0
  for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
1457
0
    assert(P.getNumIncomingValues() == 1 &&
1458
0
           "Only loops with a single exit are supported!");
1459
1460
    // Incoming values are guaranteed be instructions currently.
1461
0
    auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
1462
    // In case of multi-level nested loops, follow LCSSA to find the incoming
1463
    // value defined from the innermost loop.
1464
0
    auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
1465
    // Skip phis with incoming values from the inner loop body, excluding the
1466
    // header and latch.
1467
0
    if (IncIInnerMost->getParent() != InnerLatch &&
1468
0
        IncIInnerMost->getParent() != InnerHeader)
1469
0
      continue;
1470
1471
0
    assert(all_of(P.users(),
1472
0
                  [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
1473
0
                    return (cast<PHINode>(U)->getParent() == OuterHeader &&
1474
0
                            IncI->getParent() == InnerHeader) ||
1475
0
                           cast<PHINode>(U)->getParent() == OuterExit;
1476
0
                  }) &&
1477
0
           "Can only replace phis iff the uses are in the loop nest exit or "
1478
0
           "the incoming value is defined in the inner header (it will "
1479
0
           "dominate all loop blocks after interchanging)");
1480
0
    P.replaceAllUsesWith(IncI);
1481
0
    P.eraseFromParent();
1482
0
  }
1483
1484
0
  SmallVector<PHINode *, 8> LcssaInnerExit;
1485
0
  for (PHINode &P : InnerExit->phis())
1486
0
    LcssaInnerExit.push_back(&P);
1487
1488
0
  SmallVector<PHINode *, 8> LcssaInnerLatch;
1489
0
  for (PHINode &P : InnerLatch->phis())
1490
0
    LcssaInnerLatch.push_back(&P);
1491
1492
  // Lcssa PHIs for values used outside the inner loop are in InnerExit.
1493
  // If a PHI node has users outside of InnerExit, it has a use outside the
1494
  // interchanged loop and we have to preserve it. We move these to
1495
  // InnerLatch, which will become the new exit block for the innermost
1496
  // loop after interchanging.
1497
0
  for (PHINode *P : LcssaInnerExit)
1498
0
    P->moveBefore(InnerLatch->getFirstNonPHI());
1499
1500
  // If the inner loop latch contains LCSSA PHIs, those come from a child loop
1501
  // and we have to move them to the new inner latch.
1502
0
  for (PHINode *P : LcssaInnerLatch)
1503
0
    P->moveBefore(InnerExit->getFirstNonPHI());
1504
1505
  // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1506
  // incoming values defined in the outer loop, we have to add a new PHI
1507
  // in the inner loop latch, which became the exit block of the outer loop,
1508
  // after interchanging.
1509
0
  if (OuterExit) {
1510
0
    for (PHINode &P : OuterExit->phis()) {
1511
0
      if (P.getNumIncomingValues() != 1)
1512
0
        continue;
1513
      // Skip Phis with incoming values defined in the inner loop. Those should
1514
      // already have been updated.
1515
0
      auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
1516
0
      if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
1517
0
        continue;
1518
1519
0
      PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
1520
0
      NewPhi->setIncomingValue(0, P.getIncomingValue(0));
1521
0
      NewPhi->setIncomingBlock(0, OuterLatch);
1522
      // We might have incoming edges from other BBs, i.e., the original outer
1523
      // header.
1524
0
      for (auto *Pred : predecessors(InnerLatch)) {
1525
0
        if (Pred == OuterLatch)
1526
0
          continue;
1527
0
        NewPhi->addIncoming(P.getIncomingValue(0), Pred);
1528
0
      }
1529
0
      NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
1530
0
      P.setIncomingValue(0, NewPhi);
1531
0
    }
1532
0
  }
1533
1534
  // Now adjust the incoming blocks for the LCSSA PHIs.
1535
  // For PHIs moved from Inner's exit block, we need to replace Inner's latch
1536
  // with the new latch.
1537
0
  InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
1538
0
}
1539
1540
0
bool LoopInterchangeTransform::adjustLoopBranches() {
1541
0
  LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1542
0
  std::vector<DominatorTree::UpdateType> DTUpdates;
1543
1544
0
  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1545
0
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1546
1547
0
  assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1548
0
         InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
1549
0
         InnerLoopPreHeader && "Guaranteed by loop-simplify form");
1550
  // Ensure that both preheaders do not contain PHI nodes and have single
1551
  // predecessors. This allows us to move them easily. We use
1552
  // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1553
  // preheaders do not satisfy those conditions.
1554
0
  if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
1555
0
      !OuterLoopPreHeader->getUniquePredecessor())
1556
0
    OuterLoopPreHeader =
1557
0
        InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
1558
0
  if (InnerLoopPreHeader == OuterLoop->getHeader())
1559
0
    InnerLoopPreHeader =
1560
0
        InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
1561
1562
  // Adjust the loop preheader
1563
0
  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1564
0
  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1565
0
  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1566
0
  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1567
0
  BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1568
0
  BasicBlock *InnerLoopLatchPredecessor =
1569
0
      InnerLoopLatch->getUniquePredecessor();
1570
0
  BasicBlock *InnerLoopLatchSuccessor;
1571
0
  BasicBlock *OuterLoopLatchSuccessor;
1572
1573
0
  BranchInst *OuterLoopLatchBI =
1574
0
      dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1575
0
  BranchInst *InnerLoopLatchBI =
1576
0
      dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1577
0
  BranchInst *OuterLoopHeaderBI =
1578
0
      dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1579
0
  BranchInst *InnerLoopHeaderBI =
1580
0
      dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1581
1582
0
  if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1583
0
      !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1584
0
      !InnerLoopHeaderBI)
1585
0
    return false;
1586
1587
0
  BranchInst *InnerLoopLatchPredecessorBI =
1588
0
      dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1589
0
  BranchInst *OuterLoopPredecessorBI =
1590
0
      dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1591
1592
0
  if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1593
0
    return false;
1594
0
  BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1595
0
  if (!InnerLoopHeaderSuccessor)
1596
0
    return false;
1597
1598
  // Adjust Loop Preheader and headers.
1599
  // The branches in the outer loop predecessor and the outer loop header can
1600
  // be unconditional branches or conditional branches with duplicates. Consider
1601
  // this when updating the successors.
1602
0
  updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
1603
0
                  InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
1604
  // The outer loop header might or might not branch to the outer latch.
1605
  // We are guaranteed to branch to the inner loop preheader.
1606
0
  if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {
1607
    // In this case the outerLoopHeader should branch to the InnerLoopLatch.
1608
0
    updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
1609
0
                    DTUpdates,
1610
0
                    /*MustUpdateOnce=*/false);
1611
0
  }
1612
0
  updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
1613
0
                  InnerLoopHeaderSuccessor, DTUpdates,
1614
0
                  /*MustUpdateOnce=*/false);
1615
1616
  // Adjust reduction PHI's now that the incoming block has changed.
1617
0
  InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
1618
0
                                               OuterLoopHeader);
1619
1620
0
  updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
1621
0
                  OuterLoopPreHeader, DTUpdates);
1622
1623
  // -------------Adjust loop latches-----------
1624
0
  if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1625
0
    InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1626
0
  else
1627
0
    InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1628
1629
0
  updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
1630
0
                  InnerLoopLatchSuccessor, DTUpdates);
1631
1632
0
  if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1633
0
    OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1634
0
  else
1635
0
    OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1636
1637
0
  updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
1638
0
                  OuterLoopLatchSuccessor, DTUpdates);
1639
0
  updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
1640
0
                  DTUpdates);
1641
1642
0
  DT->applyUpdates(DTUpdates);
1643
0
  restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
1644
0
                   OuterLoopPreHeader);
1645
1646
0
  moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
1647
0
                OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
1648
0
                InnerLoop, LI);
1649
  // For PHIs in the exit block of the outer loop, outer's latch has been
1650
  // replaced by Inners'.
1651
0
  OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1652
1653
0
  auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1654
  // Now update the reduction PHIs in the inner and outer loop headers.
1655
0
  SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
1656
0
  for (PHINode &PHI : InnerLoopHeader->phis())
1657
0
    if (OuterInnerReductions.contains(&PHI))
1658
0
      InnerLoopPHIs.push_back(&PHI);
1659
1660
0
  for (PHINode &PHI : OuterLoopHeader->phis())
1661
0
    if (OuterInnerReductions.contains(&PHI))
1662
0
      OuterLoopPHIs.push_back(&PHI);
1663
1664
  // Now move the remaining reduction PHIs from outer to inner loop header and
1665
  // vice versa. The PHI nodes must be part of a reduction across the inner and
1666
  // outer loop and all the remains to do is and updating the incoming blocks.
1667
0
  for (PHINode *PHI : OuterLoopPHIs) {
1668
0
    LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
1669
0
    PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
1670
0
    assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1671
0
  }
1672
0
  for (PHINode *PHI : InnerLoopPHIs) {
1673
0
    LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
1674
0
    PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
1675
0
    assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1676
0
  }
1677
1678
  // Update the incoming blocks for moved PHI nodes.
1679
0
  OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
1680
0
  OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
1681
0
  InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
1682
0
  InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1683
1684
  // Values defined in the outer loop header could be used in the inner loop
1685
  // latch. In that case, we need to create LCSSA phis for them, because after
1686
  // interchanging they will be defined in the new inner loop and used in the
1687
  // new outer loop.
1688
0
  SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
1689
0
  for (Instruction &I :
1690
0
       make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
1691
0
    MayNeedLCSSAPhis.push_back(&I);
1692
0
  formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
1693
1694
0
  return true;
1695
0
}
1696
1697
0
bool LoopInterchangeTransform::adjustLoopLinks() {
1698
  // Adjust all branches in the inner and outer loop.
1699
0
  bool Changed = adjustLoopBranches();
1700
0
  if (Changed) {
1701
    // We have interchanged the preheaders so we need to interchange the data in
1702
    // the preheaders as well. This is because the content of the inner
1703
    // preheader was previously executed inside the outer loop.
1704
0
    BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1705
0
    BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1706
0
    swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
1707
0
  }
1708
0
  return Changed;
1709
0
}
1710
1711
PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
1712
                                           LoopAnalysisManager &AM,
1713
                                           LoopStandardAnalysisResults &AR,
1714
0
                                           LPMUpdater &U) {
1715
0
  Function &F = *LN.getParent();
1716
1717
0
  DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
1718
0
  std::unique_ptr<CacheCost> CC =
1719
0
      CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
1720
0
  OptimizationRemarkEmitter ORE(&F);
1721
0
  if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
1722
0
    return PreservedAnalyses::all();
1723
0
  U.markLoopNestChanged(true);
1724
0
  return getLoopPassPreservedAnalyses();
1725
0
}