/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

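// Per-function analyzer: walks a function's instructions, accumulates the
// memory/indirect-access/large-stride instruction costs into the shared
// FuncInfo map, and decides whether the function should be flagged as
// memory bound or wave limited.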
struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V = nullptr;
    const Value *Base = nullptr;
    int64_t Offset = 0;
    MemAccessInfo() = default;
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large-stride memory access.
  /// The purpose is to identify memory access patterns like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory accesses will be
  /// marked as large-stride memory accesses.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isGlobalLoadUsedInBB(const Instruction &) const;
};

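// Return the pointer operand and the accessed type for any instruction that
// touches memory (loads, stores, atomics, and memory intrinsics), or
// {nullptr, nullptr} for everything else.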
static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
    const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (auto SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};

  return {nullptr, nullptr};
}

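// Returns true if the address of this memory access depends on a value that
// is itself loaded from global memory. The worklist below walks backwards
// from the pointer operand through GEPs, unary instructions, binary
// operators, selects, and extractelements, and reports an indirect access as
// soon as it reaches a load from a global/flat pointer.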
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << "    dropped\n");
  }

  LLVM_DEBUG(dbgs() << "  is not IA\n");
  return false;
}

// Returns true if the global load `I` is used in its own basic block.
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
  const auto *Ld = dyn_cast<LoadInst>(&I);
  if (!Ld)
    return false;
  if (!isGlobalAddr(Ld->getPointerOperand()))
    return false;

  for (const User *Usr : Ld->users()) {
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
      if (UsrInst->getParent() == I.getParent())
        return true;
    }
  }

  return false;
}

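// Accumulate per-function cost data into FIM[&F]. Memory instructions are
// weighted by their access size in dwords; calls to already-analyzed
// functions fold the callee's costs into the caller; GEPs whose offset can
// be folded into the addressing mode are free.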
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        // TODO: Check if the global load and its user are close to each other
        // instead (Or do this analysis in GCNSchedStrategy?).
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }

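    // Flag the function when a single block's global loads (counted in
    // dwords, and only when used within the same block) exceed half of the
    // block's instruction count.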
    if (!FI.HasDenseGlobalMemAcc) {
      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
      if (GlobalMemAccPercentage > 50) {
        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
                          << B.getName() << " has " << GlobalMemAccPercentage
                          << "% global memory access\n");
        FI.HasDenseGlobalMemAcc = true;
      }
    }
  }

  return &FI;
}

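// Analyze one function and record the result as IR attributes so that the
// decision can be queried later. A function that already carries both
// attributes is skipped.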
bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                    << '\n'
                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
                    << " TotalInst cost: " << Info->InstCost << '\n');

  bool Changed = false;

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
    Changed = true;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
    Changed = true;
  }

  return Changed;
}

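// A function is memory bound if it has a block with dense global memory
// access, or if memory instructions account for more than MemBoundThresh
// percent of its total instruction cost.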
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  // Reverting optimal scheduling in favour of occupancy with basic block(s)
  // having dense global memory access can potentially hurt performance.
  if (FI.HasDenseGlobalMemAcc)
    return true;

  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}

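// Weighted occupancy heuristic: indirect and large-stride accesses are
// amplified by IAWeight/LSWeight before being compared against the total
// instruction cost. With the default weights of 1000, a single indirect
// access contributes as much as 1000 plain memory instructions, so even a
// few of them can push a kernel over LimitWaveThresh (50%).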
bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
           FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtrAndType(Inst).first;

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

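// Two accesses form a large stride when they share the same base pointer and
// their constant offsets differ by more than LargeStrideThresh bytes
// (64 by default).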
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace

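// CallGraphSCC passes run bottom-up over the call graph, so callees are
// normally analyzed before their callers and their FuncInfo entries are
// available when visit() folds call costs into the caller.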
bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}

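// These query helpers return the cached result for F from the FuncInfo map,
// or false if F was never analyzed.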
bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}