/src/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

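// Per-function analyzer: walks a function's instructions, accumulates the
// memory/indirect-access/large-stride instruction costs into the shared
// FuncInfo map, and decides whether the function should be flagged as
// memory bound or wave limited.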
struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V = nullptr;
    const Value *Base = nullptr;
    int64_t Offset = 0;
    MemAccessInfo() = default;
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large-stride memory access.
  /// The purpose is to identify memory access patterns like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory accesses will be
  /// marked as large-stride memory accesses.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isGlobalLoadUsedInBB(const Instruction &) const;
};

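// Return the pointer operand and the accessed type for any instruction that
// touches memory (loads, stores, atomics, and memory intrinsics), or
// {nullptr, nullptr} for everything else.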
static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
    const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (auto SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};

  return {nullptr, nullptr};
}

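// Returns true if the address of this memory access depends on a value that
// is itself loaded from global memory. The worklist below walks backwards
// from the pointer operand through GEPs, unary instructions, binary
// operators, selects, and extractelements, and reports an indirect access as
// soon as it reaches a load from a global/flat pointer.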
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << "    dropped\n");
  }

  LLVM_DEBUG(dbgs() << "  is not IA\n");
  return false;
}

// Returns true if the global load `I` is used in its own basic block.
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
  const auto *Ld = dyn_cast<LoadInst>(&I);
  if (!Ld)
    return false;
  if (!isGlobalAddr(Ld->getPointerOperand()))
    return false;

  for (const User *Usr : Ld->users()) {
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
      if (UsrInst->getParent() == I.getParent())
        return true;
    }
  }

  return false;
}

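// Accumulate per-function cost data into FIM[&F]. Memory instructions are
// weighted by their access size in dwords; calls to already-analyzed
// functions fold the callee's costs into the caller; GEPs whose offset can
// be folded into the addressing mode are free.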
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        // TODO: Check if the global load and its user are close to each other
        // instead (Or do this analysis in GCNSchedStrategy?).
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }

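    // Flag the function when a single block's global loads (counted in
    // dwords, and only when used within the same block) exceed half of the
    // block's instruction count.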
    if (!FI.HasDenseGlobalMemAcc) {
      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
      if (GlobalMemAccPercentage > 50) {
        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
                          << B.getName() << " has " << GlobalMemAccPercentage
                          << "% global memory access\n");
        FI.HasDenseGlobalMemAcc = true;
      }
    }
  }

  return &FI;
}

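// Analyze one function and record the result as IR attributes so that the
// decision can be queried later. A function that already carries both
// attributes is skipped.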
bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                    << '\n'
                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
                    << " TotalInst cost: " << Info->InstCost << '\n');

  bool Changed = false;

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
    Changed = true;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
    Changed = true;
  }

  return Changed;
}

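// A function is memory bound if it has a block with dense global memory
// access, or if memory instructions account for more than MemBoundThresh
// percent of its total instruction cost.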
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  // Reverting optimal scheduling in favour of occupancy with basic block(s)
  // having dense global memory access can potentially hurt performance.
  if (FI.HasDenseGlobalMemAcc)
    return true;

  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}

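// Weighted occupancy heuristic: indirect and large-stride accesses are
// amplified by IAWeight/LSWeight before being compared against the total
// instruction cost. With the default weights of 1000, a single indirect
// access contributes as much as 1000 plain memory instructions, so even a
// few of them can push a kernel over LimitWaveThresh (50%).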
bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
           FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtrAndType(Inst).first;

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

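// Two accesses form a large stride when they share the same base pointer and
// their constant offsets differ by more than LargeStrideThresh bytes
// (64 by default).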
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace

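// CallGraphSCC passes run bottom-up over the call graph, so callees are
// normally analyzed before their callers and their FuncInfo entries are
// available when visit() folds call costs into the caller.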
bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}

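// These query helpers return the cached result for F from the FuncInfo map,
// or false if F was never analyzed.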
bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}