/src/llvm-project/clang/lib/CodeGen/Targets/AMDGPU.cpp
Line | Count | Source |
1 | | //===- AMDGPU.cpp ---------------------------------------------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | |
9 | | #include "ABIInfoImpl.h" |
10 | | #include "TargetInfo.h" |
11 | | #include "clang/Basic/TargetOptions.h" |
12 | | |
13 | | using namespace clang; |
14 | | using namespace clang::CodeGen; |
15 | | |
16 | | //===----------------------------------------------------------------------===// |
17 | | // AMDGPU ABI Implementation |
18 | | //===----------------------------------------------------------------------===// |
19 | | |
20 | | namespace { |
21 | | |
22 | | class AMDGPUABIInfo final : public DefaultABIInfo { |
23 | | private: |
24 | | static const unsigned MaxNumRegsForArgsRet = 16; |
25 | | |
26 | | unsigned numRegsForType(QualType Ty) const; |
27 | | |
28 | | bool isHomogeneousAggregateBaseType(QualType Ty) const override; |
29 | | bool isHomogeneousAggregateSmallEnough(const Type *Base, |
30 | | uint64_t Members) const override; |
31 | | |
32 | | // Coerce HIP scalar pointer arguments from generic pointers to global ones. |
33 | | llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS, |
34 | 0 | unsigned ToAS) const { |
35 | | // Single value types. |
36 | 0 | auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty); |
37 | 0 | if (PtrTy && PtrTy->getAddressSpace() == FromAS) |
38 | 0 | return llvm::PointerType::get(Ty->getContext(), ToAS); |
39 | 0 | return Ty; |
40 | 0 | } |
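// Illustrative effect of the coercion above (a sketch, assuming the usual
// AMDGPU numbering where the generic/flat address space is 0 and the global
// address space is 1): a HIP kernel parameter declared as `int *p` arrives
// here as a generic `ptr` and is rewritten to `ptr addrspace(1)`, while
// non-pointer types and pointers already in another address space are
// returned unchanged.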
41 | | |
42 | | public: |
43 | | explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) : |
44 | 0 | DefaultABIInfo(CGT) {} |
45 | | |
46 | | ABIArgInfo classifyReturnType(QualType RetTy) const; |
47 | | ABIArgInfo classifyKernelArgumentType(QualType Ty) const; |
48 | | ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const; |
49 | | |
50 | | void computeInfo(CGFunctionInfo &FI) const override; |
51 | | Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, |
52 | | QualType Ty) const override; |
53 | | }; |
54 | | |
55 | 0 | bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const { |
56 | 0 | return true; |
57 | 0 | } |
58 | | |
59 | | bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough( |
60 | 0 | const Type *Base, uint64_t Members) const { |
61 | 0 | uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32; |
62 | | |
63 | | // Homogeneous Aggregates may occupy at most 16 registers. |
64 | 0 | return Members * NumRegs <= MaxNumRegsForArgsRet; |
65 | 0 | } |
66 | | |
67 | | /// Estimate number of registers the type will use when passed in registers. |
68 | 0 | unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const { |
69 | 0 | unsigned NumRegs = 0; |
70 | |
71 | 0 | if (const VectorType *VT = Ty->getAs<VectorType>()) { |
72 | | // Compute from the number of elements. The reported size is based on the |
73 | | // in-memory size, which includes the padding 4th element for 3-vectors. |
74 | 0 | QualType EltTy = VT->getElementType(); |
75 | 0 | unsigned EltSize = getContext().getTypeSize(EltTy); |
76 | | |
77 | | // 16-bit element vectors should be passed as packed. |
78 | 0 | if (EltSize == 16) |
79 | 0 | return (VT->getNumElements() + 1) / 2; |
80 | | |
81 | 0 | unsigned EltNumRegs = (EltSize + 31) / 32; |
82 | 0 | return EltNumRegs * VT->getNumElements(); |
83 | 0 | } |
84 | | |
85 | 0 | if (const RecordType *RT = Ty->getAs<RecordType>()) { |
86 | 0 | const RecordDecl *RD = RT->getDecl(); |
87 | 0 | assert(!RD->hasFlexibleArrayMember()); |
88 | | |
89 | 0 | for (const FieldDecl *Field : RD->fields()) { |
90 | 0 | QualType FieldTy = Field->getType(); |
91 | 0 | NumRegs += numRegsForType(FieldTy); |
92 | 0 | } |
93 | |
94 | 0 | return NumRegs; |
95 | 0 | } |
96 | | |
97 | 0 | return (getContext().getTypeSize(Ty) + 31) / 32; |
98 | 0 | } |
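// Worked example for the estimate above (assuming 32-bit int/float and
// OpenCL-style vector types): for
//   struct S { float4 v; short2 h; int i; };
// the float4 needs 4 registers (one per 32-bit element), the short2 packs
// into 1 register (16-bit elements are counted in pairs), and the int adds
// 1 more, so numRegsForType() reports 6 of the 16 available registers.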
99 | | |
100 | 0 | void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const { |
101 | 0 | llvm::CallingConv::ID CC = FI.getCallingConvention(); |
102 | |
103 | 0 | if (!getCXXABI().classifyReturnType(FI)) |
104 | 0 | FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); |
105 | |
106 | 0 | unsigned NumRegsLeft = MaxNumRegsForArgsRet; |
107 | 0 | for (auto &Arg : FI.arguments()) { |
108 | 0 | if (CC == llvm::CallingConv::AMDGPU_KERNEL) { |
109 | 0 | Arg.info = classifyKernelArgumentType(Arg.type); |
110 | 0 | } else { |
111 | 0 | Arg.info = classifyArgumentType(Arg.type, NumRegsLeft); |
112 | 0 | } |
113 | 0 | } |
114 | 0 | } |
115 | | |
116 | | Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, |
117 | 0 | QualType Ty) const { |
118 | 0 | llvm_unreachable("AMDGPU does not support varargs"); |
119 | 0 | } |
120 | | |
121 | 0 | ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const { |
122 | 0 | if (isAggregateTypeForABI(RetTy)) { |
123 | | // Records with non-trivial destructors/copy-constructors should not be |
124 | | // returned by value. |
125 | 0 | if (!getRecordArgABI(RetTy, getCXXABI())) { |
126 | | // Ignore empty structs/unions. |
127 | 0 | if (isEmptyRecord(getContext(), RetTy, true)) |
128 | 0 | return ABIArgInfo::getIgnore(); |
129 | | |
130 | | // Lower single-element structs to just return a regular value. |
131 | 0 | if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext())) |
132 | 0 | return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); |
133 | | |
134 | 0 | if (const RecordType *RT = RetTy->getAs<RecordType>()) { |
135 | 0 | const RecordDecl *RD = RT->getDecl(); |
136 | 0 | if (RD->hasFlexibleArrayMember()) |
137 | 0 | return DefaultABIInfo::classifyReturnType(RetTy); |
138 | 0 | } |
139 | | |
140 | | // Pack aggregates <= 8 bytes into single VGPR or pair. |
141 | 0 | uint64_t Size = getContext().getTypeSize(RetTy); |
142 | 0 | if (Size <= 16) |
143 | 0 | return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext())); |
144 | | |
145 | 0 | if (Size <= 32) |
146 | 0 | return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext())); |
147 | | |
148 | 0 | if (Size <= 64) { |
149 | 0 | llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext()); |
150 | 0 | return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2)); |
151 | 0 | } |
152 | | |
153 | 0 | if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet) |
154 | 0 | return ABIArgInfo::getDirect(); |
155 | 0 | } |
156 | 0 | } |
157 | | |
158 | | // Otherwise just do the default thing. |
159 | 0 | return DefaultABIInfo::classifyReturnType(RetTy); |
160 | 0 | } |
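// A sketch of the outcomes of the return classification above: an empty
// struct is ignored, `struct { float x; }` is returned directly as float, a
// 6-byte struct is widened to a [2 x i32] pair, and an aggregate needing more
// than 16 registers falls back to DefaultABIInfo (returned indirectly via
// sret).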
161 | | |
162 | | /// For kernels, all parameters are really passed in a special buffer. It doesn't |
163 | | /// make sense to pass anything byval, so everything must be direct. |
164 | 0 | ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const { |
165 | 0 | Ty = useFirstFieldIfTransparentUnion(Ty); |
166 | | |
167 | | // TODO: Can we omit empty structs? |
168 | |
169 | 0 | if (const Type *SeltTy = isSingleElementStruct(Ty, getContext())) |
170 | 0 | Ty = QualType(SeltTy, 0); |
171 | |
172 | 0 | llvm::Type *OrigLTy = CGT.ConvertType(Ty); |
173 | 0 | llvm::Type *LTy = OrigLTy; |
174 | 0 | if (getContext().getLangOpts().HIP) { |
175 | 0 | LTy = coerceKernelArgumentType( |
176 | 0 | OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default), |
177 | 0 | /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device)); |
178 | 0 | } |
179 | | |
180 | | // FIXME: Should also use this for OpenCL, but it requires addressing the |
181 | | // problem of kernels being called. |
182 | | // |
183 | | // FIXME: This doesn't apply the optimization of coercing pointers in structs |
184 | | // to global address space when using byref. This would require implementing a |
185 | | // new kind of coercion of the in-memory type for indirect arguments. |
186 | 0 | if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy && |
187 | 0 | isAggregateTypeForABI(Ty)) { |
188 | 0 | return ABIArgInfo::getIndirectAliased( |
189 | 0 | getContext().getTypeAlignInChars(Ty), |
190 | 0 | getContext().getTargetAddressSpace(LangAS::opencl_constant), |
191 | 0 | false /*Realign*/, nullptr /*Padding*/); |
192 | 0 | } |
193 | | |
194 | | // If we set CanBeFlattened to true, CodeGen will expand the struct to its |
195 | | // individual elements, which confuses the Clover OpenCL backend; therefore we |
196 | | // have to set it to false here. Other args of getDirect() are just defaults. |
197 | 0 | return ABIArgInfo::getDirect(LTy, 0, nullptr, false); |
198 | 0 | } |
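// Sketch of the resulting kernel ABI: for a HIP kernel, scalar and pointer
// parameters stay direct (with generic pointers coerced to global as above),
// while a plain aggregate parameter is passed byref in the constant address
// space, so in effect the kernel reads it straight out of the kernarg segment
// instead of receiving a byval copy.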
199 | | |
200 | | ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, |
201 | 0 | unsigned &NumRegsLeft) const { |
202 | 0 | assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow"); |
203 | | |
204 | 0 | Ty = useFirstFieldIfTransparentUnion(Ty); |
205 | |
206 | 0 | if (isAggregateTypeForABI(Ty)) { |
207 | | // Records with non-trivial destructors/copy-constructors should not be |
208 | | // passed by value. |
209 | 0 | if (auto RAA = getRecordArgABI(Ty, getCXXABI())) |
210 | 0 | return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory); |
211 | | |
212 | | // Ignore empty structs/unions. |
213 | 0 | if (isEmptyRecord(getContext(), Ty, true)) |
214 | 0 | return ABIArgInfo::getIgnore(); |
215 | | |
216 | | // Lower single-element structs to just pass a regular value. TODO: We |
217 | | // could do reasonable-size multiple-element structs too, using getExpand(), |
218 | | // though watch out for things like bitfields. |
219 | 0 | if (const Type *SeltTy = isSingleElementStruct(Ty, getContext())) |
220 | 0 | return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); |
221 | | |
222 | 0 | if (const RecordType *RT = Ty->getAs<RecordType>()) { |
223 | 0 | const RecordDecl *RD = RT->getDecl(); |
224 | 0 | if (RD->hasFlexibleArrayMember()) |
225 | 0 | return DefaultABIInfo::classifyArgumentType(Ty); |
226 | 0 | } |
227 | | |
228 | | // Pack aggregates <= 8 bytes into single VGPR or pair. |
229 | 0 | uint64_t Size = getContext().getTypeSize(Ty); |
230 | 0 | if (Size <= 64) { |
231 | 0 | unsigned NumRegs = (Size + 31) / 32; |
232 | 0 | NumRegsLeft -= std::min(NumRegsLeft, NumRegs); |
233 | |
234 | 0 | if (Size <= 16) |
235 | 0 | return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext())); |
236 | | |
237 | 0 | if (Size <= 32) |
238 | 0 | return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext())); |
239 | | |
240 | | // XXX: Should this be i64 instead, and should the limit increase? |
241 | 0 | llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext()); |
242 | 0 | return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2)); |
243 | 0 | } |
244 | | |
245 | 0 | if (NumRegsLeft > 0) { |
246 | 0 | unsigned NumRegs = numRegsForType(Ty); |
247 | 0 | if (NumRegsLeft >= NumRegs) { |
248 | 0 | NumRegsLeft -= NumRegs; |
249 | 0 | return ABIArgInfo::getDirect(); |
250 | 0 | } |
251 | 0 | } |
252 | | |
253 | | // Use pass-by-reference instead of pass-by-value for struct arguments in |
254 | | // the function ABI. |
255 | 0 | return ABIArgInfo::getIndirectAliased( |
256 | 0 | getContext().getTypeAlignInChars(Ty), |
257 | 0 | getContext().getTargetAddressSpace(LangAS::opencl_private)); |
258 | 0 | } |
259 | | |
260 | | // Otherwise just do the default thing. |
261 | 0 | ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty); |
262 | 0 | if (!ArgInfo.isIndirect()) { |
263 | 0 | unsigned NumRegs = numRegsForType(Ty); |
264 | 0 | NumRegsLeft -= std::min(NumRegs, NumRegsLeft); |
265 | 0 | } |
266 | |
267 | 0 | return ArgInfo; |
268 | 0 | } |
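// Worked example of the register accounting above: with all 16 registers
// free, a 12-byte struct of three ints is too big for the 64-bit fast path,
// needs 3 registers, and is passed direct, leaving NumRegsLeft at 13; if only
// 2 registers remained, the same struct would instead be passed indirectly
// (byref) in the private address space.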
269 | | |
270 | | class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo { |
271 | | public: |
272 | | AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT) |
273 | 0 | : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {} |
274 | | |
275 | | void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F, |
276 | | CodeGenModule &CGM) const; |
277 | | |
278 | | void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override; |
279 | | |
280 | | void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV, |
281 | | CodeGen::CodeGenModule &M) const override; |
282 | | unsigned getOpenCLKernelCallingConv() const override; |
283 | | |
284 | | llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM, |
285 | | llvm::PointerType *T, QualType QT) const override; |
286 | | |
287 | 0 | LangAS getASTAllocaAddressSpace() const override { |
288 | 0 | return getLangASFromTargetAS( |
289 | 0 | getABIInfo().getDataLayout().getAllocaAddrSpace()); |
290 | 0 | } |
291 | | LangAS getGlobalVarAddressSpace(CodeGenModule &CGM, |
292 | | const VarDecl *D) const override; |
293 | | llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts, |
294 | | SyncScope Scope, |
295 | | llvm::AtomicOrdering Ordering, |
296 | | llvm::LLVMContext &Ctx) const override; |
297 | | llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF, |
298 | | llvm::Function *BlockInvokeFunc, |
299 | | llvm::Type *BlockTy) const override; |
300 | | bool shouldEmitStaticExternCAliases() const override; |
301 | | bool shouldEmitDWARFBitFieldSeparators() const override; |
302 | | void setCUDAKernelCallingConvention(const FunctionType *&FT) const override; |
303 | | }; |
304 | | } |
305 | | |
306 | | static bool requiresAMDGPUProtectedVisibility(const Decl *D, |
307 | 0 | llvm::GlobalValue *GV) { |
308 | 0 | if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility) |
309 | 0 | return false; |
310 | | |
311 | 0 | return !D->hasAttr<OMPDeclareTargetDeclAttr>() && |
312 | 0 | (D->hasAttr<OpenCLKernelAttr>() || |
313 | 0 | (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) || |
314 | 0 | (isa<VarDecl>(D) && |
315 | 0 | (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() || |
316 | 0 | cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() || |
317 | 0 | cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()))); |
318 | 0 | } |
319 | | |
320 | | void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( |
321 | 0 | const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const { |
322 | 0 | const auto *ReqdWGS = |
323 | 0 | M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr; |
324 | 0 | const bool IsOpenCLKernel = |
325 | 0 | M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>(); |
326 | 0 | const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>(); |
327 | |
328 | 0 | const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>(); |
329 | 0 | if (ReqdWGS || FlatWGS) { |
330 | 0 | M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS); |
331 | 0 | } else if (IsOpenCLKernel || IsHIPKernel) { |
332 | | // By default, restrict the maximum size to a value specified by |
333 | | // --gpu-max-threads-per-block=n or its default value for HIP. |
334 | 0 | const unsigned OpenCLDefaultMaxWorkGroupSize = 256; |
335 | 0 | const unsigned DefaultMaxWorkGroupSize = |
336 | 0 | IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize |
337 | 0 | : M.getLangOpts().GPUMaxThreadsPerBlock; |
338 | 0 | std::string AttrVal = |
339 | 0 | std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize); |
340 | 0 | F->addFnAttr("amdgpu-flat-work-group-size", AttrVal); |
341 | 0 | } |
342 | |
343 | 0 | if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>()) |
344 | 0 | M.handleAMDGPUWavesPerEUAttr(F, Attr); |
345 | |
346 | 0 | if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) { |
347 | 0 | unsigned NumSGPR = Attr->getNumSGPR(); |
348 | |
349 | 0 | if (NumSGPR != 0) |
350 | 0 | F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR)); |
351 | 0 | } |
352 | |
353 | 0 | if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) { |
354 | 0 | uint32_t NumVGPR = Attr->getNumVGPR(); |
355 | |
356 | 0 | if (NumVGPR != 0) |
357 | 0 | F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR)); |
358 | 0 | } |
359 | 0 | } |
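// Example of the attributes this emits: an OpenCL kernel with no explicit
// work-group attributes gets "amdgpu-flat-work-group-size"="1,256", and a
// kernel additionally carrying __attribute__((amdgpu_num_vgpr(64))) also gets
// "amdgpu-num-vgpr"="64".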
360 | | |
361 | | /// Emits control constants used to change per-architecture behaviour in the |
362 | | /// AMDGPU ROCm device libraries. |
363 | | void AMDGPUTargetCodeGenInfo::emitTargetGlobals( |
364 | 0 | CodeGen::CodeGenModule &CGM) const { |
365 | 0 | StringRef Name = "__oclc_ABI_version"; |
366 | 0 | llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name); |
367 | 0 | if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage())) |
368 | 0 | return; |
369 | | |
370 | 0 | if (CGM.getTarget().getTargetOpts().CodeObjectVersion == |
371 | 0 | llvm::CodeObjectVersionKind::COV_None) |
372 | 0 | return; |
373 | | |
374 | 0 | auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32); |
375 | 0 | llvm::Constant *COV = llvm::ConstantInt::get( |
376 | 0 | Type, CGM.getTarget().getTargetOpts().CodeObjectVersion); |
377 | | |
378 | | // It needs to be constant weak_odr without externally_initialized so that |
379 | | // the load instruction can be eliminated by IPSCCP. |
380 | 0 | auto *GV = new llvm::GlobalVariable( |
381 | 0 | CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name, |
382 | 0 | nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal, |
383 | 0 | CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant)); |
384 | 0 | GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local); |
385 | 0 | GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility); |
386 | | |
387 | | // Replace any external references to this variable with the new global. |
388 | 0 | if (OriginalGV) { |
389 | 0 | OriginalGV->replaceAllUsesWith(GV); |
390 | 0 | GV->takeName(OriginalGV); |
391 | 0 | OriginalGV->eraseFromParent(); |
392 | 0 | } |
393 | 0 | } |
394 | | |
395 | | void AMDGPUTargetCodeGenInfo::setTargetAttributes( |
396 | 0 | const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { |
397 | 0 | if (requiresAMDGPUProtectedVisibility(D, GV)) { |
398 | 0 | GV->setVisibility(llvm::GlobalValue::ProtectedVisibility); |
399 | 0 | GV->setDSOLocal(true); |
400 | 0 | } |
401 | |
402 | 0 | if (GV->isDeclaration()) |
403 | 0 | return; |
404 | | |
405 | 0 | llvm::Function *F = dyn_cast<llvm::Function>(GV); |
406 | 0 | if (!F) |
407 | 0 | return; |
408 | | |
409 | 0 | const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D); |
410 | 0 | if (FD) |
411 | 0 | setFunctionDeclAttributes(FD, F, M); |
412 | |
413 | 0 | if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics()) |
414 | 0 | F->addFnAttr("amdgpu-unsafe-fp-atomics", "true"); |
415 | |
416 | 0 | if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts) |
417 | 0 | F->addFnAttr("amdgpu-ieee", "false"); |
418 | 0 | } |
419 | | |
420 | 0 | unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const { |
421 | 0 | return llvm::CallingConv::AMDGPU_KERNEL; |
422 | 0 | } |
423 | | |
424 | | // Currently LLVM assumes null pointers always have value 0, |
425 | | // which results in incorrectly transformed IR. Therefore, instead of |
426 | | // emitting null pointers in private and local address spaces, a null |
427 | | // pointer in generic address space is emitted and cast to a |
428 | | // pointer in local or private address space. |
429 | | llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer( |
430 | | const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT, |
431 | 0 | QualType QT) const { |
432 | 0 | if (CGM.getContext().getTargetNullPointerValue(QT) == 0) |
433 | 0 | return llvm::ConstantPointerNull::get(PT); |
434 | | |
435 | 0 | auto &Ctx = CGM.getContext(); |
436 | 0 | auto NPT = llvm::PointerType::get( |
437 | 0 | PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic)); |
438 | 0 | return llvm::ConstantExpr::getAddrSpaceCast( |
439 | 0 | llvm::ConstantPointerNull::get(NPT), PT); |
440 | 0 | } |
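// Sketch of the result (assuming the usual AMDGPU mapping: generic = 0,
// private = 5): since the target's private-address-space null is not the
// all-zero pattern, a null `private int *` is emitted as
//   addrspacecast (ptr null to ptr addrspace(5))
// rather than as a literal addrspace(5) null constant.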
441 | | |
442 | | LangAS |
443 | | AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, |
444 | 0 | const VarDecl *D) const { |
445 | 0 | assert(!CGM.getLangOpts().OpenCL && |
446 | 0 | !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) && |
447 | 0 | "Address space agnostic languages only"); |
448 | 0 | LangAS DefaultGlobalAS = getLangASFromTargetAS( |
449 | 0 | CGM.getContext().getTargetAddressSpace(LangAS::opencl_global)); |
450 | 0 | if (!D) |
451 | 0 | return DefaultGlobalAS; |
452 | | |
453 | 0 | LangAS AddrSpace = D->getType().getAddressSpace(); |
454 | 0 | if (AddrSpace != LangAS::Default) |
455 | 0 | return AddrSpace; |
456 | | |
457 | | // Only promote to address space 4 if VarDecl has constant initialization. |
458 | 0 | if (D->getType().isConstantStorage(CGM.getContext(), false, false) && |
459 | 0 | D->hasConstantInitialization()) { |
460 | 0 | if (auto ConstAS = CGM.getTarget().getConstantAddressSpace()) |
461 | 0 | return *ConstAS; |
462 | 0 | } |
463 | 0 | return DefaultGlobalAS; |
464 | 0 | } |
465 | | |
466 | | llvm::SyncScope::ID |
467 | | AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts, |
468 | | SyncScope Scope, |
469 | | llvm::AtomicOrdering Ordering, |
470 | 0 | llvm::LLVMContext &Ctx) const { |
471 | 0 | std::string Name; |
472 | 0 | switch (Scope) { |
473 | 0 | case SyncScope::HIPSingleThread: |
474 | 0 | case SyncScope::SingleScope: |
475 | 0 | Name = "singlethread"; |
476 | 0 | break; |
477 | 0 | case SyncScope::HIPWavefront: |
478 | 0 | case SyncScope::OpenCLSubGroup: |
479 | 0 | case SyncScope::WavefrontScope: |
480 | 0 | Name = "wavefront"; |
481 | 0 | break; |
482 | 0 | case SyncScope::HIPWorkgroup: |
483 | 0 | case SyncScope::OpenCLWorkGroup: |
484 | 0 | case SyncScope::WorkgroupScope: |
485 | 0 | Name = "workgroup"; |
486 | 0 | break; |
487 | 0 | case SyncScope::HIPAgent: |
488 | 0 | case SyncScope::OpenCLDevice: |
489 | 0 | case SyncScope::DeviceScope: |
490 | 0 | Name = "agent"; |
491 | 0 | break; |
492 | 0 | case SyncScope::SystemScope: |
493 | 0 | case SyncScope::HIPSystem: |
494 | 0 | case SyncScope::OpenCLAllSVMDevices: |
495 | 0 | Name = ""; |
496 | 0 | break; |
497 | 0 | } |
498 | | |
499 | 0 | if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) { |
500 | 0 | if (!Name.empty()) |
501 | 0 | Name = Twine(Twine(Name) + Twine("-")).str(); |
502 | |
503 | 0 | Name = Twine(Twine(Name) + Twine("one-as")).str(); |
504 | 0 | } |
505 | |
506 | 0 | return Ctx.getOrInsertSyncScopeID(Name); |
507 | 0 | } |
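// Examples of the scope strings produced above: HIPWorkgroup with seq_cst
// ordering maps to "workgroup", the same scope with acquire/release ordering
// maps to "workgroup-one-as", and OpenCLAllSVMDevices with a non-seq_cst
// ordering maps to plain "one-as" (the system scope has no base name).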
508 | | |
509 | 0 | bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const { |
510 | 0 | return false; |
511 | 0 | } |
512 | | |
513 | 0 | bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const { |
514 | 0 | return true; |
515 | 0 | } |
516 | | |
517 | | void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention( |
518 | 0 | const FunctionType *&FT) const { |
519 | 0 | FT = getABIInfo().getContext().adjustFunctionType( |
520 | 0 | FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel)); |
521 | 0 | } |
522 | | |
523 | | /// Create an OpenCL kernel for an enqueued block. |
524 | | /// |
525 | | /// The type of the first argument (the block literal) is the struct type |
526 | | /// of the block literal instead of a pointer type. The first argument |
527 | | /// (block literal) is passed directly by value to the kernel. The kernel |
528 | | /// allocates the same type of struct on stack and stores the block literal |
529 | | /// to it and passes its pointer to the block invoke function. The kernel |
530 | | /// has "enqueued-block" function attribute and kernel argument metadata. |
531 | | llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel( |
532 | 0 | CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const { |
533 | 0 | auto &Builder = CGF.Builder; |
534 | 0 | auto &C = CGF.getLLVMContext(); |
535 | |
536 | 0 | auto *InvokeFT = Invoke->getFunctionType(); |
537 | 0 | llvm::SmallVector<llvm::Type *, 2> ArgTys; |
538 | 0 | llvm::SmallVector<llvm::Metadata *, 8> AddressQuals; |
539 | 0 | llvm::SmallVector<llvm::Metadata *, 8> AccessQuals; |
540 | 0 | llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames; |
541 | 0 | llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames; |
542 | 0 | llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals; |
543 | 0 | llvm::SmallVector<llvm::Metadata *, 8> ArgNames; |
544 | |
545 | 0 | ArgTys.push_back(BlockTy); |
546 | 0 | ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal")); |
547 | 0 | AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0))); |
548 | 0 | ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal")); |
549 | 0 | ArgTypeQuals.push_back(llvm::MDString::get(C, "")); |
550 | 0 | AccessQuals.push_back(llvm::MDString::get(C, "none")); |
551 | 0 | ArgNames.push_back(llvm::MDString::get(C, "block_literal")); |
552 | 0 | for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) { |
553 | 0 | ArgTys.push_back(InvokeFT->getParamType(I)); |
554 | 0 | ArgTypeNames.push_back(llvm::MDString::get(C, "void*")); |
555 | 0 | AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3))); |
556 | 0 | AccessQuals.push_back(llvm::MDString::get(C, "none")); |
557 | 0 | ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*")); |
558 | 0 | ArgTypeQuals.push_back(llvm::MDString::get(C, "")); |
559 | 0 | ArgNames.push_back( |
560 | 0 | llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str())); |
561 | 0 | } |
562 | 0 | std::string Name = Invoke->getName().str() + "_kernel"; |
563 | 0 | auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false); |
564 | 0 | auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name, |
565 | 0 | &CGF.CGM.getModule()); |
566 | 0 | F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); |
567 | |
568 | 0 | llvm::AttrBuilder KernelAttrs(C); |
569 | | // FIXME: The invoke isn't applying the right attributes either |
570 | | // FIXME: This is missing setTargetAttributes |
571 | 0 | CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs); |
572 | 0 | KernelAttrs.addAttribute("enqueued-block"); |
573 | 0 | F->addFnAttrs(KernelAttrs); |
574 | |
575 | 0 | auto IP = CGF.Builder.saveIP(); |
576 | 0 | auto *BB = llvm::BasicBlock::Create(C, "entry", F); |
577 | 0 | Builder.SetInsertPoint(BB); |
578 | 0 | const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy); |
579 | 0 | auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr); |
580 | 0 | BlockPtr->setAlignment(BlockAlign); |
581 | 0 | Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign); |
582 | 0 | auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0)); |
583 | 0 | llvm::SmallVector<llvm::Value *, 2> Args; |
584 | 0 | Args.push_back(Cast); |
585 | 0 | for (llvm::Argument &A : llvm::drop_begin(F->args())) |
586 | 0 | Args.push_back(&A); |
587 | 0 | llvm::CallInst *call = Builder.CreateCall(Invoke, Args); |
588 | 0 | call->setCallingConv(Invoke->getCallingConv()); |
589 | 0 | Builder.CreateRetVoid(); |
590 | 0 | Builder.restoreIP(IP); |
591 | |
592 | 0 | F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals)); |
593 | 0 | F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals)); |
594 | 0 | F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames)); |
595 | 0 | F->setMetadata("kernel_arg_base_type", |
596 | 0 | llvm::MDNode::get(C, ArgBaseTypeNames)); |
597 | 0 | F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals)); |
598 | 0 | if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata) |
599 | 0 | F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames)); |
600 | |
601 | 0 | return F; |
602 | 0 | } |
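// Sketch of the generated wrapper (names hypothetical): for an invoke
// function `void @__foo_block_invoke(ptr %block, ptr addrspace(3) %tmp)`,
// this builds an `amdgpu_kernel void @__foo_block_invoke_kernel(%block.literal
// %b, ptr addrspace(3) %tmp)` that allocas the block literal, stores %b into
// it, and calls the invoke function with that alloca plus the remaining
// arguments.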
603 | | |
604 | | void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr( |
605 | | llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS, |
606 | | const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal, |
607 | 0 | int32_t *MaxThreadsVal) { |
608 | 0 | unsigned Min = 0; |
609 | 0 | unsigned Max = 0; |
610 | 0 | if (FlatWGS) { |
611 | 0 | Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue(); |
612 | 0 | Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue(); |
613 | 0 | } |
614 | 0 | if (ReqdWGS && Min == 0 && Max == 0) |
615 | 0 | Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim(); |
616 | |
617 | 0 | if (Min != 0) { |
618 | 0 | assert(Min <= Max && "Min must be less than or equal Max"); |
619 | | |
620 | 0 | if (MinThreadsVal) |
621 | 0 | *MinThreadsVal = Min; |
622 | 0 | if (MaxThreadsVal) |
623 | 0 | *MaxThreadsVal = Max; |
624 | 0 | std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max); |
625 | 0 | if (F) |
626 | 0 | F->addFnAttr("amdgpu-flat-work-group-size", AttrVal); |
627 | 0 | } else |
628 | 0 | assert(Max == 0 && "Max must be zero"); |
629 | 0 | } |
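// Example: __attribute__((amdgpu_flat_work_group_size(64, 256))) yields the
// IR attribute "amdgpu-flat-work-group-size"="64,256", while a bare
// reqd_work_group_size(8, 8, 4) pins both bounds to the product, "256,256".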
630 | | |
631 | | void CodeGenModule::handleAMDGPUWavesPerEUAttr( |
632 | 0 | llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) { |
633 | 0 | unsigned Min = |
634 | 0 | Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue(); |
635 | 0 | unsigned Max = |
636 | 0 | Attr->getMax() |
637 | 0 | ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue() |
638 | 0 | : 0; |
639 | |
640 | 0 | if (Min != 0) { |
641 | 0 | assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max"); |
642 | | |
643 | 0 | std::string AttrVal = llvm::utostr(Min); |
644 | 0 | if (Max != 0) |
645 | 0 | AttrVal = AttrVal + "," + llvm::utostr(Max); |
646 | 0 | F->addFnAttr("amdgpu-waves-per-eu", AttrVal); |
647 | 0 | } else |
648 | 0 | assert(Max == 0 && "Max must be zero"); |
649 | 0 | } |
650 | | |
651 | | std::unique_ptr<TargetCodeGenInfo> |
652 | 0 | CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) { |
653 | 0 | return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes()); |
654 | 0 | } |