/src/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
Line | Count | Source |
1 | | //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This provides a class for CUDA code generation targeting the NVIDIA CUDA |
10 | | // runtime library. |
11 | | // |
12 | | //===----------------------------------------------------------------------===// |
13 | | |
14 | | #include "CGCUDARuntime.h" |
15 | | #include "CGCXXABI.h" |
16 | | #include "CodeGenFunction.h" |
17 | | #include "CodeGenModule.h" |
18 | | #include "clang/AST/Decl.h" |
19 | | #include "clang/Basic/Cuda.h" |
20 | | #include "clang/CodeGen/CodeGenABITypes.h" |
21 | | #include "clang/CodeGen/ConstantInitBuilder.h" |
22 | | #include "llvm/Frontend/Offloading/Utility.h" |
23 | | #include "llvm/IR/BasicBlock.h" |
24 | | #include "llvm/IR/Constants.h" |
25 | | #include "llvm/IR/DerivedTypes.h" |
26 | | #include "llvm/IR/ReplaceConstant.h" |
27 | | #include "llvm/Support/Format.h" |
28 | | #include "llvm/Support/VirtualFileSystem.h" |
29 | | |
30 | | using namespace clang; |
31 | | using namespace CodeGen; |
32 | | |
33 | | namespace { |
34 | | constexpr unsigned CudaFatMagic = 0x466243b1; |
35 | | constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" |
36 | | |
37 | | class CGNVCUDARuntime : public CGCUDARuntime { |
38 | | |
39 | | private: |
40 | | llvm::IntegerType *IntTy, *SizeTy; |
41 | | llvm::Type *VoidTy; |
42 | | llvm::PointerType *PtrTy; |
43 | | |
44 | | /// Convenience reference to LLVM Context |
45 | | llvm::LLVMContext &Context; |
46 | | /// Convenience reference to the current module |
47 | | llvm::Module &TheModule; |
48 | | /// Keeps track of kernel launch stubs and handles emitted in this module |
49 | | struct KernelInfo { |
50 | | llvm::Function *Kernel; // stub function to help launch kernel |
51 | | const Decl *D; |
52 | | }; |
53 | | llvm::SmallVector<KernelInfo, 16> EmittedKernels; |
54 | | // Map a kernel's mangled name to a symbol identifying the kernel in host code. |
55 | | // For CUDA, the symbol for identifying the kernel is the same as the device |
56 | | // stub function. For HIP, they are different. |
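| | // E.g. (illustrative, names assumed): for a HIP kernel `__global__ void k()`, |
| | // the launch stub is emitted under a stub name such as `__device_stub__k`, |
| | // while a separate constant global named `k`, initialized with the stub's |
| | // address, serves as the handle that host code takes the address of. |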
57 | | llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles; |
58 | | // Map a kernel handle to the kernel stub. |
59 | | llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs; |
60 | | struct VarInfo { |
61 | | llvm::GlobalVariable *Var; |
62 | | const VarDecl *D; |
63 | | DeviceVarFlags Flags; |
64 | | }; |
65 | | llvm::SmallVector<VarInfo, 16> DeviceVars; |
66 | | /// Keeps track of the variable containing the handle of the GPU binary. |
67 | | /// Populated by makeModuleCtorFunction() and used to create corresponding |
68 | | /// cleanup calls in makeModuleDtorFunction(). |
69 | | llvm::GlobalVariable *GpuBinaryHandle = nullptr; |
70 | | /// Whether we generate relocatable device code. |
71 | | bool RelocatableDeviceCode; |
72 | | /// Mangle context for device. |
73 | | std::unique_ptr<MangleContext> DeviceMC; |
74 | | /// Some zeros used for GEPs. |
75 | | llvm::Constant *Zeros[2]; |
76 | | |
77 | | llvm::FunctionCallee getSetupArgumentFn() const; |
78 | | llvm::FunctionCallee getLaunchFn() const; |
79 | | |
80 | | llvm::FunctionType *getRegisterGlobalsFnTy() const; |
81 | | llvm::FunctionType *getCallbackFnTy() const; |
82 | | llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; |
83 | | std::string addPrefixToName(StringRef FuncName) const; |
84 | | std::string addUnderscoredPrefixToName(StringRef FuncName) const; |
85 | | |
86 | | /// Creates a function to register all kernel stubs generated in this module. |
87 | | llvm::Function *makeRegisterGlobalsFn(); |
88 | | |
89 | | /// Helper function that generates a constant string and returns a pointer to |
90 | | /// the start of the string. The result of this function can be used anywhere |
91 | | /// that the C code expects a const char *. |
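| | /// (Used below by makeRegisterGlobalsFn() to build the kernel and variable |
| | /// name strings passed to the __cudaRegister* calls.) |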
92 | | llvm::Constant *makeConstantString(const std::string &Str, |
93 | 0 | const std::string &Name = "") { |
94 | 0 | auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); |
95 | 0 | return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), |
96 | 0 | ConstStr.getPointer(), Zeros); |
97 | 0 | } |
98 | | |
99 | | /// Helper function which generates an initialized constant array from Str, |
100 | | /// and optionally sets section name and alignment. AddNull specifies whether |
101 | | /// the array should have NUL termination. |
102 | | llvm::Constant *makeConstantArray(StringRef Str, |
103 | | StringRef Name = "", |
104 | | StringRef SectionName = "", |
105 | | unsigned Alignment = 0, |
106 | 0 | bool AddNull = false) { |
107 | 0 | llvm::Constant *Value = |
108 | 0 | llvm::ConstantDataArray::getString(Context, Str, AddNull); |
109 | 0 | auto *GV = new llvm::GlobalVariable( |
110 | 0 | TheModule, Value->getType(), /*isConstant=*/true, |
111 | 0 | llvm::GlobalValue::PrivateLinkage, Value, Name); |
112 | 0 | if (!SectionName.empty()) { |
113 | 0 | GV->setSection(SectionName); |
114 | | // Mark the address as used, which makes sure that this section isn't |
115 | | // merged and that we will really have it in the object file. |
116 | 0 | GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); |
117 | 0 | } |
118 | 0 | if (Alignment) |
119 | 0 | GV->setAlignment(llvm::Align(Alignment)); |
120 | 0 | return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros); |
121 | 0 | } |
122 | | |
123 | | /// Helper function that generates an empty dummy function returning void. |
124 | 0 | llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { |
125 | 0 | assert(FnTy->getReturnType()->isVoidTy() && |
126 | 0 | "Can only generate dummy functions returning void!"); |
127 | 0 | llvm::Function *DummyFunc = llvm::Function::Create( |
128 | 0 | FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); |
129 | |
130 | 0 | llvm::BasicBlock *DummyBlock = |
131 | 0 | llvm::BasicBlock::Create(Context, "", DummyFunc); |
132 | 0 | CGBuilderTy FuncBuilder(CGM, Context); |
133 | 0 | FuncBuilder.SetInsertPoint(DummyBlock); |
134 | 0 | FuncBuilder.CreateRetVoid(); |
135 | |
136 | 0 | return DummyFunc; |
137 | 0 | } |
138 | | |
139 | | void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); |
140 | | void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); |
141 | | std::string getDeviceSideName(const NamedDecl *ND) override; |
142 | | |
143 | | void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, |
144 | 0 | bool Extern, bool Constant) { |
145 | 0 | DeviceVars.push_back({&Var, |
146 | 0 | VD, |
147 | 0 | {DeviceVarFlags::Variable, Extern, Constant, |
148 | 0 | VD->hasAttr<HIPManagedAttr>(), |
149 | 0 | /*Normalized*/ false, 0}}); |
150 | 0 | } |
151 | | void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var, |
152 | 0 | bool Extern, int Type) { |
153 | 0 | DeviceVars.push_back({&Var, |
154 | 0 | VD, |
155 | 0 | {DeviceVarFlags::Surface, Extern, /*Constant*/ false, |
156 | 0 | /*Managed*/ false, |
157 | 0 | /*Normalized*/ false, Type}}); |
158 | 0 | } |
159 | | void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var, |
160 | 0 | bool Extern, int Type, bool Normalized) { |
161 | 0 | DeviceVars.push_back({&Var, |
162 | 0 | VD, |
163 | 0 | {DeviceVarFlags::Texture, Extern, /*Constant*/ false, |
164 | 0 | /*Managed*/ false, Normalized, Type}}); |
165 | 0 | } |
166 | | |
167 | | /// Creates module constructor function |
168 | | llvm::Function *makeModuleCtorFunction(); |
169 | | /// Creates module destructor function |
170 | | llvm::Function *makeModuleDtorFunction(); |
171 | | /// Transform managed variables for device compilation. |
172 | | void transformManagedVars(); |
173 | | /// Create offloading entries to register globals in RDC mode. |
174 | | void createOffloadingEntries(); |
175 | | |
176 | | public: |
177 | | CGNVCUDARuntime(CodeGenModule &CGM); |
178 | | |
179 | | llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override; |
180 | 0 | llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override { |
181 | 0 | auto Loc = KernelStubs.find(Handle); |
182 | 0 | assert(Loc != KernelStubs.end()); |
183 | 0 | return Loc->second; |
184 | 0 | } |
185 | | void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; |
186 | | void handleVarRegistration(const VarDecl *VD, |
187 | | llvm::GlobalVariable &Var) override; |
188 | | void |
189 | | internalizeDeviceSideVar(const VarDecl *D, |
190 | | llvm::GlobalValue::LinkageTypes &Linkage) override; |
191 | | |
192 | | llvm::Function *finalizeModule() override; |
193 | | }; |
194 | | |
195 | | } // end anonymous namespace |
196 | | |
197 | 0 | std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { |
198 | 0 | if (CGM.getLangOpts().HIP) |
199 | 0 | return ((Twine("hip") + Twine(FuncName)).str()); |
200 | 0 | return ((Twine("cuda") + Twine(FuncName)).str()); |
201 | 0 | } |
202 | | std::string |
203 | 0 | CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { |
204 | 0 | if (CGM.getLangOpts().HIP) |
205 | 0 | return ((Twine("__hip") + Twine(FuncName)).str()); |
206 | 0 | return ((Twine("__cuda") + Twine(FuncName)).str()); |
207 | 0 | } |
208 | | |
209 | 0 | static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) { |
210 | | // If the host and device have different C++ ABIs, mark this as the device |
211 | | // mangle context so that mangling retrieves the additional device lambda |
212 | | // mangling number instead of the regular host one. |
213 | 0 | if (CGM.getContext().getAuxTargetInfo() && |
214 | 0 | CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() && |
215 | 0 | CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) { |
216 | 0 | return std::unique_ptr<MangleContext>( |
217 | 0 | CGM.getContext().createDeviceMangleContext( |
218 | 0 | *CGM.getContext().getAuxTargetInfo())); |
219 | 0 | } |
220 | | |
221 | 0 | return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext( |
222 | 0 | CGM.getContext().getAuxTargetInfo())); |
223 | 0 | } |
224 | | |
225 | | CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) |
226 | | : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), |
227 | | TheModule(CGM.getModule()), |
228 | | RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), |
229 | 0 | DeviceMC(InitDeviceMC(CGM)) { |
230 | 0 | IntTy = CGM.IntTy; |
231 | 0 | SizeTy = CGM.SizeTy; |
232 | 0 | VoidTy = CGM.VoidTy; |
233 | 0 | Zeros[0] = llvm::ConstantInt::get(SizeTy, 0); |
234 | 0 | Zeros[1] = Zeros[0]; |
235 | 0 | PtrTy = CGM.UnqualPtrTy; |
236 | 0 | } |
237 | | |
238 | 0 | llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { |
239 | | // cudaError_t cudaSetupArgument(void *, size_t, size_t) |
240 | 0 | llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy}; |
241 | 0 | return CGM.CreateRuntimeFunction( |
242 | 0 | llvm::FunctionType::get(IntTy, Params, false), |
243 | 0 | addPrefixToName("SetupArgument")); |
244 | 0 | } |
245 | | |
246 | 0 | llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { |
247 | 0 | if (CGM.getLangOpts().HIP) { |
248 | | // hipError_t hipLaunchByPtr(char *); |
249 | 0 | return CGM.CreateRuntimeFunction( |
250 | 0 | llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr"); |
251 | 0 | } |
252 | | // cudaError_t cudaLaunch(char *); |
253 | 0 | return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false), |
254 | 0 | "cudaLaunch"); |
255 | 0 | } |
256 | | |
257 | 0 | llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { |
258 | 0 | return llvm::FunctionType::get(VoidTy, PtrTy, false); |
259 | 0 | } |
260 | | |
261 | 0 | llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { |
262 | 0 | return llvm::FunctionType::get(VoidTy, PtrTy, false); |
263 | 0 | } |
264 | | |
265 | 0 | llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { |
266 | 0 | llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy, |
267 | 0 | llvm::PointerType::getUnqual(Context)}; |
268 | 0 | return llvm::FunctionType::get(VoidTy, Params, false); |
269 | 0 | } |
270 | | |
271 | 0 | std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) { |
272 | 0 | GlobalDecl GD; |
273 | | // ND could be either a kernel or a variable. |
274 | 0 | if (auto *FD = dyn_cast<FunctionDecl>(ND)) |
275 | 0 | GD = GlobalDecl(FD, KernelReferenceKind::Kernel); |
276 | 0 | else |
277 | 0 | GD = GlobalDecl(ND); |
278 | 0 | std::string DeviceSideName; |
279 | 0 | MangleContext *MC; |
280 | 0 | if (CGM.getLangOpts().CUDAIsDevice) |
281 | 0 | MC = &CGM.getCXXABI().getMangleContext(); |
282 | 0 | else |
283 | 0 | MC = DeviceMC.get(); |
284 | 0 | if (MC->shouldMangleDeclName(ND)) { |
285 | 0 | SmallString<256> Buffer; |
286 | 0 | llvm::raw_svector_ostream Out(Buffer); |
287 | 0 | MC->mangleName(GD, Out); |
288 | 0 | DeviceSideName = std::string(Out.str()); |
289 | 0 | } else |
290 | 0 | DeviceSideName = std::string(ND->getIdentifier()->getName()); |
291 | | |
292 | | // Make unique name for device side static file-scope variable for HIP. |
293 | 0 | if (CGM.getContext().shouldExternalize(ND) && |
294 | 0 | CGM.getLangOpts().GPURelocatableDeviceCode) { |
295 | 0 | SmallString<256> Buffer; |
296 | 0 | llvm::raw_svector_ostream Out(Buffer); |
297 | 0 | Out << DeviceSideName; |
298 | 0 | CGM.printPostfixForExternalizedDecl(Out, ND); |
299 | 0 | DeviceSideName = std::string(Out.str()); |
300 | 0 | } |
301 | 0 | return DeviceSideName; |
302 | 0 | } |
303 | | |
304 | | void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, |
305 | 0 | FunctionArgList &Args) { |
306 | 0 | EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); |
307 | 0 | if (auto *GV = |
308 | 0 | dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) { |
309 | 0 | GV->setLinkage(CGF.CurFn->getLinkage()); |
310 | 0 | GV->setInitializer(CGF.CurFn); |
311 | 0 | } |
312 | 0 | if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), |
313 | 0 | CudaFeature::CUDA_USES_NEW_LAUNCH) || |
314 | 0 | (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) |
315 | 0 | emitDeviceStubBodyNew(CGF, Args); |
316 | 0 | else |
317 | 0 | emitDeviceStubBodyLegacy(CGF, Args); |
318 | 0 | } |
319 | | |
320 | | // CUDA 9.0+ uses a new way to launch kernels: parameters are packed in a |
321 | | // local array and kernels are launched using cudaLaunchKernel(). |
322 | | void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, |
323 | 0 | FunctionArgList &Args) { |
324 | | // Build the shadow stack entry at the very start of the function. |
325 | | |
326 | | // Calculate the amount of space we will need for all arguments. If we have |
327 | | // no args, allocate a single pointer so we still have a valid pointer to the |
328 | | // argument array that we can pass to the runtime, even if it will be unused. |
329 | 0 | Address KernelArgs = CGF.CreateTempAlloca( |
330 | 0 | PtrTy, CharUnits::fromQuantity(16), "kernel_args", |
331 | 0 | llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); |
332 | | // Store pointers to the arguments in the locally allocated kernel_args. |
333 | 0 | for (unsigned i = 0; i < Args.size(); ++i) { |
334 | 0 | llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); |
335 | 0 | llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy); |
336 | 0 | CGF.Builder.CreateDefaultAlignedStore( |
337 | 0 | VoidVarPtr, |
338 | 0 | CGF.Builder.CreateConstGEP1_32(PtrTy, KernelArgs.getPointer(), i)); |
339 | 0 | } |
340 | |
341 | 0 | llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); |
342 | | |
343 | | // Lookup cudaLaunchKernel/hipLaunchKernel function. |
344 | | // The HIP kernel launching API name depends on the -fgpu-default-stream |
345 | | // option. For the default value 'legacy', it is hipLaunchKernel. For |
346 | | // 'per-thread', it is hipLaunchKernel_spt. |
347 | | // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, |
348 | | // void **args, size_t sharedMem, |
349 | | // cudaStream_t stream); |
350 | | // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim, |
351 | | // dim3 blockDim, void **args, |
352 | | // size_t sharedMem, hipStream_t stream); |
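| | // In C terms, the stub body emitted here is roughly the following sketch |
| | // (illustrative; `kernel_handle` stands for the kernel's handle symbol): |
| | //   void *kernel_args[] = {&arg0, ..., &argN}; |
| | //   dim3 grid_dim, block_dim; size_t shmem_size; cudaStream_t stream; |
| | //   __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream); |
| | //   cudaLaunchKernel((const void *)kernel_handle, grid_dim, block_dim, |
| | //                    kernel_args, shmem_size, stream); |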
353 | 0 | TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); |
354 | 0 | DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); |
355 | 0 | std::string KernelLaunchAPI = "LaunchKernel"; |
356 | 0 | if (CGF.getLangOpts().GPUDefaultStream == |
357 | 0 | LangOptions::GPUDefaultStreamKind::PerThread) { |
358 | 0 | if (CGF.getLangOpts().HIP) |
359 | 0 | KernelLaunchAPI = KernelLaunchAPI + "_spt"; |
360 | 0 | else if (CGF.getLangOpts().CUDA) |
361 | 0 | KernelLaunchAPI = KernelLaunchAPI + "_ptsz"; |
362 | 0 | } |
363 | 0 | auto LaunchKernelName = addPrefixToName(KernelLaunchAPI); |
364 | 0 | IdentifierInfo &cudaLaunchKernelII = |
365 | 0 | CGM.getContext().Idents.get(LaunchKernelName); |
366 | 0 | FunctionDecl *cudaLaunchKernelFD = nullptr; |
367 | 0 | for (auto *Result : DC->lookup(&cudaLaunchKernelII)) { |
368 | 0 | if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) |
369 | 0 | cudaLaunchKernelFD = FD; |
370 | 0 | } |
371 | |
372 | 0 | if (cudaLaunchKernelFD == nullptr) { |
373 | 0 | CGM.Error(CGF.CurFuncDecl->getLocation(), |
374 | 0 | "Can't find declaration for " + LaunchKernelName); |
375 | 0 | return; |
376 | 0 | } |
377 | | // Create temporary dim3 grid_dim, block_dim. |
378 | 0 | ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); |
379 | 0 | QualType Dim3Ty = GridDimParam->getType(); |
380 | 0 | Address GridDim = |
381 | 0 | CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); |
382 | 0 | Address BlockDim = |
383 | 0 | CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); |
384 | 0 | Address ShmemSize = |
385 | 0 | CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); |
386 | 0 | Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream"); |
387 | 0 | llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( |
388 | 0 | llvm::FunctionType::get(IntTy, |
389 | 0 | {/*gridDim=*/GridDim.getType(), |
390 | 0 | /*blockDim=*/BlockDim.getType(), |
391 | 0 | /*ShmemSize=*/ShmemSize.getType(), |
392 | 0 | /*Stream=*/Stream.getType()}, |
393 | 0 | /*isVarArg=*/false), |
394 | 0 | addUnderscoredPrefixToName("PopCallConfiguration")); |
395 | |
396 | 0 | CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, |
397 | 0 | {GridDim.getPointer(), BlockDim.getPointer(), |
398 | 0 | ShmemSize.getPointer(), Stream.getPointer()}); |
399 | | |
400 | | // Emit the call to cudaLaunchKernel/hipLaunchKernel |
401 | 0 | llvm::Value *Kernel = |
402 | 0 | CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy); |
403 | 0 | CallArgList LaunchKernelArgs; |
404 | 0 | LaunchKernelArgs.add(RValue::get(Kernel), |
405 | 0 | cudaLaunchKernelFD->getParamDecl(0)->getType()); |
406 | 0 | LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); |
407 | 0 | LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); |
408 | 0 | LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), |
409 | 0 | cudaLaunchKernelFD->getParamDecl(3)->getType()); |
410 | 0 | LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), |
411 | 0 | cudaLaunchKernelFD->getParamDecl(4)->getType()); |
412 | 0 | LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), |
413 | 0 | cudaLaunchKernelFD->getParamDecl(5)->getType()); |
414 | |
415 | 0 | QualType QT = cudaLaunchKernelFD->getType(); |
416 | 0 | QualType CQT = QT.getCanonicalType(); |
417 | 0 | llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); |
418 | 0 | llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty); |
419 | |
420 | 0 | const CGFunctionInfo &FI = |
421 | 0 | CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); |
422 | 0 | llvm::FunctionCallee cudaLaunchKernelFn = |
423 | 0 | CGM.CreateRuntimeFunction(FTy, LaunchKernelName); |
424 | 0 | CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), |
425 | 0 | LaunchKernelArgs); |
426 | 0 | CGF.EmitBranch(EndBlock); |
427 | |
428 | 0 | CGF.EmitBlock(EndBlock); |
429 | 0 | } |
430 | | |
431 | | void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, |
432 | 0 | FunctionArgList &Args) { |
433 | | // Emit a call to cudaSetupArgument for each arg in Args. |
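| | // In C terms, the emitted body is roughly the following sketch |
| | // (illustrative; `kernel_handle` stands for the kernel's handle symbol): |
| | //   if (cudaSetupArgument(&arg0, sizeof(arg0), /*offset=*/0) == 0) |
| | //     if (cudaSetupArgument(&arg1, sizeof(arg1), /*offset=*/...) == 0) |
| | //       cudaLaunch((char *)kernel_handle); |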
434 | 0 | llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); |
435 | 0 | llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); |
436 | 0 | CharUnits Offset = CharUnits::Zero(); |
437 | 0 | for (const VarDecl *A : Args) { |
438 | 0 | auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType()); |
439 | 0 | Offset = Offset.alignTo(TInfo.Align); |
440 | 0 | llvm::Value *Args[] = { |
441 | 0 | CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), |
442 | 0 | PtrTy), |
443 | 0 | llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()), |
444 | 0 | llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), |
445 | 0 | }; |
446 | 0 | llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); |
447 | 0 | llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); |
448 | 0 | llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); |
449 | 0 | llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); |
450 | 0 | CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); |
451 | 0 | CGF.EmitBlock(NextBlock); |
452 | 0 | Offset += TInfo.Width; |
453 | 0 | } |
454 | | |
455 | | // Emit the call to cudaLaunch |
456 | 0 | llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); |
457 | 0 | llvm::Value *Arg = |
458 | 0 | CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy); |
459 | 0 | CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); |
460 | 0 | CGF.EmitBranch(EndBlock); |
461 | |
462 | 0 | CGF.EmitBlock(EndBlock); |
463 | 0 | } |
464 | | |
465 | | // Replace the original variable Var with the address loaded from the |
466 | | // variable ManagedVar, which is populated by the HIP runtime. |
467 | | static void replaceManagedVar(llvm::GlobalVariable *Var, |
468 | 0 | llvm::GlobalVariable *ManagedVar) { |
469 | 0 | SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList; |
470 | 0 | for (auto &&VarUse : Var->uses()) { |
471 | 0 | WorkList.push_back({VarUse.getUser()}); |
472 | 0 | } |
473 | 0 | while (!WorkList.empty()) { |
474 | 0 | auto &&WorkItem = WorkList.pop_back_val(); |
475 | 0 | auto *U = WorkItem.back(); |
476 | 0 | if (isa<llvm::ConstantExpr>(U)) { |
477 | 0 | for (auto &&UU : U->uses()) { |
478 | 0 | WorkItem.push_back(UU.getUser()); |
479 | 0 | WorkList.push_back(WorkItem); |
480 | 0 | WorkItem.pop_back(); |
481 | 0 | } |
482 | 0 | continue; |
483 | 0 | } |
484 | 0 | if (auto *I = dyn_cast<llvm::Instruction>(U)) { |
485 | 0 | llvm::Value *OldV = Var; |
486 | 0 | llvm::Instruction *NewV = |
487 | 0 | new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false, |
488 | 0 | llvm::Align(Var->getAlignment()), I); |
489 | 0 | WorkItem.pop_back(); |
490 | | // Replace constant expressions directly or indirectly using the managed |
491 | | // variable with instructions. |
492 | 0 | for (auto &&Op : WorkItem) { |
493 | 0 | auto *CE = cast<llvm::ConstantExpr>(Op); |
494 | 0 | auto *NewInst = CE->getAsInstruction(I); |
495 | 0 | NewInst->replaceUsesOfWith(OldV, NewV); |
496 | 0 | OldV = CE; |
497 | 0 | NewV = NewInst; |
498 | 0 | } |
499 | 0 | I->replaceUsesOfWith(OldV, NewV); |
500 | 0 | } else { |
501 | 0 | llvm_unreachable("Invalid use of managed variable"); |
502 | 0 | } |
503 | 0 | } |
504 | 0 | } |
505 | | |
506 | | /// Creates a function that sets up state on the host side for CUDA objects that |
507 | | /// have a presence on both the host and device sides. Specifically, registers |
508 | | /// the host side of kernel functions and device global variables with the CUDA |
509 | | /// runtime. |
510 | | /// \code |
511 | | /// void __cuda_register_globals(void** GpuBinaryHandle) { |
512 | | /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); |
513 | | /// ... |
514 | | /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); |
515 | | /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); |
516 | | /// ... |
517 | | /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); |
518 | | /// } |
519 | | /// \endcode |
520 | 0 | llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { |
521 | | // No need to register anything |
522 | 0 | if (EmittedKernels.empty() && DeviceVars.empty()) |
523 | 0 | return nullptr; |
524 | | |
525 | 0 | llvm::Function *RegisterKernelsFunc = llvm::Function::Create( |
526 | 0 | getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, |
527 | 0 | addUnderscoredPrefixToName("_register_globals"), &TheModule); |
528 | 0 | llvm::BasicBlock *EntryBB = |
529 | 0 | llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); |
530 | 0 | CGBuilderTy Builder(CGM, Context); |
531 | 0 | Builder.SetInsertPoint(EntryBB); |
532 | | |
533 | | // void __cudaRegisterFunction(void **, const char *, char *, const char *, |
534 | | // int, uint3*, uint3*, dim3*, dim3*, int*) |
535 | 0 | llvm::Type *RegisterFuncParams[] = { |
536 | 0 | PtrTy, PtrTy, PtrTy, PtrTy, IntTy, |
537 | 0 | PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)}; |
538 | 0 | llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( |
539 | 0 | llvm::FunctionType::get(IntTy, RegisterFuncParams, false), |
540 | 0 | addUnderscoredPrefixToName("RegisterFunction")); |
541 | | |
542 | | // Extract the GpuBinaryHandle passed as the first argument to |
543 | | // __cuda_register_globals() and generate a __cudaRegisterFunction() call |
544 | | // for each emitted kernel. |
545 | 0 | llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); |
546 | 0 | for (auto &&I : EmittedKernels) { |
547 | 0 | llvm::Constant *KernelName = |
548 | 0 | makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D))); |
549 | 0 | llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy); |
550 | 0 | llvm::Value *Args[] = { |
551 | 0 | &GpuBinaryHandlePtr, |
552 | 0 | KernelHandles[I.Kernel->getName()], |
553 | 0 | KernelName, |
554 | 0 | KernelName, |
555 | 0 | llvm::ConstantInt::get(IntTy, -1), |
556 | 0 | NullPtr, |
557 | 0 | NullPtr, |
558 | 0 | NullPtr, |
559 | 0 | NullPtr, |
560 | 0 | llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))}; |
561 | 0 | Builder.CreateCall(RegisterFunc, Args); |
562 | 0 | } |
563 | |
564 | 0 | llvm::Type *VarSizeTy = IntTy; |
565 | | // For HIP or CUDA 9.0+, the device variable size has type `size_t`. |
566 | 0 | if (CGM.getLangOpts().HIP || |
567 | 0 | ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90) |
568 | 0 | VarSizeTy = SizeTy; |
569 | | |
570 | | // void __cudaRegisterVar(void **, char *, char *, const char *, |
571 | | // int, int, int, int) |
572 | 0 | llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy, |
573 | 0 | IntTy, VarSizeTy, IntTy, IntTy}; |
574 | 0 | llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( |
575 | 0 | llvm::FunctionType::get(VoidTy, RegisterVarParams, false), |
576 | 0 | addUnderscoredPrefixToName("RegisterVar")); |
577 | | // void __hipRegisterManagedVar(void **, char *, char *, const char *, |
578 | | // size_t, unsigned) |
579 | 0 | llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy, |
580 | 0 | PtrTy, VarSizeTy, IntTy}; |
581 | 0 | llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction( |
582 | 0 | llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false), |
583 | 0 | addUnderscoredPrefixToName("RegisterManagedVar")); |
584 | | // void __cudaRegisterSurface(void **, const struct surfaceReference *, |
585 | | // const void **, const char *, int, int); |
586 | 0 | llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction( |
587 | 0 | llvm::FunctionType::get( |
588 | 0 | VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false), |
589 | 0 | addUnderscoredPrefixToName("RegisterSurface")); |
590 | | // void __cudaRegisterTexture(void **, const struct textureReference *, |
591 | | // const void **, const char *, int, int, int) |
592 | 0 | llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction( |
593 | 0 | llvm::FunctionType::get( |
594 | 0 | VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false), |
595 | 0 | addUnderscoredPrefixToName("RegisterTexture")); |
596 | 0 | for (auto &&Info : DeviceVars) { |
597 | 0 | llvm::GlobalVariable *Var = Info.Var; |
598 | 0 | assert((!Var->isDeclaration() || Info.Flags.isManaged()) && |
599 | 0 | "External variables should not show up here, except HIP managed " |
600 | 0 | "variables"); |
601 | 0 | llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); |
602 | 0 | switch (Info.Flags.getKind()) { |
603 | 0 | case DeviceVarFlags::Variable: { |
604 | 0 | uint64_t VarSize = |
605 | 0 | CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); |
606 | 0 | if (Info.Flags.isManaged()) { |
607 | 0 | auto *ManagedVar = new llvm::GlobalVariable( |
608 | 0 | CGM.getModule(), Var->getType(), |
609 | 0 | /*isConstant=*/false, Var->getLinkage(), |
610 | 0 | /*Init=*/Var->isDeclaration() |
611 | 0 | ? nullptr |
612 | 0 | : llvm::ConstantPointerNull::get(Var->getType()), |
613 | 0 | /*Name=*/"", /*InsertBefore=*/nullptr, |
614 | 0 | llvm::GlobalVariable::NotThreadLocal); |
615 | 0 | ManagedVar->setDSOLocal(Var->isDSOLocal()); |
616 | 0 | ManagedVar->setVisibility(Var->getVisibility()); |
617 | 0 | ManagedVar->setExternallyInitialized(true); |
618 | 0 | ManagedVar->takeName(Var); |
619 | 0 | Var->setName(Twine(ManagedVar->getName() + ".managed")); |
620 | 0 | replaceManagedVar(Var, ManagedVar); |
621 | 0 | llvm::Value *Args[] = { |
622 | 0 | &GpuBinaryHandlePtr, |
623 | 0 | ManagedVar, |
624 | 0 | Var, |
625 | 0 | VarName, |
626 | 0 | llvm::ConstantInt::get(VarSizeTy, VarSize), |
627 | 0 | llvm::ConstantInt::get(IntTy, Var->getAlignment())}; |
628 | 0 | if (!Var->isDeclaration()) |
629 | 0 | Builder.CreateCall(RegisterManagedVar, Args); |
630 | 0 | } else { |
631 | 0 | llvm::Value *Args[] = { |
632 | 0 | &GpuBinaryHandlePtr, |
633 | 0 | Var, |
634 | 0 | VarName, |
635 | 0 | VarName, |
636 | 0 | llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()), |
637 | 0 | llvm::ConstantInt::get(VarSizeTy, VarSize), |
638 | 0 | llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()), |
639 | 0 | llvm::ConstantInt::get(IntTy, 0)}; |
640 | 0 | Builder.CreateCall(RegisterVar, Args); |
641 | 0 | } |
642 | 0 | break; |
643 | 0 | } |
644 | 0 | case DeviceVarFlags::Surface: |
645 | 0 | Builder.CreateCall( |
646 | 0 | RegisterSurf, |
647 | 0 | {&GpuBinaryHandlePtr, Var, VarName, VarName, |
648 | 0 | llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), |
649 | 0 | llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())}); |
650 | 0 | break; |
651 | 0 | case DeviceVarFlags::Texture: |
652 | 0 | Builder.CreateCall( |
653 | 0 | RegisterTex, |
654 | 0 | {&GpuBinaryHandlePtr, Var, VarName, VarName, |
655 | 0 | llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), |
656 | 0 | llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()), |
657 | 0 | llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())}); |
658 | 0 | break; |
659 | 0 | } |
660 | 0 | } |
661 | | |
662 | 0 | Builder.CreateRetVoid(); |
663 | 0 | return RegisterKernelsFunc; |
664 | 0 | } |
665 | | |
666 | | /// Creates a global constructor function for the module: |
667 | | /// |
668 | | /// For CUDA: |
669 | | /// \code |
670 | | /// void __cuda_module_ctor() { |
671 | | /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); |
672 | | /// __cuda_register_globals(Handle); |
673 | | /// } |
674 | | /// \endcode |
675 | | /// |
676 | | /// For HIP: |
677 | | /// \code |
678 | | /// void __hip_module_ctor() { |
679 | | /// if (__hip_gpubin_handle == 0) { |
680 | | /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); |
681 | | /// __hip_register_globals(__hip_gpubin_handle); |
682 | | /// } |
683 | | /// } |
684 | | /// \endcode |
685 | 0 | llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { |
686 | 0 | bool IsHIP = CGM.getLangOpts().HIP; |
687 | 0 | bool IsCUDA = CGM.getLangOpts().CUDA; |
688 | | // No need to generate ctors/dtors if there is no GPU binary. |
689 | 0 | StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; |
690 | 0 | if (CudaGpuBinaryFileName.empty() && !IsHIP) |
691 | 0 | return nullptr; |
692 | 0 | if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() && |
693 | 0 | DeviceVars.empty()) |
694 | 0 | return nullptr; |
695 | | |
696 | | // void __{cuda|hip}_register_globals(void* handle); |
697 | 0 | llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); |
698 | | // We always need a function to pass in as callback. Create a dummy |
699 | | // implementation if we don't need to register anything. |
700 | 0 | if (RelocatableDeviceCode && !RegisterGlobalsFunc) |
701 | 0 | RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); |
702 | | |
703 | | // void ** __{cuda|hip}RegisterFatBinary(void *); |
704 | 0 | llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( |
705 | 0 | llvm::FunctionType::get(PtrTy, PtrTy, false), |
706 | 0 | addUnderscoredPrefixToName("RegisterFatBinary")); |
707 | | // struct { int magic, int version, void * gpu_binary, void * dont_care }; |
708 | 0 | llvm::StructType *FatbinWrapperTy = |
709 | 0 | llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy); |
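| | // For CUDA, the wrapper created below ends up as IR along these lines |
| | // (illustrative sketch; `@fatbin` stands for the embedded binary constant): |
| | //   @__cuda_fatbin_wrapper = internal constant { i32, i32, ptr, ptr } |
| | //       { i32 1180844977 /* CudaFatMagic */, i32 1, ptr @fatbin, ptr null }, |
| | //       section ".nvFatBinSegment" |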
710 | | |
711 | | // Register the GPU binary with the CUDA runtime, store the returned handle |
712 | | // in a global variable, and save a reference in GpuBinaryHandle to be |
713 | | // cleaned up in the destructor on exit. Then associate all known kernels |
714 | | // with the GPU binary handle so the CUDA runtime knows what to call on the GPU side. |
715 | 0 | std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; |
716 | 0 | if (!CudaGpuBinaryFileName.empty()) { |
717 | 0 | auto VFS = CGM.getFileSystem(); |
718 | 0 | auto CudaGpuBinaryOrErr = |
719 | 0 | VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false); |
720 | 0 | if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { |
721 | 0 | CGM.getDiags().Report(diag::err_cannot_open_file) |
722 | 0 | << CudaGpuBinaryFileName << EC.message(); |
723 | 0 | return nullptr; |
724 | 0 | } |
725 | 0 | CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); |
726 | 0 | } |
727 | | |
728 | 0 | llvm::Function *ModuleCtorFunc = llvm::Function::Create( |
729 | 0 | llvm::FunctionType::get(VoidTy, false), |
730 | 0 | llvm::GlobalValue::InternalLinkage, |
731 | 0 | addUnderscoredPrefixToName("_module_ctor"), &TheModule); |
732 | 0 | llvm::BasicBlock *CtorEntryBB = |
733 | 0 | llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); |
734 | 0 | CGBuilderTy CtorBuilder(CGM, Context); |
735 | |
736 | 0 | CtorBuilder.SetInsertPoint(CtorEntryBB); |
737 | |
738 | 0 | const char *FatbinConstantName; |
739 | 0 | const char *FatbinSectionName; |
740 | 0 | const char *ModuleIDSectionName; |
741 | 0 | StringRef ModuleIDPrefix; |
742 | 0 | llvm::Constant *FatBinStr; |
743 | 0 | unsigned FatMagic; |
744 | 0 | if (IsHIP) { |
745 | 0 | FatbinConstantName = ".hip_fatbin"; |
746 | 0 | FatbinSectionName = ".hipFatBinSegment"; |
747 | |
748 | 0 | ModuleIDSectionName = "__hip_module_id"; |
749 | 0 | ModuleIDPrefix = "__hip_"; |
750 | |
751 | 0 | if (CudaGpuBinary) { |
752 | | // If fatbin is available from early finalization, create a string |
753 | | // literal containing the fat binary loaded from the given file. |
754 | 0 | const unsigned HIPCodeObjectAlign = 4096; |
755 | 0 | FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "", |
756 | 0 | FatbinConstantName, HIPCodeObjectAlign); |
757 | 0 | } else { |
758 | | // If fatbin is not available, create an external symbol |
759 | | // __hip_fatbin in section .hip_fatbin. The external symbol is supposed |
760 | | // to contain the fat binary but will be populated somewhere else, |
761 | | // e.g. by lld through link script. |
762 | 0 | FatBinStr = new llvm::GlobalVariable( |
763 | 0 | CGM.getModule(), CGM.Int8Ty, |
764 | 0 | /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, |
765 | 0 | "__hip_fatbin", nullptr, |
766 | 0 | llvm::GlobalVariable::NotThreadLocal); |
767 | 0 | cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); |
768 | 0 | } |
769 | |
770 | 0 | FatMagic = HIPFatMagic; |
771 | 0 | } else { |
772 | 0 | if (RelocatableDeviceCode) |
773 | 0 | FatbinConstantName = CGM.getTriple().isMacOSX() |
774 | 0 | ? "__NV_CUDA,__nv_relfatbin" |
775 | 0 | : "__nv_relfatbin"; |
776 | 0 | else |
777 | 0 | FatbinConstantName = |
778 | 0 | CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; |
779 | | // NVIDIA's cuobjdump looks for fatbins in this section. |
780 | 0 | FatbinSectionName = |
781 | 0 | CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; |
782 | |
783 | 0 | ModuleIDSectionName = CGM.getTriple().isMacOSX() |
784 | 0 | ? "__NV_CUDA,__nv_module_id" |
785 | 0 | : "__nv_module_id"; |
786 | 0 | ModuleIDPrefix = "__nv_"; |
787 | | |
788 | | // For CUDA, create a string literal containing the fat binary loaded from |
789 | | // the given file. |
790 | 0 | FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "", |
791 | 0 | FatbinConstantName, 8); |
792 | 0 | FatMagic = CudaFatMagic; |
793 | 0 | } |
794 | | |
795 | | // Create initialized wrapper structure that points to the loaded GPU binary |
796 | 0 | ConstantInitBuilder Builder(CGM); |
797 | 0 | auto Values = Builder.beginStruct(FatbinWrapperTy); |
798 | | // Fatbin wrapper magic. |
799 | 0 | Values.addInt(IntTy, FatMagic); |
800 | | // Fatbin version. |
801 | 0 | Values.addInt(IntTy, 1); |
802 | | // Data. |
803 | 0 | Values.add(FatBinStr); |
804 | | // Unused in fatbin v1. |
805 | 0 | Values.add(llvm::ConstantPointerNull::get(PtrTy)); |
806 | 0 | llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( |
807 | 0 | addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), |
808 | 0 | /*constant*/ true); |
809 | 0 | FatbinWrapper->setSection(FatbinSectionName); |
810 | | |
811 | | // There is only one HIP fat binary per linked module; however, there are |
812 | | // multiple constructor functions. Make sure the fat binary is registered |
813 | | // only once. The constructor functions are executed by the dynamic loader |
814 | | // before the program gains control. The dynamic loader cannot execute the |
815 | | // constructor functions concurrently since doing that would not guarantee |
816 | | // thread safety of the loaded program. Therefore we can assume sequential |
817 | | // execution of constructor functions here. |
818 | 0 | if (IsHIP) { |
819 | 0 | auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : |
820 | 0 | llvm::GlobalValue::LinkOnceAnyLinkage; |
821 | 0 | llvm::BasicBlock *IfBlock = |
822 | 0 | llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); |
823 | 0 | llvm::BasicBlock *ExitBlock = |
824 | 0 | llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); |
825 | | // The name, size, and initialization pattern of this variable are part |
826 | | // of the HIP ABI. |
827 | 0 | GpuBinaryHandle = new llvm::GlobalVariable( |
828 | 0 | TheModule, PtrTy, /*isConstant=*/false, Linkage, |
829 | 0 | /*Initializer=*/llvm::ConstantPointerNull::get(PtrTy), |
830 | 0 | "__hip_gpubin_handle"); |
831 | 0 | if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage) |
832 | 0 | GpuBinaryHandle->setComdat( |
833 | 0 | CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName())); |
834 | 0 | GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); |
835 | | // Prevent the weak symbol in different shared libraries from being merged. |
836 | 0 | if (Linkage != llvm::GlobalValue::InternalLinkage) |
837 | 0 | GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); |
838 | 0 | Address GpuBinaryAddr( |
839 | 0 | GpuBinaryHandle, PtrTy, |
840 | 0 | CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); |
841 | 0 | { |
842 | 0 | auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); |
843 | 0 | llvm::Constant *Zero = |
844 | 0 | llvm::Constant::getNullValue(HandleValue->getType()); |
845 | 0 | llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); |
846 | 0 | CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); |
847 | 0 | } |
848 | 0 | { |
849 | 0 | CtorBuilder.SetInsertPoint(IfBlock); |
850 | | // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); |
851 | 0 | llvm::CallInst *RegisterFatbinCall = |
852 | 0 | CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper); |
853 | 0 | CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); |
854 | 0 | CtorBuilder.CreateBr(ExitBlock); |
855 | 0 | } |
856 | 0 | { |
857 | 0 | CtorBuilder.SetInsertPoint(ExitBlock); |
858 | | // Call __hip_register_globals(GpuBinaryHandle); |
859 | 0 | if (RegisterGlobalsFunc) { |
860 | 0 | auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); |
861 | 0 | CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); |
862 | 0 | } |
863 | 0 | } |
864 | 0 | } else if (!RelocatableDeviceCode) { |
865 | | // Register binary with CUDA runtime. This is substantially different in |
866 | | // default mode vs. separate compilation! |
867 | | // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); |
868 | 0 | llvm::CallInst *RegisterFatbinCall = |
869 | 0 | CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper); |
870 | 0 | GpuBinaryHandle = new llvm::GlobalVariable( |
871 | 0 | TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage, |
872 | 0 | llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle"); |
873 | 0 | GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); |
874 | 0 | CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, |
875 | 0 | CGM.getPointerAlign()); |
876 | | |
877 | | // Call __cuda_register_globals(GpuBinaryHandle); |
878 | 0 | if (RegisterGlobalsFunc) |
879 | 0 | CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); |
880 | | |
881 | | // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. |
882 | 0 | if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), |
883 | 0 | CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { |
884 | | // void __cudaRegisterFatBinaryEnd(void **); |
885 | 0 | llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( |
886 | 0 | llvm::FunctionType::get(VoidTy, PtrTy, false), |
887 | 0 | "__cudaRegisterFatBinaryEnd"); |
888 | 0 | CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); |
889 | 0 | } |
890 | 0 | } else { |
891 | | // Generate a unique module ID. |
892 | 0 | SmallString<64> ModuleID; |
893 | 0 | llvm::raw_svector_ostream OS(ModuleID); |
894 | 0 | OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); |
895 | 0 | llvm::Constant *ModuleIDConstant = makeConstantArray( |
896 | 0 | std::string(ModuleID.str()), "", ModuleIDSectionName, 32, /*AddNull=*/true); |
897 | | |
898 | | // Create an alias for the FatbinWrapper that nvcc will look for. |
899 | 0 | llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, |
900 | 0 | Twine("__fatbinwrap") + ModuleID, FatbinWrapper); |
901 | | |
902 | | // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, |
903 | | // void *, void (*)(void **)) |
904 | 0 | SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); |
905 | 0 | RegisterLinkedBinaryName += ModuleID; |
906 | 0 | llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( |
907 | 0 | getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); |
908 | |
909 | 0 | assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); |
910 | 0 | llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant, |
911 | 0 | makeDummyFunction(getCallbackFnTy())}; |
912 | 0 | CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); |
913 | 0 | } |
914 | | |
915 | | // Create a destructor and register it with atexit() the way NVCC does it. |
916 | | // Doing it during the regular destructor phase worked in CUDA before 9.2 |
917 | | // but results in a double-free in 9.2. |
918 | 0 | if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { |
919 | | // extern "C" int atexit(void (*f)(void)); |
920 | 0 | llvm::FunctionType *AtExitTy = |
921 | 0 | llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); |
922 | 0 | llvm::FunctionCallee AtExitFunc = |
923 | 0 | CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), |
924 | 0 | /*Local=*/true); |
925 | 0 | CtorBuilder.CreateCall(AtExitFunc, CleanupFn); |
926 | 0 | } |
927 | |
928 | 0 | CtorBuilder.CreateRetVoid(); |
929 | 0 | return ModuleCtorFunc; |
930 | 0 | } |
931 | | |
932 | | /// Creates a global destructor function that unregisters the GPU code blob |
933 | | /// registered by constructor. |
934 | | /// |
935 | | /// For CUDA: |
936 | | /// \code |
937 | | /// void __cuda_module_dtor() { |
938 | | /// __cudaUnregisterFatBinary(Handle); |
939 | | /// } |
940 | | /// \endcode |
941 | | /// |
942 | | /// For HIP: |
943 | | /// \code |
944 | | /// void __hip_module_dtor() { |
945 | | /// if (__hip_gpubin_handle) { |
946 | | /// __hipUnregisterFatBinary(__hip_gpubin_handle); |
947 | | /// __hip_gpubin_handle = 0; |
948 | | /// } |
949 | | /// } |
950 | | /// \endcode |
951 | 0 | llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { |
952 | | // No need for destructor if we don't have a handle to unregister. |
953 | 0 | if (!GpuBinaryHandle) |
954 | 0 | return nullptr; |
955 | | |
956 | | // void __cudaUnregisterFatBinary(void ** handle); |
957 | 0 | llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( |
958 | 0 | llvm::FunctionType::get(VoidTy, PtrTy, false), |
959 | 0 | addUnderscoredPrefixToName("UnregisterFatBinary")); |
960 | |
961 | 0 | llvm::Function *ModuleDtorFunc = llvm::Function::Create( |
962 | 0 | llvm::FunctionType::get(VoidTy, false), |
963 | 0 | llvm::GlobalValue::InternalLinkage, |
964 | 0 | addUnderscoredPrefixToName("_module_dtor"), &TheModule); |
965 | |
966 | 0 | llvm::BasicBlock *DtorEntryBB = |
967 | 0 | llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); |
968 | 0 | CGBuilderTy DtorBuilder(CGM, Context); |
969 | 0 | DtorBuilder.SetInsertPoint(DtorEntryBB); |
970 | |
971 | 0 | Address GpuBinaryAddr( |
972 | 0 | GpuBinaryHandle, GpuBinaryHandle->getValueType(), |
973 | 0 | CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); |
974 | 0 | auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); |
975 | | // There is only one HIP fat binary per linked module; however, there are |
976 | | // multiple destructor functions. Make sure the fat binary is unregistered |
977 | | // only once. |
978 | 0 | if (CGM.getLangOpts().HIP) { |
979 | 0 | llvm::BasicBlock *IfBlock = |
980 | 0 | llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); |
981 | 0 | llvm::BasicBlock *ExitBlock = |
982 | 0 | llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); |
983 | 0 | llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); |
984 | 0 | llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); |
985 | 0 | DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); |
986 | |
987 | 0 | DtorBuilder.SetInsertPoint(IfBlock); |
988 | 0 | DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); |
989 | 0 | DtorBuilder.CreateStore(Zero, GpuBinaryAddr); |
990 | 0 | DtorBuilder.CreateBr(ExitBlock); |
991 | |
992 | 0 | DtorBuilder.SetInsertPoint(ExitBlock); |
993 | 0 | } else { |
994 | 0 | DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); |
995 | 0 | } |
996 | 0 | DtorBuilder.CreateRetVoid(); |
997 | 0 | return ModuleDtorFunc; |
998 | 0 | } |
999 | | |
1000 | 0 | CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { |
1001 | 0 | return new CGNVCUDARuntime(CGM); |
1002 | 0 | } |
1003 | | |
1004 | | void CGNVCUDARuntime::internalizeDeviceSideVar( |
1005 | 0 | const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) { |
1006 | | // For -fno-gpu-rdc, host-side shadows of external declarations of device-side |
1007 | | // global variables become internal definitions. These have to be internal in |
1008 | | // order to prevent name conflicts with global host variables with the same |
1009 | | // name in different TUs. |
1010 | | // |
1011 | | // For -fgpu-rdc, the shadow variables should not be internalized because |
1012 | | // they may be accessed by a different TU. |
1013 | 0 | if (CGM.getLangOpts().GPURelocatableDeviceCode) |
1014 | 0 | return; |
1015 | | |
1016 | | // __shared__ variables are odd. Shadows do get created, but |
1017 | | // they are not registered with the CUDA runtime, so they |
1018 | | // can't really be used to access their device-side |
1019 | | // counterparts. It's not clear yet whether this is an nvcc bug or |
1020 | | // a feature, but we've got to do the same for compatibility. |
1021 | 0 | if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() || |
1022 | 0 | D->hasAttr<CUDASharedAttr>() || |
1023 | 0 | D->getType()->isCUDADeviceBuiltinSurfaceType() || |
1024 | 0 | D->getType()->isCUDADeviceBuiltinTextureType()) { |
1025 | 0 | Linkage = llvm::GlobalValue::InternalLinkage; |
1026 | 0 | } |
1027 | 0 | } |
1028 | | |
1029 | | void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D, |
1030 | 0 | llvm::GlobalVariable &GV) { |
1031 | 0 | if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) { |
1032 | | // Shadow variables and their properties must be registered with the CUDA |
1033 | | // runtime. Skip extern global variables, which will be registered in |
1034 | | // the TU where they are defined. |
1035 | | // |
1036 | | // Don't register a C++17 inline variable. The local symbol can be |
1037 | | // discarded and referencing a discarded local symbol from outside the |
1038 | | // comdat (__cuda_register_globals) is disallowed by the ELF spec. |
1039 | | // |
1040 | | // HIP managed variables need to be always recorded in device and host |
1041 | | // compilations for transformation. |
1042 | | // |
1043 | | // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are |
1044 | | // added to llvm.compiler-used, therefore they are safe to be registered. |
1045 | 0 | if ((!D->hasExternalStorage() && !D->isInline()) || |
1046 | 0 | CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) || |
1047 | 0 | D->hasAttr<HIPManagedAttr>()) { |
1048 | 0 | registerDeviceVar(D, GV, !D->hasDefinition(), |
1049 | 0 | D->hasAttr<CUDAConstantAttr>()); |
1050 | 0 | } |
1051 | 0 | } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() || |
1052 | 0 | D->getType()->isCUDADeviceBuiltinTextureType()) { |
1053 | | // Builtin surfaces and textures and their template arguments are |
1054 | | // also registered with CUDA runtime. |
1055 | 0 | const auto *TD = cast<ClassTemplateSpecializationDecl>( |
1056 | 0 | D->getType()->castAs<RecordType>()->getDecl()); |
1057 | 0 | const TemplateArgumentList &Args = TD->getTemplateArgs(); |
1058 | 0 | if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) { |
1059 | 0 | assert(Args.size() == 2 && |
1060 | 0 | "Unexpected number of template arguments of CUDA device " |
1061 | 0 | "builtin surface type."); |
1062 | 0 | auto SurfType = Args[1].getAsIntegral(); |
1063 | 0 | if (!D->hasExternalStorage()) |
1064 | 0 | registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue()); |
1065 | 0 | } else { |
1066 | 0 | assert(Args.size() == 3 && |
1067 | 0 | "Unexpected number of template arguments of CUDA device " |
1068 | 0 | "builtin texture type."); |
1069 | 0 | auto TexType = Args[1].getAsIntegral(); |
1070 | 0 | auto Normalized = Args[2].getAsIntegral(); |
1071 | 0 | if (!D->hasExternalStorage()) |
1072 | 0 | registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(), |
1073 | 0 | Normalized.getZExtValue()); |
1074 | 0 | } |
1075 | 0 | } |
1076 | 0 | } |
1077 | | |
1078 | | // Transform managed variables to pointers to managed variables in device code. |
1079 | | // Each use of the original managed variable is replaced by a load from the |
1080 | | // transformed managed variable. The transformed managed variable contains |
1081 | | // the address of the managed memory, which will be allocated by the runtime. |
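| | // E.g. (illustrative): a managed `@v = global i32 0` becomes |
| | //   @v.managed = global i32 0                      ; renamed original |
| | //   @v = externally_initialized global ptr null    ; filled in by the runtime |
| | // and every former use of `@v` is rewritten to first load the real address |
| | // from the new `@v`. |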
1082 | 0 | void CGNVCUDARuntime::transformManagedVars() { |
1083 | 0 | for (auto &&Info : DeviceVars) { |
1084 | 0 | llvm::GlobalVariable *Var = Info.Var; |
1085 | 0 | if (Info.Flags.getKind() == DeviceVarFlags::Variable && |
1086 | 0 | Info.Flags.isManaged()) { |
1087 | 0 | auto *ManagedVar = new llvm::GlobalVariable( |
1088 | 0 | CGM.getModule(), Var->getType(), |
1089 | 0 | /*isConstant=*/false, Var->getLinkage(), |
1090 | 0 | /*Init=*/Var->isDeclaration() |
1091 | 0 | ? nullptr |
1092 | 0 | : llvm::ConstantPointerNull::get(Var->getType()), |
1093 | 0 | /*Name=*/"", /*InsertBefore=*/nullptr, |
1094 | 0 | llvm::GlobalVariable::NotThreadLocal, |
1095 | 0 | CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); |
1096 | 0 | ManagedVar->setDSOLocal(Var->isDSOLocal()); |
1097 | 0 | ManagedVar->setVisibility(Var->getVisibility()); |
1098 | 0 | ManagedVar->setExternallyInitialized(true); |
1099 | 0 | replaceManagedVar(Var, ManagedVar); |
1100 | 0 | ManagedVar->takeName(Var); |
1101 | 0 | Var->setName(Twine(ManagedVar->getName()) + ".managed"); |
1102 | | // Keep managed variables even if they are not used in device code since |
1103 | | // they need to be allocated by the runtime. |
1104 | 0 | if (!Var->isDeclaration()) { |
1105 | 0 | assert(!ManagedVar->isDeclaration()); |
1106 | 0 | CGM.addCompilerUsedGlobal(Var); |
1107 | 0 | CGM.addCompilerUsedGlobal(ManagedVar); |
1108 | 0 | } |
1109 | 0 | } |
1110 | 0 | } |
1111 | 0 | } |
1112 | | |
1113 | | // Creates offloading entries for all the kernels and globals that must be |
1114 | | // registered. The linker will provide a pointer to this section so we can |
1115 | | // register the symbols with the linked device image. |
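| | // Each entry is a small descriptor record, in spirit { address, name, size, |
| | // flags, data } (see llvm/Frontend/Offloading/Utility.h for the exact |
| | // layout), placed in the {cuda|hip}_offloading_entries section. |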
1116 | 0 | void CGNVCUDARuntime::createOffloadingEntries() { |
1117 | 0 | StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries" |
1118 | 0 | : "cuda_offloading_entries"; |
1119 | 0 | llvm::Module &M = CGM.getModule(); |
1120 | 0 | for (KernelInfo &I : EmittedKernels) |
1121 | 0 | llvm::offloading::emitOffloadingEntry( |
1122 | 0 | M, KernelHandles[I.Kernel->getName()], |
1123 | 0 | getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0, |
1124 | 0 | llvm::offloading::OffloadGlobalEntry, Section); |
1125 | |
1126 | 0 | for (VarInfo &I : DeviceVars) { |
1127 | 0 | uint64_t VarSize = |
1128 | 0 | CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType()); |
1129 | 0 | int32_t Flags = |
1130 | 0 | (I.Flags.isExtern() |
1131 | 0 | ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern) |
1132 | 0 | : 0) | |
1133 | 0 | (I.Flags.isConstant() |
1134 | 0 | ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant) |
1135 | 0 | : 0) | |
1136 | 0 | (I.Flags.isNormalized() |
1137 | 0 | ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized) |
1138 | 0 | : 0); |
1139 | 0 | if (I.Flags.getKind() == DeviceVarFlags::Variable) { |
1140 | 0 | llvm::offloading::emitOffloadingEntry( |
1141 | 0 | M, I.Var, getDeviceSideName(I.D), VarSize, |
1142 | 0 | (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry |
1143 | 0 | : llvm::offloading::OffloadGlobalEntry) | |
1144 | 0 | Flags, |
1145 | 0 | /*Data=*/0, Section); |
1146 | 0 | } else if (I.Flags.getKind() == DeviceVarFlags::Surface) { |
1147 | 0 | llvm::offloading::emitOffloadingEntry( |
1148 | 0 | M, I.Var, getDeviceSideName(I.D), VarSize, |
1149 | 0 | llvm::offloading::OffloadGlobalSurfaceEntry | Flags, |
1150 | 0 | I.Flags.getSurfTexType(), Section); |
1151 | 0 | } else if (I.Flags.getKind() == DeviceVarFlags::Texture) { |
1152 | 0 | llvm::offloading::emitOffloadingEntry( |
1153 | 0 | M, I.Var, getDeviceSideName(I.D), VarSize, |
1154 | 0 | llvm::offloading::OffloadGlobalTextureEntry | Flags, |
1155 | 0 | I.Flags.getSurfTexType(), Section); |
1156 | 0 | } |
1157 | 0 | } |
1158 | 0 | } |
1159 | | |
1160 | | // Returns module constructor to be added. |
1161 | 0 | llvm::Function *CGNVCUDARuntime::finalizeModule() { |
1162 | 0 | if (CGM.getLangOpts().CUDAIsDevice) { |
1163 | 0 | transformManagedVars(); |
1164 | | |
1165 | | // Mark ODR-used device variables as compiler-used to prevent them from being |
1166 | | // eliminated by optimization. This is necessary for device variables |
1167 | | // ODR-used by host functions. Sema correctly marks them as ODR-used no |
1168 | | // matter whether they are ODR-used by device or host functions. |
1169 | | // |
1170 | | // We do not need to do this if the variable has the `used` attribute, since |
1171 | | // it has already been added. |
1172 | | // |
1173 | | // Static device variables have been externalized at this point, therefore |
1174 | | // variables with LLVM private or internal linkage need not be added. |
1175 | 0 | for (auto &&Info : DeviceVars) { |
1176 | 0 | auto Kind = Info.Flags.getKind(); |
1177 | 0 | if (!Info.Var->isDeclaration() && |
1178 | 0 | !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) && |
1179 | 0 | (Kind == DeviceVarFlags::Variable || |
1180 | 0 | Kind == DeviceVarFlags::Surface || |
1181 | 0 | Kind == DeviceVarFlags::Texture) && |
1182 | 0 | Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) { |
1183 | 0 | CGM.addCompilerUsedGlobal(Info.Var); |
1184 | 0 | } |
1185 | 0 | } |
1186 | 0 | return nullptr; |
1187 | 0 | } |
1188 | 0 | if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode) |
1189 | 0 | createOffloadingEntries(); |
1190 | 0 | else |
1191 | 0 | return makeModuleCtorFunction(); |
1192 | | |
1193 | 0 | return nullptr; |
1194 | 0 | } |
1195 | | |
1196 | | llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F, |
1197 | 0 | GlobalDecl GD) { |
1198 | 0 | auto Loc = KernelHandles.find(F->getName()); |
1199 | 0 | if (Loc != KernelHandles.end()) { |
1200 | 0 | auto OldHandle = Loc->second; |
1201 | 0 | if (KernelStubs[OldHandle] == F) |
1202 | 0 | return OldHandle; |
1203 | | |
1204 | | // We've found the function name, but F itself has changed, so we need to |
1205 | | // update the references. |
1206 | 0 | if (CGM.getLangOpts().HIP) { |
1207 | | // For HIP compilation the handle itself does not change, so we only need |
1208 | | // to update the Stub value. |
1209 | 0 | KernelStubs[OldHandle] = F; |
1210 | 0 | return OldHandle; |
1211 | 0 | } |
1212 | | // For non-HIP compilation, erase the old Stub and fall-through to creating |
1213 | | // new entries. |
1214 | 0 | KernelStubs.erase(OldHandle); |
1215 | 0 | } |
1216 | | |
1217 | 0 | if (!CGM.getLangOpts().HIP) { |
1218 | 0 | KernelHandles[F->getName()] = F; |
1219 | 0 | KernelStubs[F] = F; |
1220 | 0 | return F; |
1221 | 0 | } |
1222 | | |
1223 | 0 | auto *Var = new llvm::GlobalVariable( |
1224 | 0 | TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(), |
1225 | 0 | /*Initializer=*/nullptr, |
1226 | 0 | CGM.getMangledName( |
1227 | 0 | GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel))); |
1228 | 0 | Var->setAlignment(CGM.getPointerAlign().getAsAlign()); |
1229 | 0 | Var->setDSOLocal(F->isDSOLocal()); |
1230 | 0 | Var->setVisibility(F->getVisibility()); |
1231 | 0 | auto *FD = cast<FunctionDecl>(GD.getDecl()); |
1232 | 0 | auto *FT = FD->getPrimaryTemplate(); |
1233 | 0 | if (!FT || FT->isThisDeclarationADefinition()) |
1234 | 0 | CGM.maybeSetTrivialComdat(*FD, *Var); |
1235 | 0 | KernelHandles[F->getName()] = Var; |
1236 | 0 | KernelStubs[Var] = F; |
1237 | 0 | return Var; |
1238 | 0 | } |