Coverage Report

Created: 2024-01-17 10:31

/src/llvm-project/clang/lib/CodeGen/Targets/AMDGPU.cpp
Every instrumented line in this file reports an execution count of 0; no code in this file was executed.
//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                    QualType Ty) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
  const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}
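
// Worked examples of the register estimate above (illustrative only):
//   - <4 x float>: 32-bit elements, 4 x 1 = 4 registers.
//   - <4 x half>:  16-bit elements are packed, (4 + 1) / 2 = 2 registers.
//   - float3:      counted per element rather than by its padded in-memory
//                  size, so 3 registers instead of 4.
//   - struct { int a; float b; }: fields are summed recursively, 1 + 1 = 2
//                  registers.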

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
    }
  }
}

Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
  llvm_unreachable("AMDGPU does not support varargs");
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}
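
// Illustrative examples of how return types fall through the cases above:
//   - struct Empty {};            -> getIgnore().
//   - struct A { char c; };       -> single-element struct, returned directly
//                                    as i8.
//   - struct B { float x, y; };   -> 64 bits, returned directly as [2 x i32].
//   - a struct of 20 ints         -> needs more than MaxNumRegsForArgsRet
//                                    registers, so it falls back to
//                                    DefaultABIInfo and is returned indirectly.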

/// For kernels all parameters are really passed in a special buffer. It doesn't
/// make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}
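
// Illustrative example (a sketch; address-space numbers are the usual AMDGPU
// mapping, generic = 0 and global = 1): for a HIP kernel
//   struct S { int a, b; };
//   __global__ void k(int *p, S s);
// the pointer parameter is coerced to 'ptr addrspace(1)' and passed direct,
// while the aggregate 's' is not coerced and is therefore passed byref in the
// constant kernel-argument address space via getIndirectAliased().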

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}
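
// Illustrative examples for non-kernel device functions, which draw on the
// shared NumRegsLeft budget of MaxNumRegsForArgsRet (16):
//   - struct { int a, b; } (64 bits)        -> packed direct as [2 x i32],
//                                              consuming 2 registers.
//   - struct { int a, b, c, d; } (128 bits) -> direct while at least 4
//                                              registers remain; otherwise
//                                              passed byref in the private
//                                              address space.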

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
      llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }
}
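
// Illustrative example: a HIP kernel with no explicit launch-bounds attributes
// gets "amdgpu-flat-work-group-size"="1,1024", assuming the default
// --gpu-max-threads-per-block value of 1024; an OpenCL kernel without
// attributes gets "1,256" (OpenCLDefaultMaxWorkGroupSize).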

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by the IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}
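
// Illustrative example of the emitted control constant (a sketch, assuming the
// constant address space maps to addrspace(4) and code object version 5, i.e.
// a COV value of 500):
//   @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4)
//                         constant i32 500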

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0, which results in
// incorrectly transformed IR. Therefore, instead of emitting null pointers in
// the private and local address spaces, a null pointer in the generic address
// space is emitted and then cast to a pointer in the local or private address
// space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}
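
// Illustrative example (assuming the usual AMDGPU numbering: generic = 0,
// local = 3, private = 5): a null pointer in the private address space is
// emitted as
//   addrspacecast (ptr null to ptr addrspace(5))
// instead of 'ptr addrspace(5) null', because the target's null value in that
// address space is not 0.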

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}
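
// Illustrative examples: an OpenCL work_group-scope atomic with relaxed
// ordering maps to the sync scope "workgroup-one-as", the same atomic with
// seq_cst ordering maps to "workgroup", and a system-scope seq_cst atomic maps
// to the empty (default) scope name.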

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (the block literal) is passed directly by value to the kernel. The kernel
/// allocates a struct of the same type on the stack, stores the block literal
/// into it, and passes its pointer to the block invoke function. The kernel
/// has the "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}
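
// Illustrative sketch of the generated wrapper (names are hypothetical): for a
// block invoke function @__foo_block_invoke taking the block literal pointer
// plus one local pointer, the kernel is roughly
//   define internal amdgpu_kernel void @__foo_block_invoke_kernel(
//       %block.literal.ty %literal, ptr addrspace(3) %local_arg1) {
//   entry:
//     ; alloca a %block.literal.ty, store the by-value literal into it, cast
//     ; the alloca to the invoke's first parameter type, then:
//     call void @__foo_block_invoke(...)
//     ret void
//   }
// carrying the "enqueued-block" attribute and the kernel_arg_* metadata set
// above.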

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}
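
// Illustrative examples of the resulting function attribute:
//   __attribute__((amdgpu_flat_work_group_size(1, 256)))
//       -> "amdgpu-flat-work-group-size"="1,256"
//   __attribute__((reqd_work_group_size(8, 8, 4))) alone
//       -> Min = Max = 8 * 8 * 4 = 256, i.e. "256,256"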

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}
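
// Illustrative examples of the resulting function attribute:
//   __attribute__((amdgpu_waves_per_eu(2, 4))) -> "amdgpu-waves-per-eu"="2,4"
//   __attribute__((amdgpu_waves_per_eu(2)))    -> "amdgpu-waves-per-eu"="2"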

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}