/src/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- SIInsertHardClauses.cpp - Insert Hard Clauses ----------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | /// \file |
10 | | /// Insert s_clause instructions to form hard clauses. |
11 | | /// |
12 | | /// Clausing load instructions can give cache coherency benefits. Before gfx10, |
13 | | /// the hardware automatically detected "soft clauses", which were sequences of |
14 | | /// memory instructions of the same type. In gfx10 this detection was removed, |
15 | | /// and the s_clause instruction was introduced to explicitly mark "hard |
16 | | /// clauses". |
17 | | /// |
18 | | /// It's the scheduler's job to form the clauses by putting similar memory |
19 | | /// instructions next to each other. Our job is just to insert an s_clause |
20 | | /// instruction to mark the start of each clause. |
21 | | /// |
22 | | /// Note that hard clauses are very similar to, but logically distinct from, the |
23 | | /// groups of instructions that have to be restartable when XNACK is enabled. |
24 | | /// The rules are slightly different in each case. For example an s_nop |
25 | | /// instruction breaks a restartable group, but can appear in the middle of a |
26 | | /// hard clause. (Before gfx10 there wasn't a distinction, and both were called |
27 | | /// "soft clauses" or just "clauses".) |
28 | | /// |
29 | | /// The SIFormMemoryClauses pass and GCNHazardRecognizer deal with restartable |
30 | | /// groups, not hard clauses. |
31 | | // |
32 | | //===----------------------------------------------------------------------===// |
33 | | |
34 | | #include "AMDGPU.h" |
35 | | #include "GCNSubtarget.h" |
36 | | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
37 | | #include "llvm/ADT/SmallVector.h" |
38 | | #include "llvm/CodeGen/MachineFunctionPass.h" |
39 | | |
40 | | using namespace llvm; |
41 | | |
42 | | #define DEBUG_TYPE "si-insert-hard-clauses" |
43 | | |
44 | | namespace { |
45 | | |
// Upper bound on the number of instructions this pass puts in one hard clause.
// The s_clause encoding could express a length of 64, but the hardware
// documentation (at least for GFX11) gives 63 as the maximum allowed.
constexpr unsigned MaxInstructionsInClause = 63U;
50 | | |
// Classification of an instruction with respect to hard clauses. The order of
// the enumerators matters: everything up to and including
// LAST_REAL_HARDCLAUSE_TYPE is a "real" clause type, and the remaining values
// are bookkeeping categories.
enum HardClauseType {
  // ---- GFX10 only ----

  // Texture, buffer, global or scratch memory instructions.
  HARDCLAUSE_VMEM,
  // Flat memory instructions (excluding global and scratch).
  HARDCLAUSE_FLAT,

  // ---- GFX11 only ----

  // Texture memory instructions, split by kind of access.
  HARDCLAUSE_MIMG_LOAD,
  HARDCLAUSE_MIMG_STORE,
  HARDCLAUSE_MIMG_ATOMIC,
  HARDCLAUSE_MIMG_SAMPLE,
  // Buffer, global or scratch memory instructions, split by kind of access.
  HARDCLAUSE_VMEM_LOAD,
  HARDCLAUSE_VMEM_STORE,
  HARDCLAUSE_VMEM_ATOMIC,
  // Flat memory instructions (excluding global and scratch), split by kind of
  // access.
  HARDCLAUSE_FLAT_LOAD,
  HARDCLAUSE_FLAT_STORE,
  HARDCLAUSE_FLAT_ATOMIC,
  // BVH instructions.
  HARDCLAUSE_BVH,

  // ---- Common to all generations ----

  // Instructions that access LDS.
  HARDCLAUSE_LDS,
  // Scalar memory instructions.
  HARDCLAUSE_SMEM,
  // VALU instructions.
  HARDCLAUSE_VALU,
  // Alias marking the last enumerator that names a real clause type.
  LAST_REAL_HARDCLAUSE_TYPE = HARDCLAUSE_VALU,

  // Internal instructions. Apart from s_waitcnt, these may appear in the
  // middle of a hard clause.
  HARDCLAUSE_INTERNAL,
  // Meta instructions, such as KILL, that produce no ISA at all.
  HARDCLAUSE_IGNORE,
  // Anything that must not appear in a hard clause: SALU, export, branch,
  // message, GDS, s_waitcnt and every instruction not covered above.
  HARDCLAUSE_ILLEGAL,
};
96 | | |
97 | | class SIInsertHardClauses : public MachineFunctionPass { |
98 | | public: |
99 | | static char ID; |
100 | | const GCNSubtarget *ST = nullptr; |
101 | | |
102 | 0 | SIInsertHardClauses() : MachineFunctionPass(ID) {} |
103 | | |
104 | 0 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
105 | 0 | AU.setPreservesCFG(); |
106 | 0 | MachineFunctionPass::getAnalysisUsage(AU); |
107 | 0 | } |
108 | | |
109 | 0 | HardClauseType getHardClauseType(const MachineInstr &MI) { |
110 | 0 | if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) { |
111 | 0 | if (ST->getGeneration() == AMDGPUSubtarget::GFX10) { |
112 | 0 | if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) { |
113 | 0 | if (ST->hasNSAClauseBug()) { |
114 | 0 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); |
115 | 0 | if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA) |
116 | 0 | return HARDCLAUSE_ILLEGAL; |
117 | 0 | } |
118 | 0 | return HARDCLAUSE_VMEM; |
119 | 0 | } |
120 | 0 | if (SIInstrInfo::isFLAT(MI)) |
121 | 0 | return HARDCLAUSE_FLAT; |
122 | 0 | } else { |
123 | 0 | assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11); |
124 | 0 | if (SIInstrInfo::isMIMG(MI)) { |
125 | 0 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); |
126 | 0 | const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = |
127 | 0 | AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); |
128 | 0 | if (BaseInfo->BVH) |
129 | 0 | return HARDCLAUSE_BVH; |
130 | 0 | if (BaseInfo->Sampler) |
131 | 0 | return HARDCLAUSE_MIMG_SAMPLE; |
132 | 0 | return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_MIMG_ATOMIC |
133 | 0 | : HARDCLAUSE_MIMG_LOAD |
134 | 0 | : HARDCLAUSE_MIMG_STORE; |
135 | 0 | } |
136 | 0 | if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) { |
137 | 0 | return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_VMEM_ATOMIC |
138 | 0 | : HARDCLAUSE_VMEM_LOAD |
139 | 0 | : HARDCLAUSE_VMEM_STORE; |
140 | 0 | } |
141 | 0 | if (SIInstrInfo::isFLAT(MI)) { |
142 | 0 | return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_FLAT_ATOMIC |
143 | 0 | : HARDCLAUSE_FLAT_LOAD |
144 | 0 | : HARDCLAUSE_FLAT_STORE; |
145 | 0 | } |
146 | 0 | } |
147 | | // TODO: LDS |
148 | 0 | if (SIInstrInfo::isSMRD(MI)) |
149 | 0 | return HARDCLAUSE_SMEM; |
150 | 0 | } |
151 | | |
152 | | // Don't form VALU clauses. It's not clear what benefit they give, if any. |
153 | | |
154 | | // In practice s_nop is the only internal instruction we're likely to see. |
155 | | // It's safe to treat the rest as illegal. |
156 | 0 | if (MI.getOpcode() == AMDGPU::S_NOP) |
157 | 0 | return HARDCLAUSE_INTERNAL; |
158 | 0 | if (MI.isMetaInstruction()) |
159 | 0 | return HARDCLAUSE_IGNORE; |
160 | 0 | return HARDCLAUSE_ILLEGAL; |
161 | 0 | } |
162 | | |
163 | | // Track information about a clause as we discover it. |
164 | | struct ClauseInfo { |
165 | | // The type of all (non-internal) instructions in the clause. |
166 | | HardClauseType Type = HARDCLAUSE_ILLEGAL; |
167 | | // The first (necessarily non-internal) instruction in the clause. |
168 | | MachineInstr *First = nullptr; |
169 | | // The last non-internal instruction in the clause. |
170 | | MachineInstr *Last = nullptr; |
171 | | // The length of the clause including any internal instructions in the |
172 | | // middle (but not at the end) of the clause. |
173 | | unsigned Length = 0; |
174 | | // Internal instructions at the and of a clause should not be included in |
175 | | // the clause. Count them in TrailingInternalLength until a new memory |
176 | | // instruction is added. |
177 | | unsigned TrailingInternalLength = 0; |
178 | | // The base operands of *Last. |
179 | | SmallVector<const MachineOperand *, 4> BaseOps; |
180 | | }; |
181 | | |
182 | 0 | bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) { |
183 | 0 | if (CI.First == CI.Last) |
184 | 0 | return false; |
185 | 0 | assert(CI.Length <= MaxInstructionsInClause && "Hard clause is too long!"); |
186 | | |
187 | 0 | auto &MBB = *CI.First->getParent(); |
188 | 0 | auto ClauseMI = |
189 | 0 | BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE)) |
190 | 0 | .addImm(CI.Length - 1); |
191 | 0 | finalizeBundle(MBB, ClauseMI->getIterator(), |
192 | 0 | std::next(CI.Last->getIterator())); |
193 | 0 | return true; |
194 | 0 | } |
195 | | |
196 | 0 | bool runOnMachineFunction(MachineFunction &MF) override { |
197 | 0 | if (skipFunction(MF.getFunction())) |
198 | 0 | return false; |
199 | | |
200 | 0 | ST = &MF.getSubtarget<GCNSubtarget>(); |
201 | 0 | if (!ST->hasHardClauses()) |
202 | 0 | return false; |
203 | | |
204 | 0 | const SIInstrInfo *SII = ST->getInstrInfo(); |
205 | 0 | const TargetRegisterInfo *TRI = ST->getRegisterInfo(); |
206 | |
|
207 | 0 | bool Changed = false; |
208 | 0 | for (auto &MBB : MF) { |
209 | 0 | ClauseInfo CI; |
210 | 0 | for (auto &MI : MBB) { |
211 | 0 | HardClauseType Type = getHardClauseType(MI); |
212 | |
|
213 | 0 | int64_t Dummy1; |
214 | 0 | bool Dummy2; |
215 | 0 | unsigned Dummy3; |
216 | 0 | SmallVector<const MachineOperand *, 4> BaseOps; |
217 | 0 | if (Type <= LAST_REAL_HARDCLAUSE_TYPE) { |
218 | 0 | if (!SII->getMemOperandsWithOffsetWidth(MI, BaseOps, Dummy1, Dummy2, |
219 | 0 | Dummy3, TRI)) { |
220 | | // We failed to get the base operands, so we'll never clause this |
221 | | // instruction with any other, so pretend it's illegal. |
222 | 0 | Type = HARDCLAUSE_ILLEGAL; |
223 | 0 | } |
224 | 0 | } |
225 | |
|
226 | 0 | if (CI.Length == MaxInstructionsInClause || |
227 | 0 | (CI.Length && Type != HARDCLAUSE_INTERNAL && |
228 | 0 | Type != HARDCLAUSE_IGNORE && |
229 | 0 | (Type != CI.Type || |
230 | | // Note that we lie to shouldClusterMemOps about the size of the |
231 | | // cluster. When shouldClusterMemOps is called from the machine |
232 | | // scheduler it limits the size of the cluster to avoid increasing |
233 | | // register pressure too much, but this pass runs after register |
234 | | // allocation so there is no need for that kind of limit. |
235 | | // We also lie about the Offset and OffsetIsScalable parameters, |
236 | | // as they aren't used in the SIInstrInfo implementation. |
237 | 0 | !SII->shouldClusterMemOps(CI.BaseOps, 0, false, BaseOps, 0, false, |
238 | 0 | 2, 2)))) { |
239 | | // Finish the current clause. |
240 | 0 | Changed |= emitClause(CI, SII); |
241 | 0 | CI = ClauseInfo(); |
242 | 0 | } |
243 | |
|
244 | 0 | if (CI.Length) { |
245 | | // Extend the current clause. |
246 | 0 | if (Type != HARDCLAUSE_IGNORE) { |
247 | 0 | if (Type == HARDCLAUSE_INTERNAL) { |
248 | 0 | ++CI.TrailingInternalLength; |
249 | 0 | } else { |
250 | 0 | ++CI.Length; |
251 | 0 | CI.Length += CI.TrailingInternalLength; |
252 | 0 | CI.TrailingInternalLength = 0; |
253 | 0 | CI.Last = &MI; |
254 | 0 | CI.BaseOps = std::move(BaseOps); |
255 | 0 | } |
256 | 0 | } |
257 | 0 | } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) { |
258 | | // Start a new clause. |
259 | 0 | CI = ClauseInfo{Type, &MI, &MI, 1, 0, std::move(BaseOps)}; |
260 | 0 | } |
261 | 0 | } |
262 | | |
263 | | // Finish the last clause in the basic block if any. |
264 | 0 | if (CI.Length) |
265 | 0 | Changed |= emitClause(CI, SII); |
266 | 0 | } |
267 | |
|
268 | 0 | return Changed; |
269 | 0 | } |
270 | | }; |
271 | | |
272 | | } // namespace |
273 | | |
274 | | char SIInsertHardClauses::ID = 0; |
275 | | |
276 | | char &llvm::SIInsertHardClausesID = SIInsertHardClauses::ID; |
277 | | |
278 | | INITIALIZE_PASS(SIInsertHardClauses, DEBUG_TYPE, "SI Insert Hard Clauses", |
279 | | false, false) |