/src/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
Line | Count | Source |
1 | | //===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This pass lowers all occurrences of i1 values (with a vreg_1 register class) |
10 | | // to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA |
11 | | // form and a wave-level control flow graph. |
12 | | // |
13 | | // Before this pass, values that are semantically i1 and are defined and used |
14 | | // within the same basic block are already represented as lane masks in scalar |
15 | | // registers. However, values that cross basic blocks are always transferred |
16 | | // between basic blocks in vreg_1 virtual registers and are lowered by this |
17 | | // pass. |
18 | | // |
19 | | // The only instructions that use or define vreg_1 virtual registers are COPY, |
20 | | // PHI, and IMPLICIT_DEF. |
21 | | // |
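// As an illustrative sketch only (hypothetical MIR, wave64 naming, not actual
// pass output), a value that crosses blocks such as
//
//   bb.1:
//     %cond:vreg_1 = COPY $vcc
//   bb.2:
//     %val:vreg_1 = PHI %cond, %bb.1, %other, %bb.3
//
// ends up carried in SReg_64 lane-mask registers, with the PHI incomings
// combined via S_AND_B64 / S_ANDN2_B64 / S_OR_B64 against EXEC wherever
// results from different lanes have to be merged.
//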
22 | | //===----------------------------------------------------------------------===// |
23 | | |
24 | | #include "SILowerI1Copies.h" |
25 | | #include "AMDGPU.h" |
26 | | #include "llvm/CodeGen/MachineSSAUpdater.h" |
27 | | #include "llvm/InitializePasses.h" |
28 | | #include "llvm/Target/CGPassBuilderOption.h" |
29 | | |
30 | | #define DEBUG_TYPE "si-i1-copies" |
31 | | |
32 | | using namespace llvm; |
33 | | |
34 | | static Register insertUndefLaneMask(MachineBasicBlock *MBB, |
35 | | MachineRegisterInfo *MRI, |
36 | | Register LaneMaskRegAttrs); |
37 | | |
38 | | namespace { |
39 | | |
40 | | class SILowerI1Copies : public MachineFunctionPass { |
41 | | public: |
42 | | static char ID; |
43 | | |
44 | 0 | SILowerI1Copies() : MachineFunctionPass(ID) { |
45 | 0 | initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); |
46 | 0 | } |
47 | | |
48 | | bool runOnMachineFunction(MachineFunction &MF) override; |
49 | | |
50 | 0 | StringRef getPassName() const override { return "SI Lower i1 Copies"; } |
51 | | |
52 | 0 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
53 | 0 | AU.setPreservesCFG(); |
54 | 0 | AU.addRequired<MachineDominatorTree>(); |
55 | 0 | AU.addRequired<MachinePostDominatorTree>(); |
56 | 0 | MachineFunctionPass::getAnalysisUsage(AU); |
57 | 0 | } |
58 | | }; |
59 | | |
60 | | class Vreg1LoweringHelper : public PhiLoweringHelper { |
61 | | public: |
62 | | Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, |
63 | | MachinePostDominatorTree *PDT); |
64 | | |
65 | | private: |
66 | | DenseSet<Register> ConstrainRegs; |
67 | | |
68 | | public: |
69 | | void markAsLaneMask(Register DstReg) const override; |
70 | | void getCandidatesForLowering( |
71 | | SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override; |
72 | | void collectIncomingValuesFromPhi( |
73 | | const MachineInstr *MI, |
74 | | SmallVectorImpl<Incoming> &Incomings) const override; |
75 | | void replaceDstReg(Register NewReg, Register OldReg, |
76 | | MachineBasicBlock *MBB) override; |
77 | | void buildMergeLaneMasks(MachineBasicBlock &MBB, |
78 | | MachineBasicBlock::iterator I, const DebugLoc &DL, |
79 | | Register DstReg, Register PrevReg, |
80 | | Register CurReg) override; |
81 | | void constrainIncomingRegisterTakenAsIs(Incoming &In) override; |
82 | | |
83 | | bool lowerCopiesFromI1(); |
84 | | bool lowerCopiesToI1(); |
85 | | bool cleanConstrainRegs(bool Changed); |
86 | 0 | bool isVreg1(Register Reg) const { |
87 | 0 | return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass; |
88 | 0 | } |
89 | | }; |
90 | | |
91 | | Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF, |
92 | | MachineDominatorTree *DT, |
93 | | MachinePostDominatorTree *PDT) |
94 | 0 | : PhiLoweringHelper(MF, DT, PDT) {} |
95 | | |
96 | 0 | bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) { |
97 | 0 | assert(Changed || ConstrainRegs.empty()); |
98 | 0 | for (Register Reg : ConstrainRegs) |
99 | 0 | MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); |
100 | 0 | ConstrainRegs.clear(); |
101 | |
102 | 0 | return Changed; |
103 | 0 | } |
104 | | |
105 | | /// Helper class that examines the relationship between incoming values of a |
106 | | /// phi in the control flow graph to determine where an incoming value can |
107 | | /// simply be taken as a scalar lane mask as-is, and where it needs to be |
108 | | /// merged with another, previously defined lane mask. |
109 | | /// |
110 | | /// The approach is as follows: |
111 | | /// - Determine all basic blocks which, starting from the incoming blocks, |
112 | | /// a wave may reach before entering the def block (the block containing the |
113 | | /// phi). |
114 | | /// - If an incoming block has no predecessors in this set, we can take the |
115 | | /// incoming value as a scalar lane mask as-is. |
116 | | /// -- A special case of this is when the def block has a self-loop. |
117 | | /// - Otherwise, the incoming value needs to be merged with a previously |
118 | | /// defined lane mask. |
119 | | /// - If there is a path into the set of reachable blocks that does _not_ go |
120 | | /// through an incoming block where we can take the scalar lane mask as-is, |
121 | | /// we need to invent an available value for the SSAUpdater. Choices are |
122 | | /// 0 and undef, with differing consequences for how to merge values etc. |
123 | | /// |
124 | | /// TODO: We could use region analysis to quickly skip over SESE regions during |
125 | | /// the traversal. |
126 | | /// |
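/// As a purely illustrative example (hypothetical CFG, not taken from the
/// code below): let D be the def block, with incoming values from A and B,
///
///   E
///   |
///   A --+     A ends in a divergent branch and is post-dominated by D.
///   |   |
///   B   |
///    \  |
///     D       D contains the phi.
///
/// The reachable set is {D, A, B}. A has no predecessor inside that set, so
/// its incoming value can be taken as-is; B's only predecessor A is inside
/// the set, so B's incoming value has to be merged with the previously
/// defined lane mask.
///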
127 | | class PhiIncomingAnalysis { |
128 | | MachinePostDominatorTree &PDT; |
129 | | const SIInstrInfo *TII; |
130 | | |
131 | | // For each reachable basic block, whether it is a source in the induced |
132 | | // subgraph of the CFG. |
133 | | DenseMap<MachineBasicBlock *, bool> ReachableMap; |
134 | | SmallVector<MachineBasicBlock *, 4> ReachableOrdered; |
135 | | SmallVector<MachineBasicBlock *, 4> Stack; |
136 | | SmallVector<MachineBasicBlock *, 4> Predecessors; |
137 | | |
138 | | public: |
139 | | PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII) |
140 | 0 | : PDT(PDT), TII(TII) {} |
141 | | |
142 | | /// Returns whether \p MBB is a source in the induced subgraph of reachable |
143 | | /// blocks. |
144 | 0 | bool isSource(MachineBasicBlock &MBB) const { |
145 | 0 | return ReachableMap.find(&MBB)->second; |
146 | 0 | } |
147 | | |
148 | 0 | ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; } |
149 | | |
150 | 0 | void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) { |
151 | 0 | assert(Stack.empty()); |
152 | 0 | ReachableMap.clear(); |
153 | 0 | ReachableOrdered.clear(); |
154 | 0 | Predecessors.clear(); |
155 | | |
156 | | // Insert the def block first, so that it acts as an end point for the |
157 | | // traversal. |
158 | 0 | ReachableMap.try_emplace(&DefBlock, false); |
159 | 0 | ReachableOrdered.push_back(&DefBlock); |
160 | |
161 | 0 | for (auto Incoming : Incomings) { |
162 | 0 | MachineBasicBlock *MBB = Incoming.Block; |
163 | 0 | if (MBB == &DefBlock) { |
164 | 0 | ReachableMap[&DefBlock] = true; // self-loop on DefBlock |
165 | 0 | continue; |
166 | 0 | } |
167 | | |
168 | 0 | ReachableMap.try_emplace(MBB, false); |
169 | 0 | ReachableOrdered.push_back(MBB); |
170 | | |
171 | | // If this block has a divergent terminator and the def block is its |
172 | | // post-dominator, the wave may first visit the other successors. |
173 | 0 | if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB)) |
174 | 0 | append_range(Stack, MBB->successors()); |
175 | 0 | } |
176 | |
177 | 0 | while (!Stack.empty()) { |
178 | 0 | MachineBasicBlock *MBB = Stack.pop_back_val(); |
179 | 0 | if (!ReachableMap.try_emplace(MBB, false).second) |
180 | 0 | continue; |
181 | 0 | ReachableOrdered.push_back(MBB); |
182 | |
183 | 0 | append_range(Stack, MBB->successors()); |
184 | 0 | } |
185 | |
186 | 0 | for (MachineBasicBlock *MBB : ReachableOrdered) { |
187 | 0 | bool HaveReachablePred = false; |
188 | 0 | for (MachineBasicBlock *Pred : MBB->predecessors()) { |
189 | 0 | if (ReachableMap.count(Pred)) { |
190 | 0 | HaveReachablePred = true; |
191 | 0 | } else { |
192 | 0 | Stack.push_back(Pred); |
193 | 0 | } |
194 | 0 | } |
195 | 0 | if (!HaveReachablePred) |
196 | 0 | ReachableMap[MBB] = true; |
197 | 0 | if (HaveReachablePred) { |
198 | 0 | for (MachineBasicBlock *UnreachablePred : Stack) { |
199 | 0 | if (!llvm::is_contained(Predecessors, UnreachablePred)) |
200 | 0 | Predecessors.push_back(UnreachablePred); |
201 | 0 | } |
202 | 0 | } |
203 | 0 | Stack.clear(); |
204 | 0 | } |
205 | 0 | } |
206 | | }; |
207 | | |
208 | | /// Helper class that detects loops which require us to lower an i1 COPY into |
209 | | /// bitwise manipulation. |
210 | | /// |
211 | | /// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish |
212 | | /// between loops with the same header. Consider this example: |
213 | | /// |
214 | | /// A-+-+ |
215 | | /// | | | |
216 | | /// B-+ | |
217 | | /// | | |
218 | | /// C---+ |
219 | | /// |
220 | | /// A is the header of a loop containing A, B, and C as far as LoopInfo is |
221 | | /// concerned. However, an i1 COPY in B that is used in C must be lowered to |
222 | | /// bitwise operations to combine results from different loop iterations when |
223 | | /// B has a divergent branch (since by default we will compile this code such |
224 | | /// that threads in a wave are merged at the entry of C). |
225 | | /// |
226 | | /// The following rule is implemented to determine whether bitwise operations |
227 | | /// are required: use the bitwise lowering for a def in block B if a backward |
228 | | /// edge to B is reachable without going through the nearest common |
229 | | /// post-dominator of B and all uses of the def. |
230 | | /// |
231 | | /// TODO: This rule is conservative because it does not check whether the |
232 | | /// relevant branches are actually divergent. |
233 | | /// |
234 | | /// The class is designed to cache the CFG traversal so that it can be re-used |
235 | | /// for multiple defs within the same basic block. |
236 | | /// |
237 | | /// TODO: We could use region analysis to quickly skip over SESE regions during |
238 | | /// the traversal. |
239 | | /// |
240 | | class LoopFinder { |
241 | | MachineDominatorTree &DT; |
242 | | MachinePostDominatorTree &PDT; |
243 | | |
244 | | // All visited / reachable blocks, tagged by level (level 0 is the def block, |
245 | | // level 1 are all blocks reachable including but not going through the def |
246 | | // block's IPDOM, etc.). |
247 | | DenseMap<MachineBasicBlock *, unsigned> Visited; |
248 | | |
249 | | // Nearest common dominator of all visited blocks by level (level 0 is the |
250 | | // def block). Used for seeding the SSAUpdater. |
251 | | SmallVector<MachineBasicBlock *, 4> CommonDominators; |
252 | | |
253 | | // Post-dominator of all visited blocks. |
254 | | MachineBasicBlock *VisitedPostDom = nullptr; |
255 | | |
256 | | // Level at which a loop was found: 0 is not possible; 1 = a backward edge is |
257 | | // reachable without going through the IPDOM of the def block (if the IPDOM |
258 | | // itself has an edge to the def block, the loop level is 2), etc. |
259 | | unsigned FoundLoopLevel = ~0u; |
260 | | |
261 | | MachineBasicBlock *DefBlock = nullptr; |
262 | | SmallVector<MachineBasicBlock *, 4> Stack; |
263 | | SmallVector<MachineBasicBlock *, 4> NextLevel; |
264 | | |
265 | | public: |
266 | | LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT) |
267 | 0 | : DT(DT), PDT(PDT) {} |
268 | | |
269 | 0 | void initialize(MachineBasicBlock &MBB) { |
270 | 0 | Visited.clear(); |
271 | 0 | CommonDominators.clear(); |
272 | 0 | Stack.clear(); |
273 | 0 | NextLevel.clear(); |
274 | 0 | VisitedPostDom = nullptr; |
275 | 0 | FoundLoopLevel = ~0u; |
276 | |
277 | 0 | DefBlock = &MBB; |
278 | 0 | } |
279 | | |
280 | | /// Check whether a backward edge can be reached without going through the |
281 | | /// given \p PostDom of the def block. |
282 | | /// |
283 | | /// Return the level of \p PostDom if a loop was found, or 0 otherwise. |
284 | 0 | unsigned findLoop(MachineBasicBlock *PostDom) { |
285 | 0 | MachineDomTreeNode *PDNode = PDT.getNode(DefBlock); |
286 | |
287 | 0 | if (!VisitedPostDom) |
288 | 0 | advanceLevel(); |
289 | |
290 | 0 | unsigned Level = 0; |
291 | 0 | while (PDNode->getBlock() != PostDom) { |
292 | 0 | if (PDNode->getBlock() == VisitedPostDom) |
293 | 0 | advanceLevel(); |
294 | 0 | PDNode = PDNode->getIDom(); |
295 | 0 | Level++; |
296 | 0 | if (FoundLoopLevel == Level) |
297 | 0 | return Level; |
298 | 0 | } |
299 | | |
300 | 0 | return 0; |
301 | 0 | } |
302 | | |
303 | | /// Add undef values dominating the loop and the optionally given additional |
304 | | /// blocks, so that the SSA updater doesn't have to search all the way to the |
305 | | /// function entry. |
306 | | void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater, |
307 | | MachineRegisterInfo &MRI, Register LaneMaskRegAttrs, |
308 | 0 | ArrayRef<Incoming> Incomings = {}) { |
309 | 0 | assert(LoopLevel < CommonDominators.size()); |
310 | | |
311 | 0 | MachineBasicBlock *Dom = CommonDominators[LoopLevel]; |
312 | 0 | for (auto &Incoming : Incomings) |
313 | 0 | Dom = DT.findNearestCommonDominator(Dom, Incoming.Block); |
314 | |
315 | 0 | if (!inLoopLevel(*Dom, LoopLevel, Incomings)) { |
316 | 0 | SSAUpdater.AddAvailableValue( |
317 | 0 | Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs)); |
318 | 0 | } else { |
319 | | // The dominator is part of the loop or the given blocks, so add the |
320 | | // undef value to unreachable predecessors instead. |
321 | 0 | for (MachineBasicBlock *Pred : Dom->predecessors()) { |
322 | 0 | if (!inLoopLevel(*Pred, LoopLevel, Incomings)) |
323 | 0 | SSAUpdater.AddAvailableValue( |
324 | 0 | Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs)); |
325 | 0 | } |
326 | 0 | } |
327 | 0 | } |
328 | | |
329 | | private: |
330 | | bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel, |
331 | 0 | ArrayRef<Incoming> Incomings) const { |
332 | 0 | auto DomIt = Visited.find(&MBB); |
333 | 0 | if (DomIt != Visited.end() && DomIt->second <= LoopLevel) |
334 | 0 | return true; |
335 | | |
336 | 0 | for (auto &Incoming : Incomings) |
337 | 0 | if (Incoming.Block == &MBB) |
338 | 0 | return true; |
339 | | |
340 | 0 | return false; |
341 | 0 | } |
342 | | |
343 | 0 | void advanceLevel() { |
344 | 0 | MachineBasicBlock *VisitedDom; |
345 | |
346 | 0 | if (!VisitedPostDom) { |
347 | 0 | VisitedPostDom = DefBlock; |
348 | 0 | VisitedDom = DefBlock; |
349 | 0 | Stack.push_back(DefBlock); |
350 | 0 | } else { |
351 | 0 | VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock(); |
352 | 0 | VisitedDom = CommonDominators.back(); |
353 | |
354 | 0 | for (unsigned i = 0; i < NextLevel.size();) { |
355 | 0 | if (PDT.dominates(VisitedPostDom, NextLevel[i])) { |
356 | 0 | Stack.push_back(NextLevel[i]); |
357 | |
358 | 0 | NextLevel[i] = NextLevel.back(); |
359 | 0 | NextLevel.pop_back(); |
360 | 0 | } else { |
361 | 0 | i++; |
362 | 0 | } |
363 | 0 | } |
364 | 0 | } |
365 | |
366 | 0 | unsigned Level = CommonDominators.size(); |
367 | 0 | while (!Stack.empty()) { |
368 | 0 | MachineBasicBlock *MBB = Stack.pop_back_val(); |
369 | 0 | if (!PDT.dominates(VisitedPostDom, MBB)) |
370 | 0 | NextLevel.push_back(MBB); |
371 | |
372 | 0 | Visited[MBB] = Level; |
373 | 0 | VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB); |
374 | |
375 | 0 | for (MachineBasicBlock *Succ : MBB->successors()) { |
376 | 0 | if (Succ == DefBlock) { |
377 | 0 | if (MBB == VisitedPostDom) |
378 | 0 | FoundLoopLevel = std::min(FoundLoopLevel, Level + 1); |
379 | 0 | else |
380 | 0 | FoundLoopLevel = std::min(FoundLoopLevel, Level); |
381 | 0 | continue; |
382 | 0 | } |
383 | | |
384 | 0 | if (Visited.try_emplace(Succ, ~0u).second) { |
385 | 0 | if (MBB == VisitedPostDom) |
386 | 0 | NextLevel.push_back(Succ); |
387 | 0 | else |
388 | 0 | Stack.push_back(Succ); |
389 | 0 | } |
390 | 0 | } |
391 | 0 | } |
392 | |
393 | 0 | CommonDominators.push_back(VisitedDom); |
394 | 0 | } |
395 | | }; |
396 | | |
397 | | } // End anonymous namespace. |
398 | | |
399 | 62 | INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, |
400 | 62 | false) |
401 | 62 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) |
402 | 62 | INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) |
403 | 62 | INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, |
404 | | false) |
405 | | |
406 | | char SILowerI1Copies::ID = 0; |
407 | | |
408 | | char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID; |
409 | | |
410 | 0 | FunctionPass *llvm::createSILowerI1CopiesPass() { |
411 | 0 | return new SILowerI1Copies(); |
412 | 0 | } |
413 | | |
414 | | Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI, |
415 | 0 | Register LaneMaskRegAttrs) { |
416 | 0 | return MRI->cloneVirtualRegister(LaneMaskRegAttrs); |
417 | 0 | } |
418 | | |
419 | | static Register insertUndefLaneMask(MachineBasicBlock *MBB, |
420 | | MachineRegisterInfo *MRI, |
421 | 0 | Register LaneMaskRegAttrs) { |
422 | 0 | MachineFunction &MF = *MBB->getParent(); |
423 | 0 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
424 | 0 | const SIInstrInfo *TII = ST.getInstrInfo(); |
425 | 0 | Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); |
426 | 0 | BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF), |
427 | 0 | UndefReg); |
428 | 0 | return UndefReg; |
429 | 0 | } |
430 | | |
431 | | /// Lower all instructions that def or use vreg_1 registers. |
432 | | /// |
433 | | /// In a first pass, we lower COPYs from vreg_1 to vector registers, as can |
434 | | /// occur around inline assembly. We do this first, before vreg_1 registers |
435 | | /// are changed to scalar mask registers. |
436 | | /// |
437 | | /// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before |
438 | | /// all others, because phi lowering looks through copies and can therefore |
439 | | /// often make copy lowering unnecessary. |
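///
/// As a rough sketch (hypothetical virtual register names), a copy out of an
/// i1 value such as
///   %v:vgpr_32 = COPY %b:vreg_1
/// is rewritten into a per-lane select of -1 / 0,
///   %v:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %b
/// and %b is constrained to SReg_1_XEXEC (see cleanConstrainRegs()) so that it
/// remains usable as the condition operand.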
440 | 0 | bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { |
441 | | // Only need to run this in SelectionDAG path. |
442 | 0 | if (TheMF.getProperties().hasProperty( |
443 | 0 | MachineFunctionProperties::Property::Selected)) |
444 | 0 | return false; |
445 | | |
446 | 0 | Vreg1LoweringHelper Helper(&TheMF, &getAnalysis<MachineDominatorTree>(), |
447 | 0 | &getAnalysis<MachinePostDominatorTree>()); |
448 | |
449 | 0 | bool Changed = false; |
450 | 0 | Changed |= Helper.lowerCopiesFromI1(); |
451 | 0 | Changed |= Helper.lowerPhis(); |
452 | 0 | Changed |= Helper.lowerCopiesToI1(); |
453 | 0 | return Helper.cleanConstrainRegs(Changed); |
454 | 0 | } |
455 | | |
456 | | #ifndef NDEBUG |
457 | | static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, |
458 | | const MachineRegisterInfo &MRI, |
459 | 0 | Register Reg) { |
460 | 0 | unsigned Size = TRI.getRegSizeInBits(Reg, MRI); |
461 | 0 | return Size == 1 || Size == 32; |
462 | 0 | } |
463 | | #endif |
464 | | |
465 | 0 | bool Vreg1LoweringHelper::lowerCopiesFromI1() { |
466 | 0 | bool Changed = false; |
467 | 0 | SmallVector<MachineInstr *, 4> DeadCopies; |
468 | |
469 | 0 | for (MachineBasicBlock &MBB : *MF) { |
470 | 0 | for (MachineInstr &MI : MBB) { |
471 | 0 | if (MI.getOpcode() != AMDGPU::COPY) |
472 | 0 | continue; |
473 | | |
474 | 0 | Register DstReg = MI.getOperand(0).getReg(); |
475 | 0 | Register SrcReg = MI.getOperand(1).getReg(); |
476 | 0 | if (!isVreg1(SrcReg)) |
477 | 0 | continue; |
478 | | |
479 | 0 | if (isLaneMaskReg(DstReg) || isVreg1(DstReg)) |
480 | 0 | continue; |
481 | | |
482 | 0 | Changed = true; |
483 | | |
484 | | // Copy into a 32-bit vector register. |
485 | 0 | LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI); |
486 | 0 | DebugLoc DL = MI.getDebugLoc(); |
487 | |
488 | 0 | assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg)); |
489 | 0 | assert(!MI.getOperand(0).getSubReg()); |
490 | | |
491 | 0 | ConstrainRegs.insert(SrcReg); |
492 | 0 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
493 | 0 | .addImm(0) |
494 | 0 | .addImm(0) |
495 | 0 | .addImm(0) |
496 | 0 | .addImm(-1) |
497 | 0 | .addReg(SrcReg); |
498 | 0 | DeadCopies.push_back(&MI); |
499 | 0 | } |
500 | |
501 | 0 | for (MachineInstr *MI : DeadCopies) |
502 | 0 | MI->eraseFromParent(); |
503 | 0 | DeadCopies.clear(); |
504 | 0 | } |
505 | 0 | return Changed; |
506 | 0 | } |
507 | | |
508 | | PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF, |
509 | | MachineDominatorTree *DT, |
510 | | MachinePostDominatorTree *PDT) |
511 | 0 | : MF(MF), DT(DT), PDT(PDT) { |
512 | 0 | MRI = &MF->getRegInfo(); |
513 | |
514 | 0 | ST = &MF->getSubtarget<GCNSubtarget>(); |
515 | 0 | TII = ST->getInstrInfo(); |
516 | 0 | IsWave32 = ST->isWave32(); |
517 | |
518 | 0 | if (IsWave32) { |
519 | 0 | ExecReg = AMDGPU::EXEC_LO; |
520 | 0 | MovOp = AMDGPU::S_MOV_B32; |
521 | 0 | AndOp = AMDGPU::S_AND_B32; |
522 | 0 | OrOp = AMDGPU::S_OR_B32; |
523 | 0 | XorOp = AMDGPU::S_XOR_B32; |
524 | 0 | AndN2Op = AMDGPU::S_ANDN2_B32; |
525 | 0 | OrN2Op = AMDGPU::S_ORN2_B32; |
526 | 0 | } else { |
527 | 0 | ExecReg = AMDGPU::EXEC; |
528 | 0 | MovOp = AMDGPU::S_MOV_B64; |
529 | 0 | AndOp = AMDGPU::S_AND_B64; |
530 | 0 | OrOp = AMDGPU::S_OR_B64; |
531 | 0 | XorOp = AMDGPU::S_XOR_B64; |
532 | 0 | AndN2Op = AMDGPU::S_ANDN2_B64; |
533 | 0 | OrN2Op = AMDGPU::S_ORN2_B64; |
534 | 0 | } |
535 | 0 | } |
536 | | |
537 | 0 | bool PhiLoweringHelper::lowerPhis() { |
538 | 0 | MachineSSAUpdater SSAUpdater(*MF); |
539 | 0 | LoopFinder LF(*DT, *PDT); |
540 | 0 | PhiIncomingAnalysis PIA(*PDT, TII); |
541 | 0 | SmallVector<MachineInstr *, 4> Vreg1Phis; |
542 | 0 | SmallVector<Incoming, 4> Incomings; |
543 | |
544 | 0 | getCandidatesForLowering(Vreg1Phis); |
545 | 0 | if (Vreg1Phis.empty()) |
546 | 0 | return false; |
547 | | |
548 | 0 | DT->getBase().updateDFSNumbers(); |
549 | 0 | MachineBasicBlock *PrevMBB = nullptr; |
550 | 0 | for (MachineInstr *MI : Vreg1Phis) { |
551 | 0 | MachineBasicBlock &MBB = *MI->getParent(); |
552 | 0 | if (&MBB != PrevMBB) { |
553 | 0 | LF.initialize(MBB); |
554 | 0 | PrevMBB = &MBB; |
555 | 0 | } |
556 | |
557 | 0 | LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI); |
558 | |
559 | 0 | Register DstReg = MI->getOperand(0).getReg(); |
560 | 0 | markAsLaneMask(DstReg); |
561 | 0 | initializeLaneMaskRegisterAttributes(DstReg); |
562 | |
563 | 0 | collectIncomingValuesFromPhi(MI, Incomings); |
564 | | |
565 | | // Sort the incomings such that incoming values that dominate other incoming |
566 | | // values are sorted earlier. This allows us to do some amount of on-the-fly |
567 | | // constant folding. |
568 | | // Incomings with smaller DFSNumIn go first; DFSNumIn is 0 for the entry block. |
569 | 0 | llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) { |
570 | 0 | return DT->getNode(LHS.Block)->getDFSNumIn() < |
571 | 0 | DT->getNode(RHS.Block)->getDFSNumIn(); |
572 | 0 | }); |
573 | |
574 | 0 | #ifndef NDEBUG |
575 | 0 | PhiRegisters.insert(DstReg); |
576 | 0 | #endif |
577 | | |
578 | | // Phis in a loop that are observed outside the loop receive a simple but |
579 | | // conservatively correct treatment. |
580 | 0 | std::vector<MachineBasicBlock *> DomBlocks = {&MBB}; |
581 | 0 | for (MachineInstr &Use : MRI->use_instructions(DstReg)) |
582 | 0 | DomBlocks.push_back(Use.getParent()); |
583 | |
584 | 0 | MachineBasicBlock *PostDomBound = |
585 | 0 | PDT->findNearestCommonDominator(DomBlocks); |
586 | | |
587 | | // FIXME: This fails to find irreducible cycles. If we have a def (other |
588 | | // than a constant) in a pair of blocks that end up looping back to each |
589 | | // other, it will be mishandled. Due to structurization this shouldn't occur |
590 | | // in practice. |
591 | 0 | unsigned FoundLoopLevel = LF.findLoop(PostDomBound); |
592 | |
593 | 0 | SSAUpdater.Initialize(DstReg); |
594 | |
595 | 0 | if (FoundLoopLevel) { |
596 | 0 | LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs, |
597 | 0 | Incomings); |
598 | |
599 | 0 | for (auto &Incoming : Incomings) { |
600 | 0 | Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); |
601 | 0 | SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg); |
602 | 0 | } |
603 | |
604 | 0 | for (auto &Incoming : Incomings) { |
605 | 0 | MachineBasicBlock &IMBB = *Incoming.Block; |
606 | 0 | buildMergeLaneMasks( |
607 | 0 | IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg, |
608 | 0 | SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg); |
609 | 0 | } |
610 | 0 | } else { |
611 | | // The phi is not observed from outside a loop. Use a more accurate |
612 | | // lowering. |
613 | 0 | PIA.analyze(MBB, Incomings); |
614 | |
|
615 | 0 | for (MachineBasicBlock *MBB : PIA.predecessors()) |
616 | 0 | SSAUpdater.AddAvailableValue( |
617 | 0 | MBB, insertUndefLaneMask(MBB, MRI, LaneMaskRegAttrs)); |
618 | |
619 | 0 | for (auto &Incoming : Incomings) { |
620 | 0 | MachineBasicBlock &IMBB = *Incoming.Block; |
621 | 0 | if (PIA.isSource(IMBB)) { |
622 | 0 | constrainIncomingRegisterTakenAsIs(Incoming); |
623 | 0 | SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg); |
624 | 0 | } else { |
625 | 0 | Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); |
626 | 0 | SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg); |
627 | 0 | } |
628 | 0 | } |
629 | |
630 | 0 | for (auto &Incoming : Incomings) { |
631 | 0 | if (!Incoming.UpdatedReg.isValid()) |
632 | 0 | continue; |
633 | | |
634 | 0 | MachineBasicBlock &IMBB = *Incoming.Block; |
635 | 0 | buildMergeLaneMasks( |
636 | 0 | IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg, |
637 | 0 | SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg); |
638 | 0 | } |
639 | 0 | } |
640 | |
641 | 0 | Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB); |
642 | 0 | if (NewReg != DstReg) { |
643 | 0 | replaceDstReg(NewReg, DstReg, &MBB); |
644 | 0 | MI->eraseFromParent(); |
645 | 0 | } |
646 | |
647 | 0 | Incomings.clear(); |
648 | 0 | } |
649 | 0 | return true; |
650 | 0 | } |
651 | | |
652 | 0 | bool Vreg1LoweringHelper::lowerCopiesToI1() { |
653 | 0 | bool Changed = false; |
654 | 0 | MachineSSAUpdater SSAUpdater(*MF); |
655 | 0 | LoopFinder LF(*DT, *PDT); |
656 | 0 | SmallVector<MachineInstr *, 4> DeadCopies; |
657 | |
658 | 0 | for (MachineBasicBlock &MBB : *MF) { |
659 | 0 | LF.initialize(MBB); |
660 | |
661 | 0 | for (MachineInstr &MI : MBB) { |
662 | 0 | if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF && |
663 | 0 | MI.getOpcode() != AMDGPU::COPY) |
664 | 0 | continue; |
665 | | |
666 | 0 | Register DstReg = MI.getOperand(0).getReg(); |
667 | 0 | if (!isVreg1(DstReg)) |
668 | 0 | continue; |
669 | | |
670 | 0 | Changed = true; |
671 | |
672 | 0 | if (MRI->use_empty(DstReg)) { |
673 | 0 | DeadCopies.push_back(&MI); |
674 | 0 | continue; |
675 | 0 | } |
676 | | |
677 | 0 | LLVM_DEBUG(dbgs() << "Lower Other: " << MI); |
678 | |
679 | 0 | markAsLaneMask(DstReg); |
680 | 0 | initializeLaneMaskRegisterAttributes(DstReg); |
681 | |
682 | 0 | if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) |
683 | 0 | continue; |
684 | | |
685 | 0 | DebugLoc DL = MI.getDebugLoc(); |
686 | 0 | Register SrcReg = MI.getOperand(1).getReg(); |
687 | 0 | assert(!MI.getOperand(1).getSubReg()); |
688 | | |
689 | 0 | if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) { |
690 | 0 | assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32); |
691 | 0 | Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); |
692 | 0 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg) |
693 | 0 | .addReg(SrcReg) |
694 | 0 | .addImm(0); |
695 | 0 | MI.getOperand(1).setReg(TmpReg); |
696 | 0 | SrcReg = TmpReg; |
697 | 0 | } else { |
698 | | // SrcReg needs to be live beyond copy. |
699 | 0 | MI.getOperand(1).setIsKill(false); |
700 | 0 | } |
701 | | |
702 | | // Defs in a loop that are observed outside the loop must be transformed |
703 | | // into appropriate bit manipulation. |
704 | 0 | std::vector<MachineBasicBlock *> DomBlocks = {&MBB}; |
705 | 0 | for (MachineInstr &Use : MRI->use_instructions(DstReg)) |
706 | 0 | DomBlocks.push_back(Use.getParent()); |
707 | |
708 | 0 | MachineBasicBlock *PostDomBound = |
709 | 0 | PDT->findNearestCommonDominator(DomBlocks); |
710 | 0 | unsigned FoundLoopLevel = LF.findLoop(PostDomBound); |
711 | 0 | if (FoundLoopLevel) { |
712 | 0 | SSAUpdater.Initialize(DstReg); |
713 | 0 | SSAUpdater.AddAvailableValue(&MBB, DstReg); |
714 | 0 | LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs); |
715 | |
716 | 0 | buildMergeLaneMasks(MBB, MI, DL, DstReg, |
717 | 0 | SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg); |
718 | 0 | DeadCopies.push_back(&MI); |
719 | 0 | } |
720 | 0 | } |
721 | |
722 | 0 | for (MachineInstr *MI : DeadCopies) |
723 | 0 | MI->eraseFromParent(); |
724 | 0 | DeadCopies.clear(); |
725 | 0 | } |
726 | 0 | return Changed; |
727 | 0 | } |
728 | | |
729 | 0 | bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const { |
730 | 0 | const MachineInstr *MI; |
731 | 0 | for (;;) { |
732 | 0 | MI = MRI->getUniqueVRegDef(Reg); |
733 | 0 | if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF) |
734 | 0 | return true; |
735 | | |
736 | 0 | if (MI->getOpcode() != AMDGPU::COPY) |
737 | 0 | break; |
738 | | |
739 | 0 | Reg = MI->getOperand(1).getReg(); |
740 | 0 | if (!Reg.isVirtual()) |
741 | 0 | return false; |
742 | 0 | if (!isLaneMaskReg(Reg)) |
743 | 0 | return false; |
744 | 0 | } |
745 | | |
746 | 0 | if (MI->getOpcode() != MovOp) |
747 | 0 | return false; |
748 | | |
749 | 0 | if (!MI->getOperand(1).isImm()) |
750 | 0 | return false; |
751 | | |
752 | 0 | int64_t Imm = MI->getOperand(1).getImm(); |
753 | 0 | if (Imm == 0) { |
754 | 0 | Val = false; |
755 | 0 | return true; |
756 | 0 | } |
757 | 0 | if (Imm == -1) { |
758 | 0 | Val = true; |
759 | 0 | return true; |
760 | 0 | } |
761 | | |
762 | 0 | return false; |
763 | 0 | } |
764 | | |
765 | 0 | static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) { |
766 | 0 | Def = false; |
767 | 0 | Use = false; |
768 | |
769 | 0 | for (const MachineOperand &MO : MI.operands()) { |
770 | 0 | if (MO.isReg() && MO.getReg() == AMDGPU::SCC) { |
771 | 0 | if (MO.isUse()) |
772 | 0 | Use = true; |
773 | 0 | else |
774 | 0 | Def = true; |
775 | 0 | } |
776 | 0 | } |
777 | 0 | } |
778 | | |
779 | | /// Return a point at the end of the given \p MBB to insert SALU instructions |
780 | | /// for lane mask calculation. Take terminators and SCC into account. |
781 | | MachineBasicBlock::iterator |
782 | 0 | PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const { |
783 | 0 | auto InsertionPt = MBB.getFirstTerminator(); |
784 | 0 | bool TerminatorsUseSCC = false; |
785 | 0 | for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) { |
786 | 0 | bool DefsSCC; |
787 | 0 | instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC); |
788 | 0 | if (TerminatorsUseSCC || DefsSCC) |
789 | 0 | break; |
790 | 0 | } |
791 | |
792 | 0 | if (!TerminatorsUseSCC) |
793 | 0 | return InsertionPt; |
794 | | |
795 | 0 | while (InsertionPt != MBB.begin()) { |
796 | 0 | InsertionPt--; |
797 | |
798 | 0 | bool DefSCC, UseSCC; |
799 | 0 | instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC); |
800 | 0 | if (DefSCC) |
801 | 0 | return InsertionPt; |
802 | 0 | } |
803 | | |
804 | | // We should have at least seen an IMPLICIT_DEF or COPY |
805 | 0 | llvm_unreachable("SCC used by terminator but no def in block"); |
806 | 0 | } |
807 | | |
808 | | // VReg_1 -> SReg_32 or SReg_64 |
809 | 0 | void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const { |
810 | 0 | MRI->setRegClass(DstReg, ST->getBoolRC()); |
811 | 0 | } |
812 | | |
813 | | void Vreg1LoweringHelper::getCandidatesForLowering( |
814 | 0 | SmallVectorImpl<MachineInstr *> &Vreg1Phis) const { |
815 | 0 | for (MachineBasicBlock &MBB : *MF) { |
816 | 0 | for (MachineInstr &MI : MBB.phis()) { |
817 | 0 | if (isVreg1(MI.getOperand(0).getReg())) |
818 | 0 | Vreg1Phis.push_back(&MI); |
819 | 0 | } |
820 | 0 | } |
821 | 0 | } |
822 | | |
823 | | void Vreg1LoweringHelper::collectIncomingValuesFromPhi( |
824 | 0 | const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const { |
825 | 0 | for (unsigned i = 1; i < MI->getNumOperands(); i += 2) { |
826 | 0 | assert(i + 1 < MI->getNumOperands()); |
827 | 0 | Register IncomingReg = MI->getOperand(i).getReg(); |
828 | 0 | MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB(); |
829 | 0 | MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg); |
830 | |
831 | 0 | if (IncomingDef->getOpcode() == AMDGPU::COPY) { |
832 | 0 | IncomingReg = IncomingDef->getOperand(1).getReg(); |
833 | 0 | assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg)); |
834 | 0 | assert(!IncomingDef->getOperand(1).getSubReg()); |
835 | 0 | } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) { |
836 | 0 | continue; |
837 | 0 | } else { |
838 | 0 | assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg)); |
839 | 0 | } |
840 | | |
841 | 0 | Incomings.emplace_back(IncomingReg, IncomingMBB, Register()); |
842 | 0 | } |
843 | 0 | } |
844 | | |
845 | | void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg, |
846 | 0 | MachineBasicBlock *MBB) { |
847 | 0 | MRI->replaceRegWith(NewReg, OldReg); |
848 | 0 | } |
849 | | |
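// Informal summary of the merge built below: conceptually it computes
//   DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC)
// i.e. lanes active in EXEC take their bit from CurReg and inactive lanes keep
// their bit from PrevReg. The constant cases simply fold away terms when
// either input is known to be all zeros or all ones.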
850 | | void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB, |
851 | | MachineBasicBlock::iterator I, |
852 | | const DebugLoc &DL, |
853 | | Register DstReg, Register PrevReg, |
854 | 0 | Register CurReg) { |
855 | 0 | bool PrevVal = false; |
856 | 0 | bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal); |
857 | 0 | bool CurVal = false; |
858 | 0 | bool CurConstant = isConstantLaneMask(CurReg, CurVal); |
859 | |
860 | 0 | if (PrevConstant && CurConstant) { |
861 | 0 | if (PrevVal == CurVal) { |
862 | 0 | BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg); |
863 | 0 | } else if (CurVal) { |
864 | 0 | BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg); |
865 | 0 | } else { |
866 | 0 | BuildMI(MBB, I, DL, TII->get(XorOp), DstReg) |
867 | 0 | .addReg(ExecReg) |
868 | 0 | .addImm(-1); |
869 | 0 | } |
870 | 0 | return; |
871 | 0 | } |
872 | | |
873 | 0 | Register PrevMaskedReg; |
874 | 0 | Register CurMaskedReg; |
875 | 0 | if (!PrevConstant) { |
876 | 0 | if (CurConstant && CurVal) { |
877 | 0 | PrevMaskedReg = PrevReg; |
878 | 0 | } else { |
879 | 0 | PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); |
880 | 0 | BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg) |
881 | 0 | .addReg(PrevReg) |
882 | 0 | .addReg(ExecReg); |
883 | 0 | } |
884 | 0 | } |
885 | 0 | if (!CurConstant) { |
886 | | // TODO: check whether CurReg is already masked by EXEC |
887 | 0 | if (PrevConstant && PrevVal) { |
888 | 0 | CurMaskedReg = CurReg; |
889 | 0 | } else { |
890 | 0 | CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); |
891 | 0 | BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg) |
892 | 0 | .addReg(CurReg) |
893 | 0 | .addReg(ExecReg); |
894 | 0 | } |
895 | 0 | } |
896 | |
897 | 0 | if (PrevConstant && !PrevVal) { |
898 | 0 | BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg) |
899 | 0 | .addReg(CurMaskedReg); |
900 | 0 | } else if (CurConstant && !CurVal) { |
901 | 0 | BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg) |
902 | 0 | .addReg(PrevMaskedReg); |
903 | 0 | } else if (PrevConstant && PrevVal) { |
904 | 0 | BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg) |
905 | 0 | .addReg(CurMaskedReg) |
906 | 0 | .addReg(ExecReg); |
907 | 0 | } else { |
908 | 0 | BuildMI(MBB, I, DL, TII->get(OrOp), DstReg) |
909 | 0 | .addReg(PrevMaskedReg) |
910 | 0 | .addReg(CurMaskedReg ? CurMaskedReg : ExecReg); |
911 | 0 | } |
912 | 0 | } |
913 | | |
914 | 0 | void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) { |
915 | 0 | return; |
916 | 0 | } |