/src/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Line | Count | Source
1 | | //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | /// \file |
10 | | /// This contains a MachineSchedStrategy implementation for maximizing wave |
11 | | /// occupancy on GCN hardware. |
12 | | /// |
13 | | /// This pass will apply multiple scheduling stages to the same function. |
14 | | /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual |
15 | | /// entry point for the scheduling of those regions is |
16 | | /// GCNScheduleDAGMILive::runSchedStages. |
17 | | |
18 | | /// Generally, the reason for having multiple scheduling stages is to account |
19 | | /// for the kernel-wide effect of register usage on occupancy. Usually, only a |
20 | | /// few scheduling regions will have register pressure high enough to limit |
21 | | /// occupancy for the kernel, so constraints can be relaxed to improve ILP in |
22 | | /// other regions. |
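 | | ///
 | | /// For the default max-occupancy strategy the stages run in the order
 | | /// OccInitialSchedule, UnclusteredHighRPReschedule,
 | | /// ClusteredLowOccupancyReschedule, PreRARematerialize (see
 | | /// GCNMaxOccupancySchedStrategy below); the max-ILP strategy runs a single
 | | /// ILPInitialSchedule stage.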
23 | | /// |
24 | | //===----------------------------------------------------------------------===// |
25 | | |
26 | | #include "GCNSchedStrategy.h" |
27 | | #include "AMDGPUIGroupLP.h" |
28 | | #include "SIMachineFunctionInfo.h" |
29 | | #include "llvm/CodeGen/RegisterClassInfo.h" |
30 | | |
31 | | #define DEBUG_TYPE "machine-scheduler" |
32 | | |
33 | | using namespace llvm; |
34 | | |
35 | | static cl::opt<bool> DisableUnclusterHighRP( |
36 | | "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, |
37 | | cl::desc("Disable unclustered high register pressure " |
38 | | "reduction scheduling stage."), |
39 | | cl::init(false)); |
40 | | |
41 | | static cl::opt<bool> DisableClusteredLowOccupancy( |
42 | | "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, |
43 | | cl::desc("Disable clustered low occupancy " |
44 | | "rescheduling for ILP scheduling stage."), |
45 | | cl::init(false)); |
46 | | |
47 | | static cl::opt<unsigned> ScheduleMetricBias( |
48 | | "amdgpu-schedule-metric-bias", cl::Hidden, |
49 | | cl::desc( |
50 | | "Sets the bias which adds weight to occupancy vs latency. Set it to " |
51 | | "100 to chase the occupancy only."), |
52 | | cl::init(10)); |
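 | | // The bias is added to the pre-reschedule metric in
 | | // UnclusteredHighRPStage::shouldRevertScheduling(), so larger values make it
 | | // more likely that a reschedule which gains occupancy is kept even when it
 | | // lengthens the modeled critical path.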
53 | | |
54 | | static cl::opt<bool> |
55 | | RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, |
56 | | cl::desc("Relax occupancy targets for kernels which are memory " |
57 | | "bound (amdgpu-membound-threshold), or " |
58 | | "Wave Limited (amdgpu-limit-wave-threshold)."), |
59 | | cl::init(false)); |
60 | | |
61 | | const unsigned ScheduleMetrics::ScaleFactor = 100; |
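 | | // ScaleFactor is used as a fixed-point denominator (100 == 1.0) for both the
 | | // bubble metric and the profit ratio in
 | | // UnclusteredHighRPStage::shouldRevertScheduling().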
62 | | |
63 | | GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) |
64 | | : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), |
65 | 0 | HasHighPressure(false) {} |
66 | | |
67 | 0 | void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { |
68 | 0 | GenericScheduler::initialize(DAG); |
69 | |
70 | 0 | MF = &DAG->MF; |
71 | |
72 | 0 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
73 | |
74 | 0 | SGPRExcessLimit = |
75 | 0 | Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass); |
76 | 0 | VGPRExcessLimit = |
77 | 0 | Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass); |
78 | |
79 | 0 | SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
80 | | // Set the initial TargetOccupancy to the maximum occupancy that we can
81 | | // achieve for this function. This effectively sets a lower bound on the |
82 | | // 'Critical' register limits in the scheduler. |
83 | | // Allow lower occupancy targets if the kernel is wave limited or memory
84 | | // bound and the relaxed occupancy feature is enabled.
85 | 0 | TargetOccupancy = |
86 | 0 | RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy(); |
87 | 0 | SGPRCriticalLimit = |
88 | 0 | std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit); |
89 | |
90 | 0 | if (!KnownExcessRP) { |
91 | 0 | VGPRCriticalLimit = |
92 | 0 | std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit); |
93 | 0 | } else { |
94 | | // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
95 | | // that it returns a reasonably small number for targets with lots of VGPRs,
96 | | // such as GFX10 and GFX11.
97 | 0 | LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative " |
98 | 0 | "VGPRCriticalLimit calculation method.\n"); |
99 | |
100 | 0 | unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST); |
101 | 0 | unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST); |
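 | | // For example, with 256 addressable VGPRs, an allocation granule of 8 and a
 | | // target occupancy of 7 (illustrative numbers; the real values are subtarget
 | | // dependent), the budget below is alignDown(256 / 7, 8) = 32 VGPRs.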
102 | 0 | unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule); |
103 | 0 | VGPRBudget = std::max(VGPRBudget, Granule); |
104 | 0 | VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit); |
105 | 0 | } |
106 | | |
107 | | // Subtract error margin and bias from register limits and avoid overflow. |
108 | 0 | SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit); |
109 | 0 | VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit); |
110 | 0 | SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit); |
111 | 0 | VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit); |
112 | |
113 | 0 | LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit |
114 | 0 | << ", VGPRExcessLimit = " << VGPRExcessLimit |
115 | 0 | << ", SGPRCriticalLimit = " << SGPRCriticalLimit |
116 | 0 | << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n"); |
117 | 0 | } |
118 | | |
119 | | void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, |
120 | | bool AtTop, |
121 | | const RegPressureTracker &RPTracker, |
122 | | const SIRegisterInfo *SRI, |
123 | | unsigned SGPRPressure, |
124 | 0 | unsigned VGPRPressure) { |
125 | 0 | Cand.SU = SU; |
126 | 0 | Cand.AtTop = AtTop; |
127 | |
128 | 0 | if (!DAG->isTrackingPressure()) |
129 | 0 | return; |
130 | | |
131 | | // getDownwardPressure() and getUpwardPressure() make temporary changes to |
132 | | // the tracker, so we need to pass those functions a non-const reference.
133 | 0 | RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); |
134 | |
135 | 0 | Pressure.clear(); |
136 | 0 | MaxPressure.clear(); |
137 | |
138 | 0 | if (AtTop) |
139 | 0 | TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); |
140 | 0 | else { |
141 | | // FIXME: I think for bottom up scheduling, the register pressure is cached |
142 | | // and can be retrieved by DAG->getPressureDiff(SU).
143 | 0 | TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); |
144 | 0 | } |
145 | |
146 | 0 | unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
147 | 0 | unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
148 | | |
149 | | // If two instructions increase the pressure of different register sets |
150 | | // by the same amount, the generic scheduler will prefer to schedule the |
151 | | // instruction that increases the set with the fewest registers,
152 | | // which in our case would be SGPRs. This is rarely what we want, so |
153 | | // when we report excess/critical register pressure, we do it either |
154 | | // only for VGPRs or only for SGPRs. |
155 | | |
156 | | // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. |
157 | 0 | const unsigned MaxVGPRPressureInc = 16; |
158 | 0 | bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; |
159 | 0 | bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; |
160 | | |
161 | | |
162 | | // FIXME: We have to enter REG-EXCESS before we reach the actual threshold |
163 | | // to increase the likelihood we don't go over the limits. We should improve |
164 | | // the analysis to look through dependencies to find the path with the least |
165 | | // register pressure. |
166 | | |
167 | | // We only need to update the RPDelta for instructions that increase register |
168 | | // pressure. Instructions that decrease or keep reg pressure the same will be |
169 | | // marked as RegExcess in tryCandidate() when they are compared with |
170 | | // instructions that increase the register pressure. |
171 | 0 | if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { |
172 | 0 | HasHighPressure = true; |
173 | 0 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
174 | 0 | Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); |
175 | 0 | } |
176 | |
177 | 0 | if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { |
178 | 0 | HasHighPressure = true; |
179 | 0 | Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
180 | 0 | Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); |
181 | 0 | } |
182 | | |
183 | | // Register pressure is considered 'CRITICAL' if it is approaching a value |
184 | | // that would reduce the wave occupancy for the execution unit. When |
185 | | // register pressure is 'CRITICAL', increases in SGPR and VGPR pressure
186 | | // have the same cost, so we don't need to prefer one over the other.
187 | |
188 | 0 | int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; |
189 | 0 | int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; |
190 | |
191 | 0 | if (SGPRDelta >= 0 || VGPRDelta >= 0) { |
192 | 0 | HasHighPressure = true; |
193 | 0 | if (SGPRDelta > VGPRDelta) { |
194 | 0 | Cand.RPDelta.CriticalMax = |
195 | 0 | PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
196 | 0 | Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); |
197 | 0 | } else { |
198 | 0 | Cand.RPDelta.CriticalMax = |
199 | 0 | PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
200 | 0 | Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); |
201 | 0 | } |
202 | 0 | } |
203 | 0 | } |
204 | | |
205 | | // This function is mostly cut and pasted from |
206 | | // GenericScheduler::pickNodeFromQueue() |
207 | | void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, |
208 | | const CandPolicy &ZonePolicy, |
209 | | const RegPressureTracker &RPTracker, |
210 | 0 | SchedCandidate &Cand) { |
211 | 0 | const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); |
212 | 0 | ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); |
213 | 0 | unsigned SGPRPressure = 0; |
214 | 0 | unsigned VGPRPressure = 0; |
215 | 0 | if (DAG->isTrackingPressure()) { |
216 | 0 | SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
217 | 0 | VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
218 | 0 | } |
219 | 0 | ReadyQueue &Q = Zone.Available; |
220 | 0 | for (SUnit *SU : Q) { |
221 | |
222 | 0 | SchedCandidate TryCand(ZonePolicy); |
223 | 0 | initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, |
224 | 0 | SGPRPressure, VGPRPressure); |
225 | | // Pass SchedBoundary only when comparing nodes from the same boundary. |
226 | 0 | SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; |
227 | 0 | tryCandidate(Cand, TryCand, ZoneArg); |
228 | 0 | if (TryCand.Reason != NoCand) { |
229 | | // Initialize resource delta if needed in case future heuristics query it. |
230 | 0 | if (TryCand.ResDelta == SchedResourceDelta()) |
231 | 0 | TryCand.initResourceDelta(Zone.DAG, SchedModel); |
232 | 0 | Cand.setBest(TryCand); |
233 | 0 | LLVM_DEBUG(traceCandidate(Cand)); |
234 | 0 | } |
235 | 0 | } |
236 | 0 | } |
237 | | |
238 | | // This function is mostly cut and pasted from |
239 | | // GenericScheduler::pickNodeBidirectional() |
240 | 0 | SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) { |
241 | | // Schedule as far as possible in the direction of no choice. This is most |
242 | | // efficient, but also provides the best heuristics for CriticalPSets. |
243 | 0 | if (SUnit *SU = Bot.pickOnlyChoice()) { |
244 | 0 | IsTopNode = false; |
245 | 0 | return SU; |
246 | 0 | } |
247 | 0 | if (SUnit *SU = Top.pickOnlyChoice()) { |
248 | 0 | IsTopNode = true; |
249 | 0 | return SU; |
250 | 0 | } |
251 | | // Set the bottom-up policy based on the state of the current bottom zone and |
252 | | // the instructions outside the zone, including the top zone. |
253 | 0 | CandPolicy BotPolicy; |
254 | 0 | setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); |
255 | | // Set the top-down policy based on the state of the current top zone and |
256 | | // the instructions outside the zone, including the bottom zone. |
257 | 0 | CandPolicy TopPolicy; |
258 | 0 | setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); |
259 | | |
260 | | // See if BotCand is still valid (because we previously scheduled from Top). |
261 | 0 | LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); |
262 | 0 | if (!BotCand.isValid() || BotCand.SU->isScheduled || |
263 | 0 | BotCand.Policy != BotPolicy) { |
264 | 0 | BotCand.reset(CandPolicy()); |
265 | 0 | pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); |
266 | 0 | assert(BotCand.Reason != NoCand && "failed to find the first candidate"); |
267 | 0 | } else { |
268 | 0 | LLVM_DEBUG(traceCandidate(BotCand)); |
269 | 0 | #ifndef NDEBUG |
270 | 0 | if (VerifyScheduling) { |
271 | 0 | SchedCandidate TCand; |
272 | 0 | TCand.reset(CandPolicy()); |
273 | 0 | pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand); |
274 | 0 | assert(TCand.SU == BotCand.SU && |
275 | 0 | "Last pick result should correspond to re-picking right now"); |
276 | 0 | } |
277 | 0 | #endif |
278 | 0 | } |
279 | | |
280 | | // Check if the top Q has a better candidate. |
281 | 0 | LLVM_DEBUG(dbgs() << "Picking from Top:\n"); |
282 | 0 | if (!TopCand.isValid() || TopCand.SU->isScheduled || |
283 | 0 | TopCand.Policy != TopPolicy) { |
284 | 0 | TopCand.reset(CandPolicy()); |
285 | 0 | pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); |
286 | 0 | assert(TopCand.Reason != NoCand && "failed to find the first candidate"); |
287 | 0 | } else { |
288 | 0 | LLVM_DEBUG(traceCandidate(TopCand)); |
289 | 0 | #ifndef NDEBUG |
290 | 0 | if (VerifyScheduling) { |
291 | 0 | SchedCandidate TCand; |
292 | 0 | TCand.reset(CandPolicy()); |
293 | 0 | pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand); |
294 | 0 | assert(TCand.SU == TopCand.SU && |
295 | 0 | "Last pick result should correspond to re-picking right now"); |
296 | 0 | } |
297 | 0 | #endif |
298 | 0 | } |
299 | | |
300 | | // Pick best from BotCand and TopCand. |
301 | 0 | LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); |
302 | 0 | dbgs() << "Bot Cand: "; traceCandidate(BotCand);); |
303 | 0 | SchedCandidate Cand = BotCand; |
304 | 0 | TopCand.Reason = NoCand; |
305 | 0 | tryCandidate(Cand, TopCand, nullptr); |
306 | 0 | if (TopCand.Reason != NoCand) { |
307 | 0 | Cand.setBest(TopCand); |
308 | 0 | } |
309 | 0 | LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); |
310 | |
311 | 0 | IsTopNode = Cand.AtTop; |
312 | 0 | return Cand.SU; |
313 | 0 | } |
314 | | |
315 | | // This function is mostly cut and pasted from |
316 | | // GenericScheduler::pickNode() |
317 | 0 | SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { |
318 | 0 | if (DAG->top() == DAG->bottom()) { |
319 | 0 | assert(Top.Available.empty() && Top.Pending.empty() && |
320 | 0 | Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); |
321 | 0 | return nullptr; |
322 | 0 | } |
323 | 0 | SUnit *SU; |
324 | 0 | do { |
325 | 0 | if (RegionPolicy.OnlyTopDown) { |
326 | 0 | SU = Top.pickOnlyChoice(); |
327 | 0 | if (!SU) { |
328 | 0 | CandPolicy NoPolicy; |
329 | 0 | TopCand.reset(NoPolicy); |
330 | 0 | pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); |
331 | 0 | assert(TopCand.Reason != NoCand && "failed to find a candidate"); |
332 | 0 | SU = TopCand.SU; |
333 | 0 | } |
334 | 0 | IsTopNode = true; |
335 | 0 | } else if (RegionPolicy.OnlyBottomUp) { |
336 | 0 | SU = Bot.pickOnlyChoice(); |
337 | 0 | if (!SU) { |
338 | 0 | CandPolicy NoPolicy; |
339 | 0 | BotCand.reset(NoPolicy); |
340 | 0 | pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); |
341 | 0 | assert(BotCand.Reason != NoCand && "failed to find a candidate"); |
342 | 0 | SU = BotCand.SU; |
343 | 0 | } |
344 | 0 | IsTopNode = false; |
345 | 0 | } else { |
346 | 0 | SU = pickNodeBidirectional(IsTopNode); |
347 | 0 | } |
348 | 0 | } while (SU->isScheduled); |
349 | | |
350 | 0 | if (SU->isTopReady()) |
351 | 0 | Top.removeReady(SU); |
352 | 0 | if (SU->isBottomReady()) |
353 | 0 | Bot.removeReady(SU); |
354 | |
355 | 0 | LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
356 | 0 | << *SU->getInstr()); |
357 | 0 | return SU; |
358 | 0 | } |
359 | | |
360 | 0 | GCNSchedStageID GCNSchedStrategy::getCurrentStage() { |
361 | 0 | assert(CurrentStage && CurrentStage != SchedStages.end()); |
362 | 0 | return *CurrentStage; |
363 | 0 | } |
364 | | |
365 | 0 | bool GCNSchedStrategy::advanceStage() { |
366 | 0 | assert(CurrentStage != SchedStages.end()); |
367 | 0 | if (!CurrentStage) |
368 | 0 | CurrentStage = SchedStages.begin(); |
369 | 0 | else |
370 | 0 | CurrentStage++; |
371 | |
372 | 0 | return CurrentStage != SchedStages.end(); |
373 | 0 | } |
374 | | |
375 | 0 | bool GCNSchedStrategy::hasNextStage() const { |
376 | 0 | assert(CurrentStage); |
377 | 0 | return std::next(CurrentStage) != SchedStages.end(); |
378 | 0 | } |
379 | | |
380 | 0 | GCNSchedStageID GCNSchedStrategy::getNextStage() const { |
381 | 0 | assert(CurrentStage && std::next(CurrentStage) != SchedStages.end()); |
382 | 0 | return *std::next(CurrentStage); |
383 | 0 | } |
384 | | |
385 | | GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( |
386 | | const MachineSchedContext *C) |
387 | 0 | : GCNSchedStrategy(C) { |
388 | 0 | SchedStages.push_back(GCNSchedStageID::OccInitialSchedule); |
389 | 0 | SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule); |
390 | 0 | SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule); |
391 | 0 | SchedStages.push_back(GCNSchedStageID::PreRARematerialize); |
392 | 0 | } |
393 | | |
394 | | GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) |
395 | 0 | : GCNSchedStrategy(C) { |
396 | 0 | SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule); |
397 | 0 | } |
398 | | |
399 | | bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, |
400 | | SchedCandidate &TryCand, |
401 | 0 | SchedBoundary *Zone) const { |
402 | | // Initialize the candidate if needed. |
403 | 0 | if (!Cand.isValid()) { |
404 | 0 | TryCand.Reason = NodeOrder; |
405 | 0 | return true; |
406 | 0 | } |
407 | | |
408 | | // Avoid spilling by exceeding the register limit. |
409 | 0 | if (DAG->isTrackingPressure() && |
410 | 0 | tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, |
411 | 0 | RegExcess, TRI, DAG->MF)) |
412 | 0 | return TryCand.Reason != NoCand; |
413 | | |
414 | | // Bias PhysReg Defs and copies to their uses and definitions, respectively.
415 | 0 | if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), |
416 | 0 | biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) |
417 | 0 | return TryCand.Reason != NoCand; |
418 | | |
419 | 0 | bool SameBoundary = Zone != nullptr; |
420 | 0 | if (SameBoundary) { |
421 | | // Prioritize instructions that read unbuffered resources by stall cycles. |
422 | 0 | if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), |
423 | 0 | Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) |
424 | 0 | return TryCand.Reason != NoCand; |
425 | | |
426 | | // Avoid critical resource consumption and balance the schedule. |
427 | 0 | TryCand.initResourceDelta(DAG, SchedModel); |
428 | 0 | if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, |
429 | 0 | TryCand, Cand, ResourceReduce)) |
430 | 0 | return TryCand.Reason != NoCand; |
431 | 0 | if (tryGreater(TryCand.ResDelta.DemandedResources, |
432 | 0 | Cand.ResDelta.DemandedResources, TryCand, Cand, |
433 | 0 | ResourceDemand)) |
434 | 0 | return TryCand.Reason != NoCand; |
435 | | |
436 | | // Unconditionally try to reduce latency. |
437 | 0 | if (tryLatency(TryCand, Cand, *Zone)) |
438 | 0 | return TryCand.Reason != NoCand; |
439 | | |
440 | | // Weak edges are for clustering and other constraints. |
441 | 0 | if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), |
442 | 0 | getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak)) |
443 | 0 | return TryCand.Reason != NoCand; |
444 | 0 | } |
445 | | |
446 | | // Keep clustered nodes together to encourage downstream peephole |
447 | | // optimizations which may reduce resource requirements. |
448 | | // |
449 | | // This is a best effort to set things up for a post-RA pass. Optimizations |
450 | | // like generating loads of multiple registers should ideally be done within |
451 | | // the scheduler pass by combining the loads during DAG postprocessing. |
452 | 0 | const SUnit *CandNextClusterSU = |
453 | 0 | Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); |
454 | 0 | const SUnit *TryCandNextClusterSU = |
455 | 0 | TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); |
456 | 0 | if (tryGreater(TryCand.SU == TryCandNextClusterSU, |
457 | 0 | Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) |
458 | 0 | return TryCand.Reason != NoCand; |
459 | | |
460 | | // Avoid increasing the max critical pressure in the scheduled region. |
461 | 0 | if (DAG->isTrackingPressure() && |
462 | 0 | tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, |
463 | 0 | TryCand, Cand, RegCritical, TRI, DAG->MF)) |
464 | 0 | return TryCand.Reason != NoCand; |
465 | | |
466 | | // Avoid increasing the max pressure of the entire region. |
467 | 0 | if (DAG->isTrackingPressure() && |
468 | 0 | tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand, |
469 | 0 | Cand, RegMax, TRI, DAG->MF)) |
470 | 0 | return TryCand.Reason != NoCand; |
471 | | |
472 | 0 | if (SameBoundary) { |
473 | | // Fall through to original instruction order. |
474 | 0 | if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || |
475 | 0 | (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { |
476 | 0 | TryCand.Reason = NodeOrder; |
477 | 0 | return true; |
478 | 0 | } |
479 | 0 | } |
480 | 0 | return false; |
481 | 0 | } |
482 | | |
483 | | GCNScheduleDAGMILive::GCNScheduleDAGMILive( |
484 | | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) |
485 | | : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), |
486 | | MFI(*MF.getInfo<SIMachineFunctionInfo>()), |
487 | 0 | StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) { |
488 | |
489 | 0 | LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); |
490 | 0 | if (RelaxedOcc) { |
491 | 0 | MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy); |
492 | 0 | if (MinOccupancy != StartingOccupancy) |
493 | 0 | LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy |
494 | 0 | << ".\n"); |
495 | 0 | } |
496 | 0 | } |
497 | | |
498 | | std::unique_ptr<GCNSchedStage> |
499 | 0 | GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { |
500 | 0 | switch (SchedStageID) { |
501 | 0 | case GCNSchedStageID::OccInitialSchedule: |
502 | 0 | return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this); |
503 | 0 | case GCNSchedStageID::UnclusteredHighRPReschedule: |
504 | 0 | return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this); |
505 | 0 | case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
506 | 0 | return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this); |
507 | 0 | case GCNSchedStageID::PreRARematerialize: |
508 | 0 | return std::make_unique<PreRARematStage>(SchedStageID, *this); |
509 | 0 | case GCNSchedStageID::ILPInitialSchedule: |
510 | 0 | return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this); |
511 | 0 | } |
512 | | |
513 | 0 | llvm_unreachable("Unknown SchedStageID."); |
514 | 0 | } |
515 | | |
516 | 0 | void GCNScheduleDAGMILive::schedule() { |
517 | | // Collect all scheduling regions. The actual scheduling is performed in |
518 | | // GCNScheduleDAGMILive::finalizeSchedule. |
519 | 0 | Regions.push_back(std::pair(RegionBegin, RegionEnd)); |
520 | 0 | } |
521 | | |
522 | | GCNRegPressure |
523 | 0 | GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { |
524 | 0 | GCNDownwardRPTracker RPTracker(*LIS); |
525 | 0 | RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); |
526 | 0 | return RPTracker.moveMaxPressure(); |
527 | 0 | } |
528 | | |
529 | | void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, |
530 | 0 | const MachineBasicBlock *MBB) { |
531 | 0 | GCNDownwardRPTracker RPTracker(*LIS); |
532 | | |
533 | | // If the block has only one successor then the live-ins of that successor
534 | | // are the live-outs of the current block. We can reuse the calculated live
535 | | // set if the successor will be sent to scheduling past the current block.
536 | | |
537 | | // However, due to a bug in LiveInterval analysis it may happen that two
538 | | // predecessors of the same successor block have different lane bitmasks for
539 | | // a live-out register. Work around that by sticking to a one-to-one
540 | | // relationship, i.e. one predecessor with one successor block.
541 | 0 | const MachineBasicBlock *OnlySucc = nullptr; |
542 | 0 | if (MBB->succ_size() == 1) { |
543 | 0 | auto *Candidate = *MBB->succ_begin(); |
544 | 0 | if (!Candidate->empty() && Candidate->pred_size() == 1) { |
545 | 0 | SlotIndexes *Ind = LIS->getSlotIndexes(); |
546 | 0 | if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate)) |
547 | 0 | OnlySucc = Candidate; |
548 | 0 | } |
549 | 0 | } |
550 | | |
551 | | // Scheduler sends regions from the end of the block upwards. |
552 | 0 | size_t CurRegion = RegionIdx; |
553 | 0 | for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) |
554 | 0 | if (Regions[CurRegion].first->getParent() != MBB) |
555 | 0 | break; |
556 | 0 | --CurRegion; |
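 | | // CurRegion now indexes the last region recorded for this MBB, i.e. the
 | | // topmost region in the block, since regions are recorded bottom-up within a
 | | // block. The tracker below walks the block top-down, recording live-ins and
 | | // max pressure for each region and decrementing CurRegion until RegionIdx
 | | // has been processed.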
557 | |
558 | 0 | auto I = MBB->begin(); |
559 | 0 | auto LiveInIt = MBBLiveIns.find(MBB); |
560 | 0 | auto &Rgn = Regions[CurRegion]; |
561 | 0 | auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); |
562 | 0 | if (LiveInIt != MBBLiveIns.end()) { |
563 | 0 | auto LiveIn = std::move(LiveInIt->second); |
564 | 0 | RPTracker.reset(*MBB->begin(), &LiveIn); |
565 | 0 | MBBLiveIns.erase(LiveInIt); |
566 | 0 | } else { |
567 | 0 | I = Rgn.first; |
568 | 0 | auto LRS = BBLiveInMap.lookup(NonDbgMI); |
569 | | #ifdef EXPENSIVE_CHECKS |
570 | | assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); |
571 | | #endif |
572 | 0 | RPTracker.reset(*I, &LRS); |
573 | 0 | } |
574 | |
575 | 0 | for (;;) { |
576 | 0 | I = RPTracker.getNext(); |
577 | |
578 | 0 | if (Regions[CurRegion].first == I || NonDbgMI == I) { |
579 | 0 | LiveIns[CurRegion] = RPTracker.getLiveRegs(); |
580 | 0 | RPTracker.clearMaxPressure(); |
581 | 0 | } |
582 | |
583 | 0 | if (Regions[CurRegion].second == I) { |
584 | 0 | Pressure[CurRegion] = RPTracker.moveMaxPressure(); |
585 | 0 | if (CurRegion-- == RegionIdx) |
586 | 0 | break; |
587 | 0 | } |
588 | 0 | RPTracker.advanceToNext(); |
589 | 0 | RPTracker.advanceBeforeNext(); |
590 | 0 | } |
591 | |
592 | 0 | if (OnlySucc) { |
593 | 0 | if (I != MBB->end()) { |
594 | 0 | RPTracker.advanceToNext(); |
595 | 0 | RPTracker.advance(MBB->end()); |
596 | 0 | } |
597 | 0 | RPTracker.advanceBeforeNext(); |
598 | 0 | MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); |
599 | 0 | } |
600 | 0 | } |
601 | | |
602 | | DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
603 | 0 | GCNScheduleDAGMILive::getBBLiveInMap() const { |
604 | 0 | assert(!Regions.empty()); |
605 | 0 | std::vector<MachineInstr *> BBStarters; |
606 | 0 | BBStarters.reserve(Regions.size()); |
607 | 0 | auto I = Regions.rbegin(), E = Regions.rend(); |
608 | 0 | auto *BB = I->first->getParent(); |
609 | 0 | do { |
610 | 0 | auto *MI = &*skipDebugInstructionsForward(I->first, I->second); |
611 | 0 | BBStarters.push_back(MI); |
612 | 0 | do { |
613 | 0 | ++I; |
614 | 0 | } while (I != E && I->first->getParent() == BB); |
615 | 0 | } while (I != E); |
616 | 0 | return getLiveRegMap(BBStarters, false /*After*/, *LIS); |
617 | 0 | } |
618 | | |
619 | 0 | void GCNScheduleDAGMILive::finalizeSchedule() { |
620 | | // Start actual scheduling here. This function is called by the base |
621 | | // MachineScheduler after all regions have been recorded by |
622 | | // GCNScheduleDAGMILive::schedule(). |
623 | 0 | LiveIns.resize(Regions.size()); |
624 | 0 | Pressure.resize(Regions.size()); |
625 | 0 | RescheduleRegions.resize(Regions.size()); |
626 | 0 | RegionsWithHighRP.resize(Regions.size()); |
627 | 0 | RegionsWithExcessRP.resize(Regions.size()); |
628 | 0 | RegionsWithMinOcc.resize(Regions.size()); |
629 | 0 | RegionsWithIGLPInstrs.resize(Regions.size()); |
630 | 0 | RescheduleRegions.set(); |
631 | 0 | RegionsWithHighRP.reset(); |
632 | 0 | RegionsWithExcessRP.reset(); |
633 | 0 | RegionsWithMinOcc.reset(); |
634 | 0 | RegionsWithIGLPInstrs.reset(); |
635 | |
636 | 0 | runSchedStages(); |
637 | 0 | } |
638 | | |
639 | 0 | void GCNScheduleDAGMILive::runSchedStages() { |
640 | 0 | LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); |
641 | |
642 | 0 | if (!Regions.empty()) |
643 | 0 | BBLiveInMap = getBBLiveInMap(); |
644 | |
645 | 0 | GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); |
646 | 0 | while (S.advanceStage()) { |
647 | 0 | auto Stage = createSchedStage(S.getCurrentStage()); |
648 | 0 | if (!Stage->initGCNSchedStage()) |
649 | 0 | continue; |
650 | | |
651 | 0 | for (auto Region : Regions) { |
652 | 0 | RegionBegin = Region.first; |
653 | 0 | RegionEnd = Region.second; |
654 | | // Setup for scheduling the region and check whether it should be skipped. |
655 | 0 | if (!Stage->initGCNRegion()) { |
656 | 0 | Stage->advanceRegion(); |
657 | 0 | exitRegion(); |
658 | 0 | continue; |
659 | 0 | } |
660 | | |
661 | 0 | ScheduleDAGMILive::schedule(); |
662 | 0 | Stage->finalizeGCNRegion(); |
663 | 0 | } |
664 | |
665 | 0 | Stage->finalizeGCNSchedStage(); |
666 | 0 | } |
667 | 0 | } |
668 | | |
669 | | #ifndef NDEBUG |
670 | 0 | raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { |
671 | 0 | switch (StageID) { |
672 | 0 | case GCNSchedStageID::OccInitialSchedule: |
673 | 0 | OS << "Max Occupancy Initial Schedule"; |
674 | 0 | break; |
675 | 0 | case GCNSchedStageID::UnclusteredHighRPReschedule: |
676 | 0 | OS << "Unclustered High Register Pressure Reschedule"; |
677 | 0 | break; |
678 | 0 | case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
679 | 0 | OS << "Clustered Low Occupancy Reschedule"; |
680 | 0 | break; |
681 | 0 | case GCNSchedStageID::PreRARematerialize: |
682 | 0 | OS << "Pre-RA Rematerialize"; |
683 | 0 | break; |
684 | 0 | case GCNSchedStageID::ILPInitialSchedule: |
685 | 0 | OS << "Max ILP Initial Schedule"; |
686 | 0 | break; |
687 | 0 | } |
688 | | |
689 | 0 | return OS; |
690 | 0 | } |
691 | | #endif |
692 | | |
693 | | GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) |
694 | | : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF), |
695 | 0 | MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} |
696 | | |
697 | 0 | bool GCNSchedStage::initGCNSchedStage() { |
698 | 0 | if (!DAG.LIS) |
699 | 0 | return false; |
700 | | |
701 | 0 | LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n"); |
702 | 0 | return true; |
703 | 0 | } |
704 | | |
705 | 0 | bool UnclusteredHighRPStage::initGCNSchedStage() { |
706 | 0 | if (DisableUnclusterHighRP) |
707 | 0 | return false; |
708 | | |
709 | 0 | if (!GCNSchedStage::initGCNSchedStage()) |
710 | 0 | return false; |
711 | | |
712 | 0 | if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) |
713 | 0 | return false; |
714 | | |
715 | 0 | SavedMutations.swap(DAG.Mutations); |
716 | 0 | DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false)); |
717 | |
718 | 0 | InitialOccupancy = DAG.MinOccupancy; |
719 | | // Aggressively try to reduce register pressure in the unclustered high RP
720 | | // stage. Temporarily increase the occupancy target in the region.
721 | 0 | S.SGPRLimitBias = S.HighRPSGPRBias; |
722 | 0 | S.VGPRLimitBias = S.HighRPVGPRBias; |
723 | 0 | if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) |
724 | 0 | MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); |
725 | |
726 | 0 | LLVM_DEBUG( |
727 | 0 | dbgs() |
728 | 0 | << "Retrying function scheduling without clustering. " |
729 | 0 | "Aggressively try to reduce register pressure to achieve occupancy "
730 | 0 | << DAG.MinOccupancy << ".\n"); |
731 | |
732 | 0 | return true; |
733 | 0 | } |
734 | | |
735 | 0 | bool ClusteredLowOccStage::initGCNSchedStage() { |
736 | 0 | if (DisableClusteredLowOccupancy) |
737 | 0 | return false; |
738 | | |
739 | 0 | if (!GCNSchedStage::initGCNSchedStage()) |
740 | 0 | return false; |
741 | | |
742 | | // Don't bother trying to improve ILP in lower RP regions if occupancy has not |
743 | | // been dropped. All regions will have already been scheduled with the ideal |
744 | | // occupancy targets. |
745 | 0 | if (DAG.StartingOccupancy <= DAG.MinOccupancy) |
746 | 0 | return false; |
747 | | |
748 | 0 | LLVM_DEBUG( |
749 | 0 | dbgs() << "Retrying function scheduling with lowest recorded occupancy " |
750 | 0 | << DAG.MinOccupancy << ".\n"); |
751 | 0 | return true; |
752 | 0 | } |
753 | | |
754 | 0 | bool PreRARematStage::initGCNSchedStage() { |
755 | 0 | if (!GCNSchedStage::initGCNSchedStage()) |
756 | 0 | return false; |
757 | | |
758 | 0 | if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1) |
759 | 0 | return false; |
760 | | |
761 | 0 | const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); |
762 | | // Check maximum occupancy |
763 | 0 | if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) == |
764 | 0 | DAG.MinOccupancy) |
765 | 0 | return false; |
766 | | |
767 | | // FIXME: This pass will invalidate cached MBBLiveIns for regions |
768 | | // in between the defs and the region we sunk the def to. Cached pressure
769 | | // for regions where a def is sunk from will also be invalidated. Will
770 | | // need to be fixed if there is another pass after this pass. |
771 | 0 | assert(!S.hasNextStage()); |
772 | | |
773 | 0 | collectRematerializableInstructions(); |
774 | 0 | if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII)) |
775 | 0 | return false; |
776 | | |
777 | 0 | LLVM_DEBUG( |
778 | 0 | dbgs() << "Retrying function scheduling with improved occupancy of " |
779 | 0 | << DAG.MinOccupancy << " from rematerializing\n"); |
780 | 0 | return true; |
781 | 0 | } |
782 | | |
783 | 0 | void GCNSchedStage::finalizeGCNSchedStage() { |
784 | 0 | DAG.finishBlock(); |
785 | 0 | LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n"); |
786 | 0 | } |
787 | | |
788 | 0 | void UnclusteredHighRPStage::finalizeGCNSchedStage() { |
789 | 0 | SavedMutations.swap(DAG.Mutations); |
790 | 0 | S.SGPRLimitBias = S.VGPRLimitBias = 0; |
791 | 0 | if (DAG.MinOccupancy > InitialOccupancy) { |
792 | 0 | for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) |
793 | 0 | DAG.RegionsWithMinOcc[IDX] = |
794 | 0 | DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy; |
795 | |
796 | 0 | LLVM_DEBUG(dbgs() << StageID |
797 | 0 | << " stage successfully increased occupancy to " |
798 | 0 | << DAG.MinOccupancy << '\n'); |
799 | 0 | } |
800 | |
801 | 0 | GCNSchedStage::finalizeGCNSchedStage(); |
802 | 0 | } |
803 | | |
804 | 0 | bool GCNSchedStage::initGCNRegion() { |
805 | | // Check whether this new region is also a new block. |
806 | 0 | if (DAG.RegionBegin->getParent() != CurrentMBB) |
807 | 0 | setupNewBlock(); |
808 | |
809 | 0 | unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end()); |
810 | 0 | DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs); |
811 | | |
812 | | // Skip empty scheduling regions (0 or 1 schedulable instructions). |
813 | 0 | if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end())) |
814 | 0 | return false; |
815 | | |
816 | 0 | LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); |
817 | 0 | LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) |
818 | 0 | << " " << CurrentMBB->getName() |
819 | 0 | << "\n From: " << *DAG.begin() << " To: "; |
820 | 0 | if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; |
821 | 0 | else dbgs() << "End"; |
822 | 0 | dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); |
823 | | |
824 | | // Save original instruction order before scheduling for possible revert. |
825 | 0 | Unsched.clear(); |
826 | 0 | Unsched.reserve(DAG.NumRegionInstrs); |
827 | 0 | if (StageID == GCNSchedStageID::OccInitialSchedule || |
828 | 0 | StageID == GCNSchedStageID::ILPInitialSchedule) { |
829 | 0 | for (auto &I : DAG) { |
830 | 0 | Unsched.push_back(&I); |
831 | 0 | if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || |
832 | 0 | I.getOpcode() == AMDGPU::IGLP_OPT) |
833 | 0 | DAG.RegionsWithIGLPInstrs[RegionIdx] = true; |
834 | 0 | } |
835 | 0 | } else { |
836 | 0 | for (auto &I : DAG) |
837 | 0 | Unsched.push_back(&I); |
838 | 0 | } |
839 | |
840 | 0 | PressureBefore = DAG.Pressure[RegionIdx]; |
841 | |
842 | 0 | LLVM_DEBUG( |
843 | 0 | dbgs() << "Pressure before scheduling:\nRegion live-ins:" |
844 | 0 | << print(DAG.LiveIns[RegionIdx], DAG.MRI) |
845 | 0 | << "Region live-in pressure: " |
846 | 0 | << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) |
847 | 0 | << "Region register pressure: " << print(PressureBefore)); |
848 | |
849 | 0 | S.HasHighPressure = false; |
850 | 0 | S.KnownExcessRP = isRegionWithExcessRP(); |
851 | |
852 | 0 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
853 | 0 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { |
854 | 0 | SavedMutations.clear(); |
855 | 0 | SavedMutations.swap(DAG.Mutations); |
856 | 0 | bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || |
857 | 0 | StageID == GCNSchedStageID::ILPInitialSchedule; |
858 | 0 | DAG.addMutation(createIGroupLPDAGMutation(/*IsReentry=*/!IsInitialStage)); |
859 | 0 | } |
860 | |
861 | 0 | return true; |
862 | 0 | } |
863 | | |
864 | 0 | bool UnclusteredHighRPStage::initGCNRegion() { |
865 | | // Only reschedule regions with the minimum occupancy or regions that may have |
866 | | // spilling (excess register pressure). |
867 | 0 | if ((!DAG.RegionsWithMinOcc[RegionIdx] || |
868 | 0 | DAG.MinOccupancy <= InitialOccupancy) && |
869 | 0 | !DAG.RegionsWithExcessRP[RegionIdx]) |
870 | 0 | return false; |
871 | | |
872 | 0 | return GCNSchedStage::initGCNRegion(); |
873 | 0 | } |
874 | | |
875 | 0 | bool ClusteredLowOccStage::initGCNRegion() { |
876 | | // We may need to reschedule this region if it wasn't rescheduled in the last |
877 | | // stage, or if we found it was testing critical register pressure limits in |
878 | | // the unclustered reschedule stage. The latter is because we may not have been
879 | | // able to raise the min occupancy in the previous stage so the region may be |
880 | | // overly constrained even if it was already rescheduled. |
881 | 0 | if (!DAG.RegionsWithHighRP[RegionIdx]) |
882 | 0 | return false; |
883 | | |
884 | 0 | return GCNSchedStage::initGCNRegion(); |
885 | 0 | } |
886 | | |
887 | 0 | bool PreRARematStage::initGCNRegion() { |
888 | 0 | if (!DAG.RescheduleRegions[RegionIdx]) |
889 | 0 | return false; |
890 | | |
891 | 0 | return GCNSchedStage::initGCNRegion(); |
892 | 0 | } |
893 | | |
894 | 0 | void GCNSchedStage::setupNewBlock() { |
895 | 0 | if (CurrentMBB) |
896 | 0 | DAG.finishBlock(); |
897 | |
898 | 0 | CurrentMBB = DAG.RegionBegin->getParent(); |
899 | 0 | DAG.startBlock(CurrentMBB); |
900 | | // Get real RP for the region if it hasn't been calculated before. After the
901 | | // initial schedule stage real RP will be collected after scheduling. |
902 | 0 | if (StageID == GCNSchedStageID::OccInitialSchedule || |
903 | 0 | StageID == GCNSchedStageID::ILPInitialSchedule) |
904 | 0 | DAG.computeBlockPressure(RegionIdx, CurrentMBB); |
905 | 0 | } |
906 | | |
907 | 0 | void GCNSchedStage::finalizeGCNRegion() { |
908 | 0 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
909 | 0 | DAG.RescheduleRegions[RegionIdx] = false; |
910 | 0 | if (S.HasHighPressure) |
911 | 0 | DAG.RegionsWithHighRP[RegionIdx] = true; |
912 | | |
913 | | // Revert scheduling if we have dropped occupancy or there is some other |
914 | | // reason that the original schedule is better. |
915 | 0 | checkScheduling(); |
916 | |
917 | 0 | if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
918 | 0 | StageID != GCNSchedStageID::UnclusteredHighRPReschedule) |
919 | 0 | SavedMutations.swap(DAG.Mutations); |
920 | |
921 | 0 | DAG.exitRegion(); |
922 | 0 | RegionIdx++; |
923 | 0 | } |
924 | | |
925 | 0 | void GCNSchedStage::checkScheduling() { |
926 | | // Check the results of scheduling. |
927 | 0 | PressureAfter = DAG.getRealRegPressure(RegionIdx); |
928 | 0 | LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); |
929 | 0 | LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); |
930 | |
|
931 | 0 | if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && |
932 | 0 | PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { |
933 | 0 | DAG.Pressure[RegionIdx] = PressureAfter; |
934 | 0 | DAG.RegionsWithMinOcc[RegionIdx] = |
935 | 0 | PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; |
936 | | |
937 | | // Early out if we have achieved the occupancy target.
938 | 0 | LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); |
939 | 0 | return; |
940 | 0 | } |
941 | | |
942 | 0 | unsigned TargetOccupancy = |
943 | 0 | std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF)); |
944 | 0 | unsigned WavesAfter = |
945 | 0 | std::min(TargetOccupancy, PressureAfter.getOccupancy(ST)); |
946 | 0 | unsigned WavesBefore = |
947 | 0 | std::min(TargetOccupancy, PressureBefore.getOccupancy(ST)); |
948 | 0 | LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore |
949 | 0 | << ", after " << WavesAfter << ".\n"); |
950 | | |
951 | | // We may not be able to keep the current target occupancy because of the just |
952 | | // scheduled region. We might still be able to revert scheduling if the |
953 | | // occupancy before was higher, or if the current schedule has register |
954 | | // pressure higher than the excess limits which could lead to more spilling. |
955 | 0 | unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); |
956 | | |
957 | | // Allow memory bound functions to drop to 4 waves if not limited by an |
958 | | // attribute. |
959 | 0 | if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && |
960 | 0 | WavesAfter >= MFI.getMinAllowedOccupancy()) { |
961 | 0 | LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " |
962 | 0 | << MFI.getMinAllowedOccupancy() << " waves\n"); |
963 | 0 | NewOccupancy = WavesAfter; |
964 | 0 | } |
965 | |
966 | 0 | if (NewOccupancy < DAG.MinOccupancy) { |
967 | 0 | DAG.MinOccupancy = NewOccupancy; |
968 | 0 | MFI.limitOccupancy(DAG.MinOccupancy); |
969 | 0 | DAG.RegionsWithMinOcc.reset(); |
970 | 0 | LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " |
971 | 0 | << DAG.MinOccupancy << ".\n"); |
972 | 0 | } |
973 | |
974 | 0 | unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); |
975 | 0 | unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); |
976 | 0 | if (PressureAfter.getVGPRNum(false) > MaxVGPRs || |
977 | 0 | PressureAfter.getAGPRNum() > MaxVGPRs || |
978 | 0 | PressureAfter.getSGPRNum() > MaxSGPRs) { |
979 | 0 | DAG.RescheduleRegions[RegionIdx] = true; |
980 | 0 | DAG.RegionsWithHighRP[RegionIdx] = true; |
981 | 0 | DAG.RegionsWithExcessRP[RegionIdx] = true; |
982 | 0 | } |
983 | | |
984 | | // Revert if this region's schedule would cause a drop in occupancy or |
985 | | // spilling. |
986 | 0 | if (shouldRevertScheduling(WavesAfter)) { |
987 | 0 | revertScheduling(); |
988 | 0 | } else { |
989 | 0 | DAG.Pressure[RegionIdx] = PressureAfter; |
990 | 0 | DAG.RegionsWithMinOcc[RegionIdx] = |
991 | 0 | PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; |
992 | 0 | } |
993 | 0 | } |
994 | | |
995 | | unsigned |
996 | | GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, |
997 | | DenseMap<unsigned, unsigned> &ReadyCycles, |
998 | 0 | const TargetSchedModel &SM) { |
999 | 0 | unsigned ReadyCycle = CurrCycle; |
1000 | 0 | for (auto &D : SU.Preds) { |
1001 | 0 | if (D.isAssignedRegDep()) { |
1002 | 0 | MachineInstr *DefMI = D.getSUnit()->getInstr(); |
1003 | 0 | unsigned Latency = SM.computeInstrLatency(DefMI); |
1004 | 0 | unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum]; |
1005 | 0 | ReadyCycle = std::max(ReadyCycle, DefReady + Latency); |
1006 | 0 | } |
1007 | 0 | } |
1008 | 0 | ReadyCycles[SU.NodeNum] = ReadyCycle; |
1009 | 0 | return ReadyCycle; |
1010 | 0 | } |
1011 | | |
1012 | | #ifndef NDEBUG |
1013 | | struct EarlierIssuingCycle { |
1014 | | bool operator()(std::pair<MachineInstr *, unsigned> A, |
1015 | 0 | std::pair<MachineInstr *, unsigned> B) const { |
1016 | 0 | return A.second < B.second; |
1017 | 0 | } |
1018 | | }; |
1019 | | |
1020 | | static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>, |
1021 | 0 | EarlierIssuingCycle> &ReadyCycles) { |
1022 | 0 | if (ReadyCycles.empty()) |
1023 | 0 | return; |
1024 | 0 | unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber(); |
1025 | 0 | dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum |
1026 | 0 | << " ##################\n# Cycle #\t\t\tInstruction " |
1027 | 0 | " " |
1028 | 0 | " \n"; |
1029 | 0 | unsigned IPrev = 1; |
1030 | 0 | for (auto &I : ReadyCycles) { |
1031 | 0 | if (I.second > IPrev + 1) |
1032 | 0 | dbgs() << "****************************** BUBBLE OF " << I.second - IPrev |
1033 | 0 | << " CYCLES DETECTED ******************************\n\n"; |
1034 | 0 | dbgs() << "[ " << I.second << " ] : " << *I.first << "\n"; |
1035 | 0 | IPrev = I.second; |
1036 | 0 | } |
1037 | 0 | } |
1038 | | #endif |
1039 | | |
1040 | | ScheduleMetrics |
1041 | 0 | GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) { |
1042 | 0 | #ifndef NDEBUG |
1043 | 0 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
1044 | 0 | ReadyCyclesSorted; |
1045 | 0 | #endif |
1046 | 0 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
1047 | 0 | unsigned SumBubbles = 0; |
1048 | 0 | DenseMap<unsigned, unsigned> ReadyCycles; |
1049 | 0 | unsigned CurrCycle = 0; |
1050 | 0 | for (auto &SU : InputSchedule) { |
1051 | 0 | unsigned ReadyCycle = |
1052 | 0 | computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM); |
1053 | 0 | SumBubbles += ReadyCycle - CurrCycle; |
1054 | 0 | #ifndef NDEBUG |
1055 | 0 | ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle)); |
1056 | 0 | #endif |
1057 | 0 | CurrCycle = ++ReadyCycle; |
1058 | 0 | } |
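 | | // CurrCycle now models an idealized schedule length in which each SU issues
 | | // one cycle after its register dependences are ready, and SumBubbles is the
 | | // number of idle cycles in that model. The metric reported below is the
 | | // bubble count as a ScaleFactor-scaled fraction of the total cycles.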
1059 | 0 | #ifndef NDEBUG |
1060 | 0 | LLVM_DEBUG( |
1061 | 0 | printScheduleModel(ReadyCyclesSorted); |
1062 | 0 | dbgs() << "\n\t" |
1063 | 0 | << "Metric: " |
1064 | 0 | << (SumBubbles |
1065 | 0 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
1066 | 0 | : 1) |
1067 | 0 | << "\n\n"); |
1068 | 0 | #endif |
1069 | |
1070 | 0 | return ScheduleMetrics(CurrCycle, SumBubbles); |
1071 | 0 | } |
1072 | | |
1073 | | ScheduleMetrics |
1074 | 0 | GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) { |
1075 | 0 | #ifndef NDEBUG |
1076 | 0 | std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
1077 | 0 | ReadyCyclesSorted; |
1078 | 0 | #endif |
1079 | 0 | const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
1080 | 0 | unsigned SumBubbles = 0; |
1081 | 0 | DenseMap<unsigned, unsigned> ReadyCycles; |
1082 | 0 | unsigned CurrCycle = 0; |
1083 | 0 | for (auto &MI : DAG) { |
1084 | 0 | SUnit *SU = DAG.getSUnit(&MI); |
1085 | 0 | if (!SU) |
1086 | 0 | continue; |
1087 | 0 | unsigned ReadyCycle = |
1088 | 0 | computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM); |
1089 | 0 | SumBubbles += ReadyCycle - CurrCycle; |
1090 | 0 | #ifndef NDEBUG |
1091 | 0 | ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle)); |
1092 | 0 | #endif |
1093 | 0 | CurrCycle = ++ReadyCycle; |
1094 | 0 | } |
1095 | 0 | #ifndef NDEBUG |
1096 | 0 | LLVM_DEBUG( |
1097 | 0 | printScheduleModel(ReadyCyclesSorted); |
1098 | 0 | dbgs() << "\n\t" |
1099 | 0 | << "Metric: " |
1100 | 0 | << (SumBubbles |
1101 | 0 | ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
1102 | 0 | : 1) |
1103 | 0 | << "\n\n"); |
1104 | 0 | #endif |
1105 | |
1106 | 0 | return ScheduleMetrics(CurrCycle, SumBubbles); |
1107 | 0 | } |
1108 | | |
1109 | 0 | bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { |
1110 | 0 | if (WavesAfter < DAG.MinOccupancy) |
1111 | 0 | return true; |
1112 | | |
1113 | 0 | return false; |
1114 | 0 | } |
1115 | | |
1116 | 0 | bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
1117 | 0 | if (PressureAfter == PressureBefore) |
1118 | 0 | return false; |
1119 | | |
1120 | 0 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
1121 | 0 | return true; |
1122 | | |
1123 | 0 | if (mayCauseSpilling(WavesAfter)) |
1124 | 0 | return true; |
1125 | | |
1126 | 0 | return false; |
1127 | 0 | } |
1128 | | |
1129 | 0 | bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { |
1130 | | // If RP is not reduced in the unclustered reschedule stage, revert to the |
1131 | | // old schedule. |
1132 | 0 | if ((WavesAfter <= PressureBefore.getOccupancy(ST) && |
1133 | 0 | mayCauseSpilling(WavesAfter)) || |
1134 | 0 | GCNSchedStage::shouldRevertScheduling(WavesAfter)) { |
1135 | 0 | LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); |
1136 | 0 | return true; |
1137 | 0 | } |
1138 | | |
1139 | | // Do not attempt to relax schedule even more if we are already spilling. |
1140 | 0 | if (isRegionWithExcessRP()) |
1141 | 0 | return false; |
1142 | | |
1143 | 0 | LLVM_DEBUG( |
1144 | 0 | dbgs() |
1145 | 0 | << "\n\t *** In shouldRevertScheduling ***\n" |
1146 | 0 | << " *********** BEFORE UnclusteredHighRPStage ***********\n"); |
1147 | 0 | ScheduleMetrics MBefore = |
1148 | 0 | getScheduleMetrics(DAG.SUnits); |
1149 | 0 | LLVM_DEBUG( |
1150 | 0 | dbgs() |
1151 | 0 | << "\n *********** AFTER UnclusteredHighRPStage ***********\n"); |
1152 | 0 | ScheduleMetrics MAfter = getScheduleMetrics(DAG); |
1153 | 0 | unsigned OldMetric = MBefore.getMetric(); |
1154 | 0 | unsigned NewMetric = MAfter.getMetric(); |
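 | | // Keep the new schedule only if, roughly,
 | | // (WavesAfter / WavesBefore) * ((OldMetric + Bias) / NewMetric) >= 1.0;
 | | // i.e. the occupancy gain outweighs the latency-metric regression. The
 | | // computation below is carried out in ScaleFactor fixed point.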
1155 | 0 | unsigned WavesBefore = |
1156 | 0 | std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST)); |
1157 | 0 | unsigned Profit = |
1158 | 0 | ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * |
1159 | 0 | ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / |
1160 | 0 | NewMetric) / |
1161 | 0 | ScheduleMetrics::ScaleFactor; |
1162 | 0 | LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after " |
1163 | 0 | << MAfter << "Profit: " << Profit << "\n"); |
1164 | 0 | return Profit < ScheduleMetrics::ScaleFactor; |
1165 | 0 | } |
1166 | | |
1167 | 0 | bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { |
1168 | 0 | if (PressureAfter == PressureBefore) |
1169 | 0 | return false; |
1170 | | |
1171 | 0 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
1172 | 0 | return true; |
1173 | | |
1174 | 0 | if (mayCauseSpilling(WavesAfter)) |
1175 | 0 | return true; |
1176 | | |
1177 | 0 | return false; |
1178 | 0 | } |
1179 | | |
1180 | 0 | bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { |
1181 | 0 | if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
1182 | 0 | return true; |
1183 | | |
1184 | 0 | if (mayCauseSpilling(WavesAfter)) |
1185 | 0 | return true; |
1186 | | |
1187 | 0 | return false; |
1188 | 0 | } |
1189 | | |
1190 | 0 | bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
1191 | 0 | if (mayCauseSpilling(WavesAfter)) |
1192 | 0 | return true; |
1193 | | |
1194 | 0 | return false; |
1195 | 0 | } |
1196 | | |
1197 | 0 | bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { |
1198 | 0 | if (WavesAfter <= MFI.getMinWavesPerEU() && |
1199 | 0 | !PressureAfter.less(ST, PressureBefore) && |
1200 | 0 | isRegionWithExcessRP()) { |
1201 | 0 | LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); |
1202 | 0 | return true; |
1203 | 0 | } |
1204 | | |
1205 | 0 | return false; |
1206 | 0 | } |
1207 | | |
1208 | 0 | void GCNSchedStage::revertScheduling() { |
1209 | 0 | DAG.RegionsWithMinOcc[RegionIdx] = |
1210 | 0 | PressureBefore.getOccupancy(ST) == DAG.MinOccupancy; |
1211 | 0 | LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); |
1212 | 0 | DAG.RescheduleRegions[RegionIdx] = |
1213 | 0 | S.hasNextStage() && |
1214 | 0 | S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule; |
1215 | 0 | DAG.RegionEnd = DAG.RegionBegin; |
1216 | 0 | int SkippedDebugInstr = 0; |
1217 | 0 | for (MachineInstr *MI : Unsched) { |
1218 | 0 | if (MI->isDebugInstr()) { |
1219 | 0 | ++SkippedDebugInstr; |
1220 | 0 | continue; |
1221 | 0 | } |
1222 | | |
1223 | 0 | if (MI->getIterator() != DAG.RegionEnd) { |
1224 | 0 | DAG.BB->remove(MI); |
1225 | 0 | DAG.BB->insert(DAG.RegionEnd, MI); |
1226 | 0 | if (!MI->isDebugInstr()) |
1227 | 0 | DAG.LIS->handleMove(*MI, true); |
1228 | 0 | } |
1229 | | |
1230 | | // Reset read-undef flags and update them later. |
1231 | 0 | for (auto &Op : MI->all_defs()) |
1232 | 0 | Op.setIsUndef(false); |
1233 | 0 | RegisterOperands RegOpers; |
1234 | 0 | RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false); |
1235 | 0 | if (!MI->isDebugInstr()) { |
1236 | 0 | if (DAG.ShouldTrackLaneMasks) { |
1237 | | // Adjust liveness and add missing dead+read-undef flags. |
1238 | 0 | SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); |
1239 | 0 | RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); |
1240 | 0 | } else { |
1241 | | // Adjust for missing dead-def flags. |
1242 | 0 | RegOpers.detectDeadDefs(*MI, *DAG.LIS); |
1243 | 0 | } |
1244 | 0 | } |
1245 | 0 | DAG.RegionEnd = MI->getIterator(); |
1246 | 0 | ++DAG.RegionEnd; |
1247 | 0 | LLVM_DEBUG(dbgs() << "Scheduling " << *MI); |
1248 | 0 | } |
1249 | | |
1250 | | // After reverting schedule, debug instrs will now be at the end of the block |
1251 | | // and RegionEnd will point to the first debug instr. Increment RegionEnd |
1252 | | // past the debug instrs to the actual end of the scheduling region.
1253 | 0 | while (SkippedDebugInstr-- > 0) |
1254 | 0 | ++DAG.RegionEnd; |
1255 | | |
1256 | | // If Unsched.front() instruction is a debug instruction, this will actually |
1257 | | // shrink the region since we moved all debug instructions to the end of the |
1258 | | // block. Find the first instruction that is not a debug instruction. |
1259 | 0 | DAG.RegionBegin = Unsched.front()->getIterator(); |
1260 | 0 | if (DAG.RegionBegin->isDebugInstr()) { |
1261 | 0 | for (MachineInstr *MI : Unsched) { |
1262 | 0 | if (MI->isDebugInstr()) |
1263 | 0 | continue; |
1264 | 0 | DAG.RegionBegin = MI->getIterator(); |
1265 | 0 | break; |
1266 | 0 | } |
1267 | 0 | } |
1268 | | |
1269 | | // Then move the debug instructions back into their correct place and set |
1270 | | // RegionBegin and RegionEnd if needed. |
1271 | 0 | DAG.placeDebugValues(); |
1272 | |
1273 | 0 | DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
1274 | 0 | } |
1275 | | |
1276 | 0 | void PreRARematStage::collectRematerializableInstructions() { |
1277 | 0 | const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI); |
1278 | 0 | for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) { |
1279 | 0 | Register Reg = Register::index2VirtReg(I); |
1280 | 0 | if (!DAG.LIS->hasInterval(Reg)) |
1281 | 0 | continue; |
1282 | | |
1283 | | // TODO: Handle AGPR and SGPR rematerialization |
1284 | 0 | if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) || |
1285 | 0 | !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg)) |
1286 | 0 | continue; |
1287 | | |
1288 | 0 | MachineOperand *Op = DAG.MRI.getOneDef(Reg); |
1289 | 0 | MachineInstr *Def = Op->getParent(); |
1290 | 0 | if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def)) |
1291 | 0 | continue; |
1292 | | |
1293 | 0 | MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg); |
1294 | 0 | if (Def->getParent() == UseI->getParent()) |
1295 | 0 | continue; |
1296 | | |
1297 | | // We are only collecting defs that are defined in another block and are |
1298 | | // live-through or used inside regions at MinOccupancy. This means that the |
1299 | | // register must be in the live-in set for the region. |
1300 | 0 | bool AddedToRematList = false; |
1301 | 0 | for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
1302 | 0 | auto It = DAG.LiveIns[I].find(Reg); |
1303 | 0 | if (It != DAG.LiveIns[I].end() && !It->second.none()) { |
1304 | 0 | if (DAG.RegionsWithMinOcc[I]) { |
1305 | 0 | RematerializableInsts[I][Def] = UseI; |
1306 | 0 | AddedToRematList = true; |
1307 | 0 | } |
1308 | | |
1309 | | // Collect regions with rematerializable reg as live-in to avoid |
1310 | | // searching later when updating RP. |
1311 | 0 | RematDefToLiveInRegions[Def].push_back(I); |
1312 | 0 | } |
1313 | 0 | } |
1314 | 0 | if (!AddedToRematList) |
1315 | 0 | RematDefToLiveInRegions.erase(Def); |
1316 | 0 | } |
1317 | 0 | } |
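
As a rough illustration of the filter implemented above, here is a hedged standalone sketch that keeps only registers with a single def and a single non-debug use, a def outside the use block, trivial rematerializability, and liveness into at least one region at minimum occupancy. VRegInfo, RegionInfo, and collectCandidates are invented names for the sketch; the real code reads this information from MachineRegisterInfo, LiveIntervals, and the per-region caches.

#include <map>
#include <set>
#include <vector>

// Hypothetical stand-ins for the information the real pass queries.
struct VRegInfo {
  int DefBlock = -1;  // block containing the single def
  int UseBlock = -1;  // block containing the single non-debug use
  bool SingleDefSingleUse = false;
  bool TriviallyRematerializable = false;
};

struct RegionInfo {
  bool AtMinOccupancy = false;
  std::set<int> LiveIns; // virtual registers live into the region
};

// Keep only defs defined in a block other than their use's block and live
// into at least one region at minimum occupancy; group them per region.
std::map<int, std::set<int>>
collectCandidates(const std::map<int, VRegInfo> &VRegs,
                  const std::vector<RegionInfo> &Regions) {
  std::map<int, std::set<int>> Candidates;
  for (const auto &[Reg, Info] : VRegs) {
    if (!Info.SingleDefSingleUse || !Info.TriviallyRematerializable ||
        Info.DefBlock == Info.UseBlock)
      continue;
    for (size_t I = 0; I < Regions.size(); ++I)
      if (Regions[I].AtMinOccupancy && Regions[I].LiveIns.count(Reg))
        Candidates[I].insert(Reg);
  }
  return Candidates;
}

int main() {
  std::map<int, VRegInfo> VRegs{{7, {0, 2, true, true}}};
  std::vector<RegionInfo> Regions{{false, {}}, {true, {7}}, {true, {7}}};
  auto C = collectCandidates(VRegs, Regions);
  return (C.count(1) && C.count(2)) ? 0 : 1; // reg 7 qualifies in regions 1 and 2
}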
1318 | | |
1319 | | bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, |
1320 | 0 | const TargetInstrInfo *TII) { |
1321 | | // Temporary copies of cached variables we will be modifying and replacing if |
1322 | | // sinking succeeds. |
1323 | 0 | SmallVector< |
1324 | 0 | std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32> |
1325 | 0 | NewRegions; |
1326 | 0 | DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns; |
1327 | 0 | DenseMap<unsigned, GCNRegPressure> NewPressure; |
1328 | 0 | BitVector NewRescheduleRegions; |
1329 | 0 | LiveIntervals *LIS = DAG.LIS; |
1330 | |
1331 | 0 | NewRegions.resize(DAG.Regions.size()); |
1332 | 0 | NewRescheduleRegions.resize(DAG.Regions.size()); |
1333 | | |
1334 | | // Collect only regions that has a rematerializable def as a live-in. |
1335 | 0 | SmallSet<unsigned, 16> ImpactedRegions; |
1336 | 0 | for (const auto &It : RematDefToLiveInRegions) |
1337 | 0 | ImpactedRegions.insert(It.second.begin(), It.second.end()); |
1338 | | |
1339 | | // Make copies of the register pressure and live-ins caches that will be
1340 | | // updated as we rematerialize.
1341 | 0 | for (auto Idx : ImpactedRegions) { |
1342 | 0 | NewPressure[Idx] = DAG.Pressure[Idx]; |
1343 | 0 | NewLiveIns[Idx] = DAG.LiveIns[Idx]; |
1344 | 0 | } |
1345 | 0 | NewRegions = DAG.Regions; |
1346 | 0 | NewRescheduleRegions.reset(); |
1347 | |
1348 | 0 | DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef; |
1349 | 0 | bool Improved = false; |
1350 | 0 | for (auto I : ImpactedRegions) { |
1351 | 0 | if (!DAG.RegionsWithMinOcc[I]) |
1352 | 0 | continue; |
1353 | | |
1354 | 0 | Improved = false; |
1355 | 0 | int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts()); |
1356 | 0 | int SGPRUsage = NewPressure[I].getSGPRNum(); |
1357 | | |
1358 | | // TODO: Handle occupancy drop due to AGPR and SGPR. |
1359 | | // Check whether the occupancy drop is caused by VGPR usage rather than SGPR usage.
1360 | 0 | if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy) |
1361 | 0 | break; |
1362 | | |
1363 | | // The occupancy of this region could have been improved by a previous |
1364 | | // iteration's sinking of defs. |
1365 | 0 | if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) { |
1366 | 0 | NewRescheduleRegions[I] = true; |
1367 | 0 | Improved = true; |
1368 | 0 | continue; |
1369 | 0 | } |
1370 | | |
1371 | | // First check if we have enough trivially rematerializable instructions to
1372 | | // improve occupancy. Optimistically assume that every instruction we are
1373 | | // able to sink decreases RP.
1374 | 0 | int TotalSinkableRegs = 0; |
1375 | 0 | for (const auto &It : RematerializableInsts[I]) { |
1376 | 0 | MachineInstr *Def = It.first; |
1377 | 0 | Register DefReg = Def->getOperand(0).getReg(); |
1378 | 0 | TotalSinkableRegs += |
1379 | 0 | SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]); |
1380 | 0 | } |
1381 | 0 | int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs; |
1382 | 0 | unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink); |
1383 | | // If, in the most optimistic scenario, we cannot improve occupancy, then do
1384 | | // not attempt to sink any instructions.
1385 | 0 | if (OptimisticOccupancy <= DAG.MinOccupancy) |
1386 | 0 | break; |
1387 | | |
1388 | 0 | unsigned ImproveOccupancy = 0; |
1389 | 0 | SmallVector<MachineInstr *, 4> SinkedDefs; |
1390 | 0 | for (auto &It : RematerializableInsts[I]) { |
1391 | 0 | MachineInstr *Def = It.first; |
1392 | 0 | MachineBasicBlock::iterator InsertPos = |
1393 | 0 | MachineBasicBlock::iterator(It.second); |
1394 | 0 | Register Reg = Def->getOperand(0).getReg(); |
1395 | | // Rematerialize MI to its use block. Since we are only rematerializing |
1396 | | // instructions that do not have any virtual reg uses, we do not need to |
1397 | | // call LiveRangeEdit::allUsesAvailableAt() and |
1398 | | // LiveRangeEdit::canRematerializeAt(). |
1399 | 0 | TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, |
1400 | 0 | Def->getOperand(0).getSubReg(), *Def, *DAG.TRI); |
1401 | 0 | MachineInstr *NewMI = &*std::prev(InsertPos); |
1402 | 0 | LIS->InsertMachineInstrInMaps(*NewMI); |
1403 | 0 | LIS->removeInterval(Reg); |
1404 | 0 | LIS->createAndComputeVirtRegInterval(Reg); |
1405 | 0 | InsertedMIToOldDef[NewMI] = Def; |
1406 | | |
1407 | | // Update region boundaries in the scheduling region we sank from, since we
1408 | | // may sink an instruction that was at the beginning or end of its region.
1409 | 0 | DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr, |
1410 | 0 | /*Removing =*/true); |
1411 | | |
1412 | | // Update region boundaries in the region we sank to.
1413 | 0 | DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI); |
1414 | |
1415 | 0 | LaneBitmask PrevMask = NewLiveIns[I][Reg]; |
1416 | | // FIXME: Also update cached pressure for where the def was sunk from.
1417 | | // Update RP for all regions that have this reg as a live-in and remove
1418 | | // the reg from all regions as a live-in.
1419 | 0 | for (auto Idx : RematDefToLiveInRegions[Def]) { |
1420 | 0 | NewLiveIns[Idx].erase(Reg); |
1421 | 0 | if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) { |
1422 | | // Def is live-through and not used in this block. |
1423 | 0 | NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI); |
1424 | 0 | } else { |
1425 | | // Def is used and rematerialized into this block. |
1426 | 0 | GCNDownwardRPTracker RPT(*LIS); |
1427 | 0 | auto *NonDbgMI = &*skipDebugInstructionsForward( |
1428 | 0 | NewRegions[Idx].first, NewRegions[Idx].second); |
1429 | 0 | RPT.reset(*NonDbgMI, &NewLiveIns[Idx]); |
1430 | 0 | RPT.advance(NewRegions[Idx].second); |
1431 | 0 | NewPressure[Idx] = RPT.moveMaxPressure(); |
1432 | 0 | } |
1433 | 0 | } |
1434 | |
1435 | 0 | SinkedDefs.push_back(Def); |
1436 | 0 | ImproveOccupancy = NewPressure[I].getOccupancy(ST); |
1437 | 0 | if (ImproveOccupancy > DAG.MinOccupancy) |
1438 | 0 | break; |
1439 | 0 | } |
1440 | | |
1441 | | // Remove defs we just sank from all regions' lists of sinkable defs.
1442 | 0 | for (auto &Def : SinkedDefs) |
1443 | 0 | for (auto TrackedIdx : RematDefToLiveInRegions[Def]) |
1444 | 0 | RematerializableInsts[TrackedIdx].erase(Def); |
1445 | |
1446 | 0 | if (ImproveOccupancy <= DAG.MinOccupancy) |
1447 | 0 | break; |
1448 | | |
1449 | 0 | NewRescheduleRegions[I] = true; |
1450 | 0 | Improved = true; |
1451 | 0 | } |
1452 | |
1453 | 0 | if (!Improved) { |
1454 | | // Occupancy was not improved for every region that was at MinOccupancy.
1455 | | // Undo sinking and remove newly rematerialized instructions. |
1456 | 0 | for (auto &Entry : InsertedMIToOldDef) { |
1457 | 0 | MachineInstr *MI = Entry.first; |
1458 | 0 | MachineInstr *OldMI = Entry.second; |
1459 | 0 | Register Reg = MI->getOperand(0).getReg(); |
1460 | 0 | LIS->RemoveMachineInstrFromMaps(*MI); |
1461 | 0 | MI->eraseFromParent(); |
1462 | 0 | OldMI->clearRegisterDeads(Reg); |
1463 | 0 | LIS->removeInterval(Reg); |
1464 | 0 | LIS->createAndComputeVirtRegInterval(Reg); |
1465 | 0 | } |
1466 | 0 | return false; |
1467 | 0 | } |
1468 | | |
1469 | | // Occupancy was improved for all regions. |
1470 | 0 | for (auto &Entry : InsertedMIToOldDef) { |
1471 | 0 | MachineInstr *MI = Entry.first; |
1472 | 0 | MachineInstr *OldMI = Entry.second; |
1473 | | |
1474 | | // Remove OldMI from BBLiveInMap since we are sinking it from its MBB. |
1475 | 0 | DAG.BBLiveInMap.erase(OldMI); |
1476 | | |
1477 | | // Remove OldMI and update LIS |
1478 | 0 | Register Reg = MI->getOperand(0).getReg(); |
1479 | 0 | LIS->RemoveMachineInstrFromMaps(*OldMI); |
1480 | 0 | OldMI->eraseFromParent(); |
1481 | 0 | LIS->removeInterval(Reg); |
1482 | 0 | LIS->createAndComputeVirtRegInterval(Reg); |
1483 | 0 | } |
1484 | | |
1485 | | // Update live-ins, register pressure, and regions caches. |
1486 | 0 | for (auto Idx : ImpactedRegions) { |
1487 | 0 | DAG.LiveIns[Idx] = NewLiveIns[Idx]; |
1488 | 0 | DAG.Pressure[Idx] = NewPressure[Idx]; |
1489 | 0 | DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent()); |
1490 | 0 | } |
1491 | 0 | DAG.Regions = NewRegions; |
1492 | 0 | DAG.RescheduleRegions = NewRescheduleRegions; |
1493 | |
1494 | 0 | SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
1495 | 0 | MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); |
1496 | |
1497 | 0 | return true; |
1498 | 0 | } |
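
The early-exit gate near the top of the per-region loop above (the TotalSinkableRegs / OptimisticOccupancy check) can be summarized by the standalone sketch below. The occupancyWithVGPRs formula here is a deliberately toy model, not the real GCNSubtarget::getOccupancyWithNumVGPRs table, and worthAttemptingSink is an invented helper name.

#include <algorithm>
#include <iostream>

// Toy occupancy model (NOT the real hardware numbers): assume a budget of
// 256 VGPRs per SIMD and a cap of 10 waves, so fewer VGPRs per wave means
// more waves fit.
unsigned occupancyWithVGPRs(unsigned VGPRs) {
  if (VGPRs == 0)
    return 10;
  return std::min(10u, 256u / VGPRs);
}

// The optimistic gate used before attempting any sinking: pretend every
// sinkable register frees pressure, and bail out if even that cannot beat
// the current minimum occupancy.
bool worthAttemptingSink(unsigned VGPRUsage, unsigned TotalSinkableRegs,
                         unsigned MinOccupancy) {
  unsigned VGPRsAfterSink =
      VGPRUsage > TotalSinkableRegs ? VGPRUsage - TotalSinkableRegs : 0;
  return occupancyWithVGPRs(VGPRsAfterSink) > MinOccupancy;
}

int main() {
  // 130 VGPRs -> 1 wave in this toy model; sinking 10 registers reaches
  // 120 VGPRs -> 2 waves, so the attempt is worthwhile.
  std::cout << worthAttemptingSink(130, 10, 1) << '\n'; // 1
  std::cout << worthAttemptingSink(130, 1, 1) << '\n';  // 0
}

If the gate passes, the code above sinks candidates one by one until occupancy improves, and fully undoes the rematerializations if any min-occupancy region fails to improve.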
1499 | | |
1500 | | // Copied from MachineLICM |
1501 | 0 | bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { |
1502 | 0 | if (!DAG.TII->isTriviallyReMaterializable(MI)) |
1503 | 0 | return false; |
1504 | | |
1505 | 0 | for (const MachineOperand &MO : MI.all_uses()) |
1506 | 0 | if (MO.getReg().isVirtual()) |
1507 | 0 | return false; |
1508 | | |
1509 | 0 | return true; |
1510 | 0 | } |
1511 | | |
1512 | | // When removing, we will have to check both the beginning and the end of the
1513 | | // region. When inserting, we will only have to check whether we are inserting
1514 | | // NewMI in front of a scheduling region and do not need to check the end,
1515 | | // since we will only ever be inserting before an already existing MI.
1516 | | void GCNScheduleDAGMILive::updateRegionBoundaries( |
1517 | | SmallVectorImpl<std::pair<MachineBasicBlock::iterator, |
1518 | | MachineBasicBlock::iterator>> &RegionBoundaries, |
1519 | 0 | MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) { |
1520 | 0 | unsigned I = 0, E = RegionBoundaries.size(); |
1521 | | // Search for the first region of the block where MI is located.
1522 | 0 | while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent()) |
1523 | 0 | ++I; |
1524 | |
1525 | 0 | for (; I != E; ++I) { |
1526 | 0 | if (MI->getParent() != RegionBoundaries[I].first->getParent()) |
1527 | 0 | return; |
1528 | | |
1529 | 0 | if (Removing && MI == RegionBoundaries[I].first && |
1530 | 0 | MI == RegionBoundaries[I].second) { |
1531 | | // MI is in a region of size 1; after removing it, the region will be
1532 | | // size 0. Set RegionBegin and RegionEnd to the block's past-the-end iterator.
1533 | 0 | RegionBoundaries[I] = |
1534 | 0 | std::pair(MI->getParent()->end(), MI->getParent()->end()); |
1535 | 0 | return; |
1536 | 0 | } |
1537 | 0 | if (MI == RegionBoundaries[I].first) { |
1538 | 0 | if (Removing) |
1539 | 0 | RegionBoundaries[I] = |
1540 | 0 | std::pair(std::next(MI), RegionBoundaries[I].second); |
1541 | 0 | else |
1542 | | // NewMI was inserted in front of the region; set the new RegionBegin to NewMI.
1543 | 0 | RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI), |
1544 | 0 | RegionBoundaries[I].second); |
1545 | 0 | return; |
1546 | 0 | } |
1547 | 0 | if (Removing && MI == RegionBoundaries[I].second) { |
1548 | 0 | RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI)); |
1549 | 0 | return; |
1550 | 0 | } |
1551 | 0 | } |
1552 | 0 | } |
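
A simplified, single-block mirror of this boundary update, written against plain std::list iterators so the three removal cases and the single insertion case stand out. This is a sketch, not the real API: the parent-block checks and the Removing flag are dropped, and regions are stored here as inclusive [first, last] pairs purely for illustration.

#include <iterator>
#include <list>
#include <utility>
#include <vector>

using Block = std::list<int>;
// In this sketch a region is an inclusive [first, last] pair of iterators.
using Region = std::pair<Block::iterator, Block::iterator>;

// Boundary update when the element at MI is about to be removed from BB.
// Only the cached boundaries change here; erasing MI itself happens elsewhere.
void updateOnRemove(Block &BB, std::vector<Region> &Regions,
                    Block::iterator MI) {
  for (Region &R : Regions) {
    if (MI == R.first && MI == R.second) {
      R = {BB.end(), BB.end()}; // size-1 region becomes empty
      return;
    }
    if (MI == R.first) {
      R.first = std::next(MI); // region now starts one element later
      return;
    }
    if (MI == R.second) {
      R.second = std::prev(MI); // region now ends one element earlier
      return;
    }
  }
}

// Boundary update after NewMI was inserted immediately before InsertPos.
void updateOnInsert(std::vector<Region> &Regions, Block::iterator InsertPos,
                    Block::iterator NewMI) {
  for (Region &R : Regions)
    if (InsertPos == R.first) { // inserted in front of the region
      R.first = NewMI;
      return;
    }
}

int main() {
  Block BB{1, 2, 3};
  std::vector<Region> Regions{{BB.begin(), std::prev(BB.end())}};
  updateOnRemove(BB, Regions, BB.begin()); // region now begins at element 2
  return *Regions[0].first == 2 ? 0 : 1;
}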
1553 | | |
1554 | 0 | static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { |
1555 | 0 | return std::any_of( |
1556 | 0 | DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) { |
1557 | 0 | unsigned Opc = MI->getOpcode(); |
1558 | 0 | return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT; |
1559 | 0 | }); |
1560 | 0 | } |
1561 | | |
1562 | | GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( |
1563 | | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, |
1564 | | bool RemoveKillFlags) |
1565 | 0 | : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} |
1566 | | |
1567 | 0 | void GCNPostScheduleDAGMILive::schedule() { |
1568 | 0 | HasIGLPInstrs = hasIGLPInstrs(this); |
1569 | 0 | if (HasIGLPInstrs) { |
1570 | 0 | SavedMutations.clear(); |
1571 | 0 | SavedMutations.swap(Mutations); |
1572 | 0 | addMutation(createIGroupLPDAGMutation(/*IsReentry=*/true)); |
1573 | 0 | } |
1574 | |
1575 | 0 | ScheduleDAGMI::schedule(); |
1576 | 0 | } |
1577 | | |
1578 | 0 | void GCNPostScheduleDAGMILive::finalizeSchedule() { |
1579 | 0 | if (HasIGLPInstrs) |
1580 | 0 | SavedMutations.swap(Mutations); |
1581 | |
1582 | 0 | ScheduleDAGMI::finalizeSchedule(); |
1583 | 0 | } |