1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17///
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
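//
// Illustrative overview (a simplified sketch, assuming the stage names used by
// createSchedStage() below; see that function for the authoritative list):
//
//   OccInitialSchedule              - schedule every region for max occupancy
//   UnclusteredHighRPReschedule     - reschedule high-RP regions w/o clustering
//   ClusteredLowOccupancyReschedule - re-enable clustering at lowered occupancy
//   PreRARematerialize              - rematerialize defs to raise occupancy or
//                                     reduce spilling
//   ILPInitialSchedule / MemoryClauseInitialSchedule - alternative initial
//                                     stages used by the ILP and memory-clause
//                                     strategies
//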
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNRegPressure.h"
31#include "llvm/ADT/STLExtras.h"
34#include "llvm/MC/LaneBitmask.h"
36
37#define DEBUG_TYPE "machine-scheduler"
38
39using namespace llvm;
40
42 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
43 cl::desc("Disable unclustered high register pressure "
44 "reduction scheduling stage."),
45 cl::init(false));
46
48 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
49 cl::desc("Disable clustered low occupancy "
50 "rescheduling for ILP scheduling stage."),
51 cl::init(false));
52
54 "amdgpu-schedule-metric-bias", cl::Hidden,
56 "Sets the bias which adds weight to occupancy vs latency. Set it to "
57 "100 to chase the occupancy only."),
58 cl::init(10));
59
60static cl::opt<bool>
61 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
62 cl::desc("Relax occupancy targets for kernels which are memory "
63 "bound (amdgpu-membound-threshold), or "
64 "Wave Limited (amdgpu-limit-wave-threshold)."),
65 cl::init(false));
66
68 "amdgpu-use-amdgpu-trackers", cl::Hidden,
69 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
70 cl::init(false));
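// For illustration only: cl::opt flags like the ones above are typically
// toggled from the command line, e.g. directly through llc or via clang's
// -mllvm pass-through (the bias value below is an arbitrary example):
//
//   llc -mtriple=amdgcn -amdgpu-schedule-metric-bias=50 -amdgpu-use-amdgpu-trackers ...
//   clang ... -mllvm -amdgpu-disable-unclustered-high-rp-reschedule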
71
72const unsigned ScheduleMetrics::ScaleFactor = 100;
73
78
79void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
80 GenericScheduler::initialize(DAG);
81
82 MF = &DAG->MF;
83
84 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
85
86 SGPRExcessLimit =
87 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
88 VGPRExcessLimit =
89 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
90
92 // Set the initial TargetOccupancy to the maximum occupancy that we can
93 // achieve for this function. This effectively sets a lower bound on the
94 // 'Critical' register limits in the scheduler.
95 // Allow for lower occupancy targets if the kernel is wave limited or memory
96 // bound and the relaxed occupancy feature is enabled.
99 SGPRCriticalLimit =
100 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
101
102 if (!KnownExcessRP) {
103 VGPRCriticalLimit = std::min(
104 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
105 VGPRExcessLimit);
106 } else {
107 // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
108 // returns a reasonably small number for targets with lots of VGPRs, such
109 // as GFX10 and GFX11.
110 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
111 "VGPRCriticalLimit calculation method.\n");
112 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
113 unsigned Granule =
114 AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
115 unsigned Addressable =
116 AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
117 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
118 VGPRBudget = std::max(VGPRBudget, Granule);
119 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
120 }
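 // Worked example with illustrative numbers: with 256 addressable VGPRs, a
 // target occupancy of 6 waves, and an allocation granule of 4, the budget is
 // alignDown(256 / 6, 4) = alignDown(42, 4) = 40 VGPRs; the std::max above
 // keeps the budget from dropping below a single granule.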
121
122 // Subtract error margin and bias from register limits and avoid overflow.
123 SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
124 VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
125 SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
126 VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
127
128 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
129 << ", VGPRExcessLimit = " << VGPRExcessLimit
130 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
131 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
132}
133
134/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
135/// current register pressure.
136///
137/// This works for the common case, but it has a few exceptions that have been
138/// observed through trial and error:
139/// - Explicit physical register operands
140/// - Subregister definitions
141///
142/// In both of those cases, PressureDiff doesn't represent the actual pressure,
143/// and querying LiveIntervals through the RegPressureTracker is needed to get
144/// an accurate value.
145///
146/// We should eventually only use PressureDiff for maximum performance, but this
147/// already allows 80% of SUs to take the fast path without changing scheduling
148/// at all. Further changes would either change scheduling, or require a lot
149/// more logic to recover an accurate pressure estimate from the PressureDiffs.
150static bool canUsePressureDiffs(const SUnit &SU) {
151 if (!SU.isInstr())
152 return false;
153
154 // Cannot use pressure diffs for subregister defs or with physregs, it's
155 // imprecise in both cases.
156 for (const auto &Op : SU.getInstr()->operands()) {
157 if (!Op.isReg() || Op.isImplicit())
158 continue;
159 if (Op.getReg().isPhysical() ||
160 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
161 return false;
162 }
163 return true;
164}
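// For example (illustrative MIR, not taken from a real test): a subregister
// def such as
//   undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
// or any explicit physical-register operand forces the slow RegPressureTracker
// path, whereas a plain full-register def like
//   %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
// can take the cached PressureDiff fast path (the implicit $exec operand is
// skipped by the loop above).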
165
166static void getRegisterPressures(
167 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
168 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
169 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
170 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
171 // getDownwardPressure() and getUpwardPressure() make temporary changes to
172 // the tracker, so we need to pass those functions a non-const copy.
173 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
174 if (!GCNTrackers) {
175 AtTop
176 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
177 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
178
179 return;
180 }
181
182 // GCNTrackers
183 Pressure.resize(4, 0);
184 MachineInstr *MI = SU->getInstr();
185 GCNRegPressure NewPressure;
186 if (AtTop) {
187 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
188 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
189 } else {
190 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
191 TempUpwardTracker.recede(*MI);
192 NewPressure = TempUpwardTracker.getPressure();
193 }
194 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
195 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
196 NewPressure.getArchVGPRNum();
197 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
198}
199
200void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
201 bool AtTop,
202 const RegPressureTracker &RPTracker,
203 const SIRegisterInfo *SRI,
204 unsigned SGPRPressure,
205 unsigned VGPRPressure, bool IsBottomUp) {
206 Cand.SU = SU;
207 Cand.AtTop = AtTop;
208
209 if (!DAG->isTrackingPressure())
210 return;
211
212 Pressure.clear();
213 MaxPressure.clear();
214
215 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
216 // possible over querying the RegPressureTracker.
217 //
218 // RegPressureTracker will make a lot of LIS queries, which are very
219 // expensive; it is considered a slow function in this context.
220 //
221 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
222 // trivial lookup into an array. It is pretty much free.
223 //
224 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
225 // PressureDiffs.
226 if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
227 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
228 DownwardTracker, UpwardTracker, DAG, SRI);
229 } else {
230 // Reserve 4 slots.
231 Pressure.resize(4, 0);
232 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
233 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
234
235 for (const auto &Diff : DAG->getPressureDiff(SU)) {
236 if (!Diff.isValid())
237 continue;
238 // PressureDiffs is always bottom-up so if we're working top-down we need
239 // to invert its sign.
240 Pressure[Diff.getPSet()] +=
241 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
242 }
243
244#ifdef EXPENSIVE_CHECKS
245 std::vector<unsigned> CheckPressure, CheckMaxPressure;
246 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
247 DownwardTracker, UpwardTracker, DAG, SRI);
248 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
249 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
250 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
251 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
252 errs() << "Register Pressure is inaccurate when calculated through "
253 "PressureDiff\n"
254 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
255 << ", expected "
256 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
257 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
258 << ", expected "
259 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
260 report_fatal_error("inaccurate register pressure calculation");
261 }
262#endif
263 }
264
265 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
266 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
267
268 // If two instructions increase the pressure of different register sets
269 // by the same amount, the generic scheduler will prefer to schedule the
270 // instruction that increases the set with the least amount of registers,
271 // which in our case would be SGPRs. This is rarely what we want, so
272 // when we report excess/critical register pressure, we do it either
273 // only for VGPRs or only for SGPRs.
274
275 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
276 const unsigned MaxVGPRPressureInc = 16;
277 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
278 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
279
280 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
281 // to increase the likelihood we don't go over the limits. We should improve
282 // the analysis to look through dependencies to find the path with the least
283 // register pressure.
284
285 // We only need to update the RPDelta for instructions that increase register
286 // pressure. Instructions that decrease or keep reg pressure the same will be
287 // marked as RegExcess in tryCandidate() when they are compared with
288 // instructions that increase the register pressure.
289 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
290 HasHighPressure = true;
291 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
292 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
293 }
294
295 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
296 HasHighPressure = true;
297 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
298 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
299 }
300
301 // Register pressure is considered 'CRITICAL' if it is approaching a value
302 // that would reduce the wave occupancy for the execution unit. When
303 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure has
304 // the same cost, so we don't need to prefer one over the other.
305
306 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
307 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
308
309 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
310 HasHighPressure = true;
311 if (SGPRDelta > VGPRDelta) {
312 Cand.RPDelta.CriticalMax =
313 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
314 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
315 } else {
316 Cand.RPDelta.CriticalMax =
317 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
318 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
319 }
320 }
321}
322
323// This function is mostly cut and pasted from
324// GenericScheduler::pickNodeFromQueue()
325void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
326 const CandPolicy &ZonePolicy,
327 const RegPressureTracker &RPTracker,
328 SchedCandidate &Cand,
329 bool IsBottomUp) {
330 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
332 unsigned SGPRPressure = 0;
333 unsigned VGPRPressure = 0;
334 if (DAG->isTrackingPressure()) {
335 if (!GCNTrackers) {
336 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
337 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
338 } else {
339 GCNRPTracker *T = IsBottomUp
340 ? static_cast<GCNRPTracker *>(&UpwardTracker)
341 : static_cast<GCNRPTracker *>(&DownwardTracker);
342 SGPRPressure = T->getPressure().getSGPRNum();
343 VGPRPressure = T->getPressure().getArchVGPRNum();
344 }
345 }
346 ReadyQueue &Q = Zone.Available;
347 for (SUnit *SU : Q) {
348
349 SchedCandidate TryCand(ZonePolicy);
350 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
351 VGPRPressure, IsBottomUp);
352 // Pass SchedBoundary only when comparing nodes from the same boundary.
353 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
354 tryCandidate(Cand, TryCand, ZoneArg);
355 if (TryCand.Reason != NoCand) {
356 // Initialize resource delta if needed in case future heuristics query it.
357 if (TryCand.ResDelta == SchedResourceDelta())
358 TryCand.initResourceDelta(Zone.DAG, SchedModel);
359 Cand.setBest(TryCand);
361 }
362 }
363}
364
365// This function is mostly cut and pasted from
366// GenericScheduler::pickNodeBidirectional()
367SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
368 // Schedule as far as possible in the direction of no choice. This is most
369 // efficient, but also provides the best heuristics for CriticalPSets.
370 if (SUnit *SU = Bot.pickOnlyChoice()) {
371 IsTopNode = false;
372 return SU;
373 }
374 if (SUnit *SU = Top.pickOnlyChoice()) {
375 IsTopNode = true;
376 return SU;
377 }
378 // Set the bottom-up policy based on the state of the current bottom zone and
379 // the instructions outside the zone, including the top zone.
380 CandPolicy BotPolicy;
381 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
382 // Set the top-down policy based on the state of the current top zone and
383 // the instructions outside the zone, including the bottom zone.
384 CandPolicy TopPolicy;
385 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
386
387 // See if BotCand is still valid (because we previously scheduled from Top).
388 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
389 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
390 BotCand.Policy != BotPolicy) {
391 BotCand.reset(CandPolicy());
392 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
393 /*IsBottomUp=*/true);
394 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
395 } else {
397#ifndef NDEBUG
398 if (VerifyScheduling) {
399 SchedCandidate TCand;
400 TCand.reset(CandPolicy());
401 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
402 /*IsBottomUp=*/true);
403 assert(TCand.SU == BotCand.SU &&
404 "Last pick result should correspond to re-picking right now");
405 }
406#endif
407 }
408
409 // Check if the top Q has a better candidate.
410 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
411 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
412 TopCand.Policy != TopPolicy) {
413 TopCand.reset(CandPolicy());
414 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
415 /*IsBottomUp=*/false);
416 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
417 } else {
419#ifndef NDEBUG
420 if (VerifyScheduling) {
421 SchedCandidate TCand;
422 TCand.reset(CandPolicy());
423 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
424 /*IsBottomUp=*/false);
425 assert(TCand.SU == TopCand.SU &&
426 "Last pick result should correspond to re-picking right now");
427 }
428#endif
429 }
430
431 // Pick best from BotCand and TopCand.
432 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
433 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
434 SchedCandidate Cand = BotCand;
435 TopCand.Reason = NoCand;
436 tryCandidate(Cand, TopCand, nullptr);
437 if (TopCand.Reason != NoCand) {
438 Cand.setBest(TopCand);
439 }
440 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
441
442 IsTopNode = Cand.AtTop;
443 return Cand.SU;
444}
445
446// This function is mostly cut and pasted from
447// GenericScheduler::pickNode()
448SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
449 if (DAG->top() == DAG->bottom()) {
450 assert(Top.Available.empty() && Top.Pending.empty() &&
451 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
452 return nullptr;
453 }
454 SUnit *SU;
455 do {
456 if (RegionPolicy.OnlyTopDown) {
457 SU = Top.pickOnlyChoice();
458 if (!SU) {
459 CandPolicy NoPolicy;
460 TopCand.reset(NoPolicy);
461 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
462 /*IsBottomUp=*/false);
463 assert(TopCand.Reason != NoCand && "failed to find a candidate");
464 SU = TopCand.SU;
465 }
466 IsTopNode = true;
467 } else if (RegionPolicy.OnlyBottomUp) {
468 SU = Bot.pickOnlyChoice();
469 if (!SU) {
470 CandPolicy NoPolicy;
471 BotCand.reset(NoPolicy);
472 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
473 /*IsBottomUp=*/true);
474 assert(BotCand.Reason != NoCand && "failed to find a candidate");
475 SU = BotCand.SU;
476 }
477 IsTopNode = false;
478 } else {
479 SU = pickNodeBidirectional(IsTopNode);
480 }
481 } while (SU->isScheduled);
482
483 if (SU->isTopReady())
484 Top.removeReady(SU);
485 if (SU->isBottomReady())
486 Bot.removeReady(SU);
487
488 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
489 << *SU->getInstr());
490 return SU;
491}
492
493void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
494 if (GCNTrackers) {
495 MachineInstr *MI = SU->getInstr();
496 IsTopNode ? (void)DownwardTracker.advance(MI, false)
497 : UpwardTracker.recede(*MI);
498 }
499
500 return GenericScheduler::schedNode(SU, IsTopNode);
501}
502
507
510 if (!CurrentStage)
511 CurrentStage = SchedStages.begin();
512 else
513 CurrentStage++;
514
515 return CurrentStage != SchedStages.end();
516}
517
520 return std::next(CurrentStage) != SchedStages.end();
521}
522
524 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
525 return *std::next(CurrentStage);
526}
527
537
542
543bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
544 SchedCandidate &TryCand,
545 SchedBoundary *Zone) const {
546 // Initialize the candidate if needed.
547 if (!Cand.isValid()) {
548 TryCand.Reason = NodeOrder;
549 return true;
550 }
551
552 // Avoid spilling by exceeding the register limit.
553 if (DAG->isTrackingPressure() &&
554 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
555 RegExcess, TRI, DAG->MF))
556 return TryCand.Reason != NoCand;
557
558 // Bias PhysReg defs and copies to their uses and definitions, respectively.
559 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
560 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
561 return TryCand.Reason != NoCand;
562
563 bool SameBoundary = Zone != nullptr;
564 if (SameBoundary) {
565 // Prioritize instructions that read unbuffered resources by stall cycles.
566 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
567 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
568 return TryCand.Reason != NoCand;
569
570 // Avoid critical resource consumption and balance the schedule.
571 TryCand.initResourceDelta(DAG, SchedModel);
572 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
573 TryCand, Cand, ResourceReduce))
574 return TryCand.Reason != NoCand;
575 if (tryGreater(TryCand.ResDelta.DemandedResources,
576 Cand.ResDelta.DemandedResources, TryCand, Cand,
577 ResourceDemand))
578 return TryCand.Reason != NoCand;
579
580 // Unconditionally try to reduce latency.
581 if (tryLatency(TryCand, Cand, *Zone))
582 return TryCand.Reason != NoCand;
583
584 // Weak edges are for clustering and other constraints.
585 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
586 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
587 return TryCand.Reason != NoCand;
588 }
589
590 // Keep clustered nodes together to encourage downstream peephole
591 // optimizations which may reduce resource requirements.
592 //
593 // This is a best effort to set things up for a post-RA pass. Optimizations
594 // like generating loads of multiple registers should ideally be done within
595 // the scheduler pass by combining the loads during DAG postprocessing.
596 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
597 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
598 bool CandIsClusterSucc =
599 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
600 bool TryCandIsClusterSucc =
601 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
602 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
603 Cluster))
604 return TryCand.Reason != NoCand;
605
606 // Avoid increasing the max critical pressure in the scheduled region.
607 if (DAG->isTrackingPressure() &&
608 tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
609 TryCand, Cand, RegCritical, TRI, DAG->MF))
610 return TryCand.Reason != NoCand;
611
612 // Avoid increasing the max pressure of the entire region.
613 if (DAG->isTrackingPressure() &&
614 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
615 Cand, RegMax, TRI, DAG->MF))
616 return TryCand.Reason != NoCand;
617
618 if (SameBoundary) {
619 // Fall through to original instruction order.
620 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
621 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
622 TryCand.Reason = NodeOrder;
623 return true;
624 }
625 }
626 return false;
627}
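// Illustrative note on the pattern above (a simplified sketch, not a full
// definition of the helpers): each tryLess/tryGreater/tryPressure/tryLatency
// call returns true as soon as the two candidates differ on that criterion,
// and TryCand.Reason is set to the deciding heuristic only when TryCand wins,
// which is why each caller returns TryCand.Reason != NoCand:
//
//   if (tryLess(TryVal, CandVal, TryCand, Cand, SomeReason))
//     return TryCand.Reason != NoCand; // decided at this priority level
//   // ...otherwise fall through to the next, lower-priority heuristic.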
628
634
635/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory instructions
636/// as much as possible. This is achieved by:
637/// 1. Prioritizing clustered operations before the stall latency heuristic.
638/// 2. Prioritizing long-latency loads before the stall latency heuristic.
639///
640/// \param Cand provides the policy and current best candidate.
641/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
642/// \param Zone describes the scheduled zone that we are extending, or nullptr
643/// if Cand is from a different zone than TryCand.
644/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
645bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
646 SchedCandidate &TryCand,
647 SchedBoundary *Zone) const {
648 // Initialize the candidate if needed.
649 if (!Cand.isValid()) {
650 TryCand.Reason = NodeOrder;
651 return true;
652 }
653
654 // Bias PhysReg defs and copies to their uses and definitions, respectively.
655 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
656 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
657 return TryCand.Reason != NoCand;
658
659 if (DAG->isTrackingPressure()) {
660 // Avoid exceeding the target's limit.
661 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
662 RegExcess, TRI, DAG->MF))
663 return TryCand.Reason != NoCand;
664
665 // Avoid increasing the max critical pressure in the scheduled region.
666 if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
667 TryCand, Cand, RegCritical, TRI, DAG->MF))
668 return TryCand.Reason != NoCand;
669 }
670
671 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
672 // get more benefit from clausing these memory instructions.
673 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
674 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
675 bool CandIsClusterSucc =
676 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
677 bool TryCandIsClusterSucc =
678 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
679 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
680 Cluster))
681 return TryCand.Reason != NoCand;
682
683 // We only compare a subset of features when comparing nodes between
684 // Top and Bottom boundary. Some properties are simply incomparable; in many
685 // other instances we should only override the other boundary if something
686 // is a clearly good pick on one boundary. Skip heuristics that are more
687 // "tie-breaking" in nature.
688 bool SameBoundary = Zone != nullptr;
689 if (SameBoundary) {
690 // For loops that are acyclic path limited, aggressively schedule for
691 // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
692 // heuristics to take precedence.
693 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
694 tryLatency(TryCand, Cand, *Zone))
695 return TryCand.Reason != NoCand;
696
697 // MaxMemoryClause-specific: Prioritize long latency memory load
698 // instructions in top-bottom order to hide more latency. The mayLoad check
699 // is used to exclude store-like instructions, which we do not want to
700 // schedule too early.
701 bool TryMayLoad =
702 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
703 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
704
705 if (TryMayLoad || CandMayLoad) {
706 bool TryLongLatency =
707 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
708 bool CandLongLatency =
709 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
710
711 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
712 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
713 Cand, Stall))
714 return TryCand.Reason != NoCand;
715 }
716 // Prioritize instructions that read unbuffered resources by stall cycles.
717 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
718 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
719 return TryCand.Reason != NoCand;
720 }
721
722 if (SameBoundary) {
723 // Weak edges are for clustering and other constraints.
724 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
725 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
726 return TryCand.Reason != NoCand;
727 }
728
729 // Avoid increasing the max pressure of the entire region.
730 if (DAG->isTrackingPressure() &&
731 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
732 Cand, RegMax, TRI, DAG->MF))
733 return TryCand.Reason != NoCand;
734
735 if (SameBoundary) {
736 // Avoid critical resource consumption and balance the schedule.
737 TryCand.initResourceDelta(DAG, SchedModel);
738 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
739 TryCand, Cand, ResourceReduce))
740 return TryCand.Reason != NoCand;
741 if (tryGreater(TryCand.ResDelta.DemandedResources,
742 Cand.ResDelta.DemandedResources, TryCand, Cand,
743 ResourceDemand))
744 return TryCand.Reason != NoCand;
745
746 // Avoid serializing long latency dependence chains.
747 // For acyclic path limited loops, latency was already checked above.
748 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
749 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
750 return TryCand.Reason != NoCand;
751
752 // Fall through to original instruction order.
753 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
754 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
755 TryCand.Reason = NodeOrder;
756 return true;
757 }
758 }
759
760 return false;
761}
762
763GCNScheduleDAGMILive::GCNScheduleDAGMILive(
764 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
765 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
766 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
767 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
768 RegionLiveOuts(this, /*IsLiveOut=*/true) {
769
770 // We want regions with a single MI to be scheduled so that we can reason
771 // about them correctly during scheduling stages that move MIs between regions
772 // (e.g., rematerialization).
774 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
775 if (RelaxedOcc) {
776 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
777 if (MinOccupancy != StartingOccupancy)
778 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
779 << ".\n");
780 }
781}
782
783std::unique_ptr<GCNSchedStage>
784GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
785 switch (SchedStageID) {
786 case GCNSchedStageID::OccInitialSchedule:
787 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
788 case GCNSchedStageID::UnclusteredHighRPReschedule:
789 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
790 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
791 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
792 case GCNSchedStageID::PreRARematerialize:
793 return std::make_unique<PreRARematStage>(SchedStageID, *this);
794 case GCNSchedStageID::ILPInitialSchedule:
795 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
796 case GCNSchedStageID::MemoryClauseInitialSchedule:
797 return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
798 *this);
799 }
800
801 llvm_unreachable("Unknown SchedStageID.");
802}
803
804void GCNScheduleDAGMILive::schedule() {
805 // Collect all scheduling regions. The actual scheduling is performed in
806 // GCNScheduleDAGMILive::finalizeSchedule.
807 Regions.push_back(std::pair(RegionBegin, RegionEnd));
808}
809
810GCNRegPressure
811GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
812 GCNDownwardRPTracker RPTracker(*LIS);
813 RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
814 &LiveIns[RegionIdx]);
815 return RPTracker.moveMaxPressure();
816}
817
818static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
819 MachineBasicBlock::iterator RegionEnd) {
820 auto REnd = RegionEnd == RegionBegin->getParent()->end()
821 ? std::prev(RegionEnd)
822 : RegionEnd;
823 return &*skipDebugInstructionsBackward(REnd, RegionBegin);
824}
825
826void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
827 const MachineBasicBlock *MBB) {
828 GCNDownwardRPTracker RPTracker(*LIS);
829
830 // If the block has only one successor then the live-ins of that successor
831 // are the live-outs of the current block. We can reuse the calculated live
832 // set if the successor will be sent to scheduling after the current block.
833
834 // However, due to a bug in LiveInterval analysis, two predecessors of the
835 // same successor block may end up with different lane bitmasks for a
836 // live-out register. Work around that by sticking to a one-to-one
837 // relationship, i.e. one predecessor with one successor block.
838 const MachineBasicBlock *OnlySucc = nullptr;
839 if (MBB->succ_size() == 1) {
840 auto *Candidate = *MBB->succ_begin();
841 if (!Candidate->empty() && Candidate->pred_size() == 1) {
842 SlotIndexes *Ind = LIS->getSlotIndexes();
843 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
844 OnlySucc = Candidate;
845 }
846 }
847
848 // Scheduler sends regions from the end of the block upwards.
849 size_t CurRegion = RegionIdx;
850 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
851 if (Regions[CurRegion].first->getParent() != MBB)
852 break;
853 --CurRegion;
854
855 auto I = MBB->begin();
856 auto LiveInIt = MBBLiveIns.find(MBB);
857 auto &Rgn = Regions[CurRegion];
858 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
859 if (LiveInIt != MBBLiveIns.end()) {
860 auto LiveIn = std::move(LiveInIt->second);
861 RPTracker.reset(*MBB->begin(), &LiveIn);
862 MBBLiveIns.erase(LiveInIt);
863 } else {
864 I = Rgn.first;
865 auto LRS = BBLiveInMap.lookup(NonDbgMI);
866#ifdef EXPENSIVE_CHECKS
867 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
868#endif
869 RPTracker.reset(*I, &LRS);
870 }
871
872 for (;;) {
873 I = RPTracker.getNext();
874
875 if (Regions[CurRegion].first == I || NonDbgMI == I) {
876 LiveIns[CurRegion] = RPTracker.getLiveRegs();
877 RPTracker.clearMaxPressure();
878 }
879
880 if (Regions[CurRegion].second == I) {
881 Pressure[CurRegion] = RPTracker.moveMaxPressure();
882 if (CurRegion-- == RegionIdx)
883 break;
884 auto &Rgn = Regions[CurRegion];
885 NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
886 }
887 RPTracker.advanceToNext();
888 RPTracker.advanceBeforeNext();
889 }
890
891 if (OnlySucc) {
892 if (I != MBB->end()) {
893 RPTracker.advanceToNext();
894 RPTracker.advance(MBB->end());
895 }
896 RPTracker.advanceBeforeNext();
897 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
898 }
899}
900
902GCNScheduleDAGMILive::getRegionLiveInMap() const {
903 assert(!Regions.empty());
904 std::vector<MachineInstr *> RegionFirstMIs;
905 RegionFirstMIs.reserve(Regions.size());
906 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
907 RegionFirstMIs.push_back(
908 &*skipDebugInstructionsForward(RegionBegin, RegionEnd));
909
910 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
911}
912
914GCNScheduleDAGMILive::getRegionLiveOutMap() const {
915 assert(!Regions.empty());
916 std::vector<MachineInstr *> RegionLastMIs;
917 RegionLastMIs.reserve(Regions.size());
918 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
919 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
920
921 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
922}
923
925 IdxToInstruction.clear();
926
927 RegionLiveRegMap =
928 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
929 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
930 MachineInstr *RegionKey =
931 IsLiveOut
932 ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
933 : &*DAG->Regions[I].first;
934 IdxToInstruction[I] = RegionKey;
935 }
936}
937
939 // Start actual scheduling here. This function is called by the base
940 // MachineScheduler after all regions have been recorded by
941 // GCNScheduleDAGMILive::schedule().
942 LiveIns.resize(Regions.size());
943 Pressure.resize(Regions.size());
944 RegionsWithHighRP.resize(Regions.size());
945 RegionsWithExcessRP.resize(Regions.size());
946 RegionsWithIGLPInstrs.resize(Regions.size());
947 RegionsWithHighRP.reset();
948 RegionsWithExcessRP.reset();
949 RegionsWithIGLPInstrs.reset();
950
951 runSchedStages();
952}
953
954void GCNScheduleDAGMILive::runSchedStages() {
955 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
956
957 if (!Regions.empty()) {
958 BBLiveInMap = getRegionLiveInMap();
959 if (GCNTrackers)
960 RegionLiveOuts.buildLiveRegMap();
961 }
962
963 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
964 while (S.advanceStage()) {
965 auto Stage = createSchedStage(S.getCurrentStage());
966 if (!Stage->initGCNSchedStage())
967 continue;
968
969 for (auto Region : Regions) {
970 RegionBegin = Region.first;
971 RegionEnd = Region.second;
972 // Setup for scheduling the region and check whether it should be skipped.
973 if (!Stage->initGCNRegion()) {
974 Stage->advanceRegion();
975 exitRegion();
976 continue;
977 }
978
979 if (GCNTrackers) {
980 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
981 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
982 GCNRPTracker::LiveRegSet *RegionLiveIns =
983 &LiveIns[Stage->getRegionIdx()];
984
985 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
986 ->reset(MRI, *RegionLiveIns);
987 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
988 ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
989 Stage->getRegionIdx()));
990 }
991
993 Stage->finalizeGCNRegion();
994 }
995
996 Stage->finalizeGCNSchedStage();
997 }
998}
999
1000#ifndef NDEBUG
1002 switch (StageID) {
1003 case GCNSchedStageID::OccInitialSchedule:
1004 OS << "Max Occupancy Initial Schedule";
1005 break;
1006 case GCNSchedStageID::UnclusteredHighRPReschedule:
1007 OS << "Unclustered High Register Pressure Reschedule";
1008 break;
1009 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
1010 OS << "Clustered Low Occupancy Reschedule";
1011 break;
1012 case GCNSchedStageID::PreRARematerialize:
1013 OS << "Pre-RA Rematerialize";
1014 break;
1015 case GCNSchedStageID::ILPInitialSchedule:
1016 OS << "Max ILP Initial Schedule";
1017 break;
1018 case GCNSchedStageID::MemoryClauseInitialSchedule:
1019 OS << "Max memory clause Initial Schedule";
1020 break;
1021 }
1022
1023 return OS;
1024}
1025#endif
1026
1030
1032 if (!DAG.LIS)
1033 return false;
1034
1035 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1036 return true;
1037}
1038
1041 return false;
1042
1044 return false;
1045
1046 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1047 return false;
1048
1049 SavedMutations.swap(DAG.Mutations);
1050 DAG.addMutation(
1052
1053 InitialOccupancy = DAG.MinOccupancy;
1054 // Aggressively try to reduce register pressure in the unclustered high RP
1055 // stage. Temporarily increase the occupancy target in the region.
1056 S.SGPRLimitBias = S.HighRPSGPRBias;
1057 S.VGPRLimitBias = S.HighRPVGPRBias;
1058 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1059 MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
1060
1061 LLVM_DEBUG(
1062 dbgs()
1063 << "Retrying function scheduling without clustering. "
1064 "Aggressivly try to reduce register pressure to achieve occupancy "
1065 << DAG.MinOccupancy << ".\n");
1066
1067 return true;
1068}
1069
1072 return false;
1073
1075 return false;
1076
1077 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1078 // been dropped. All regions will have already been scheduled with the ideal
1079 // occupancy targets.
1080 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1081 return false;
1082
1083 LLVM_DEBUG(
1084 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1085 << DAG.MinOccupancy << ".\n");
1086 return true;
1087}
1088
1089/// Allows to easily filter for this stage's debug output.
1090#define REMAT_PREFIX "[PreRARemat] "
1091#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1092
1094 // FIXME: This pass will invalidate the cached BBLiveInMap and MBBLiveIns for
1095 // regions in between the defs and the region we sank the def to. This will
1096 // need to be fixed if there is another pass after this one.
1097 assert(!S.hasNextStage());
1098
1099 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
1100 return false;
1101
1102 // Before performing any IR modification record the parent region of each MI
1103 // and the parent MBB of each region.
1104 const unsigned NumRegions = DAG.Regions.size();
1105 RegionBB.reserve(NumRegions);
1106 for (unsigned I = 0; I < NumRegions; ++I) {
1107 RegionBoundaries Region = DAG.Regions[I];
1108 for (auto MI = Region.first; MI != Region.second; ++MI)
1109 MIRegion.insert({&*MI, I});
1110 RegionBB.push_back(Region.first->getParent());
1111 }
1112
1113 if (!canIncreaseOccupancyOrReduceSpill())
1114 return false;
1115
1116 // Rematerialize identified instructions and update scheduler's state.
1117 rematerialize();
1118 if (GCNTrackers)
1119 DAG.RegionLiveOuts.buildLiveRegMap();
1120 REMAT_DEBUG({
1121 dbgs() << "Retrying function scheduling with new min. occupancy of "
1122 << AchievedOcc << " from rematerializing (original was "
1123 << DAG.MinOccupancy;
1124 if (TargetOcc)
1125 dbgs() << ", target was " << *TargetOcc;
1126 dbgs() << ")\n";
1127 });
1128
1129 if (AchievedOcc > DAG.MinOccupancy) {
1130 DAG.MinOccupancy = AchievedOcc;
1132 MFI.increaseOccupancy(MF, DAG.MinOccupancy);
1133 }
1134 return true;
1135}
1136
1138 DAG.finishBlock();
1139 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1140}
1141
1143 SavedMutations.swap(DAG.Mutations);
1144 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1145 if (DAG.MinOccupancy > InitialOccupancy) {
1147 << " stage successfully increased occupancy to "
1148 << DAG.MinOccupancy << '\n');
1149 }
1150
1152}
1153
1155 // Check whether this new region is also a new block.
1156 if (DAG.RegionBegin->getParent() != CurrentMBB)
1157 setupNewBlock();
1158
1159 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1160 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1161
1162 // Skip empty scheduling regions (0 or 1 schedulable instructions).
1163 if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
1164 return false;
1165
1166 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1167 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1168 << " " << CurrentMBB->getName()
1169 << "\n From: " << *DAG.begin() << " To: ";
1170 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1171 else dbgs() << "End";
1172 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1173
1174 // Save original instruction order before scheduling for possible revert.
1175 Unsched.clear();
1176 Unsched.reserve(DAG.NumRegionInstrs);
1179 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1180 for (auto &I : DAG) {
1181 Unsched.push_back(&I);
1182 if (SII->isIGLPMutationOnly(I.getOpcode()))
1183 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1184 }
1185 } else {
1186 for (auto &I : DAG)
1187 Unsched.push_back(&I);
1188 }
1189
1190 PressureBefore = DAG.Pressure[RegionIdx];
1191
1192 LLVM_DEBUG(
1193 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1194 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1195 << "Region live-in pressure: "
1196 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1197 << "Region register pressure: " << print(PressureBefore));
1198
1199 S.HasHighPressure = false;
1200 S.KnownExcessRP = isRegionWithExcessRP();
1201
1202 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1204 SavedMutations.clear();
1205 SavedMutations.swap(DAG.Mutations);
1206 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1208 DAG.addMutation(createIGroupLPDAGMutation(
1209 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1211 }
1212
1213 return true;
1214}
1215
1217 // Only reschedule regions that have excess register pressure (i.e. spilling)
1218 // or had minimum occupancy at the beginning of the stage (as long as
1219 // rescheduling of previous regions did not make occupancy drop back down to
1220 // the initial minimum).
1221 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1222 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1223 (DAG.MinOccupancy <= InitialOccupancy ||
1224 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1225 InitialOccupancy))
1226 return false;
1227
1229}
1230
1232 // We may need to reschedule this region if it wasn't rescheduled in the last
1233 // stage, or if we found it was testing critical register pressure limits in
1234 // the unclustered reschedule stage. The latter is because we may not have been
1235 // able to raise the min occupancy in the previous stage so the region may be
1236 // overly constrained even if it was already rescheduled.
1237 if (!DAG.RegionsWithHighRP[RegionIdx])
1238 return false;
1239
1241}
1242
1244 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1245}
1246
1248 if (CurrentMBB)
1249 DAG.finishBlock();
1250
1251 CurrentMBB = DAG.RegionBegin->getParent();
1252 DAG.startBlock(CurrentMBB);
1253 // Get the real RP for the region if it hasn't been calculated before. After
1254 // the initial schedule stage, real RP will be collected after scheduling.
1258 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1259}
1260
1262 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1263 if (S.HasHighPressure)
1264 DAG.RegionsWithHighRP[RegionIdx] = true;
1265
1266 // Revert scheduling if we have dropped occupancy or there is some other
1267 // reason that the original schedule is better.
1269
1270 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1272 SavedMutations.swap(DAG.Mutations);
1273
1274 DAG.exitRegion();
1275 advanceRegion();
1276}
1277
1279 // Check the results of scheduling.
1280 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1281
1282 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1283 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1284
1285 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1286
1287 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1288 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1289 DAG.Pressure[RegionIdx] = PressureAfter;
1290
1291 // Early out if we have achieved the occupancy target.
1292 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1293 return;
1294 }
1295
1296 unsigned TargetOccupancy = std::min(
1297 S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
1298 unsigned WavesAfter = std::min(
1299 TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1300 unsigned WavesBefore = std::min(
1301 TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1302 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1303 << ", after " << WavesAfter << ".\n");
1304
1305 // We may not be able to keep the current target occupancy because of the just
1306 // scheduled region. We might still be able to revert scheduling if the
1307 // occupancy before was higher, or if the current schedule has register
1308 // pressure higher than the excess limits which could lead to more spilling.
1309 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1310
1311 // Allow memory bound functions to drop to 4 waves if not limited by an
1312 // attribute.
1313 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1314 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1315 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1316 << MFI.getMinAllowedOccupancy() << " waves\n");
1317 NewOccupancy = WavesAfter;
1318 }
1319
1320 if (NewOccupancy < DAG.MinOccupancy) {
1321 DAG.MinOccupancy = NewOccupancy;
1322 MFI.limitOccupancy(DAG.MinOccupancy);
1323 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1324 << DAG.MinOccupancy << ".\n");
1325 }
1326 // The maximum number of arch VGPRs on a non-unified register file, or the
1327 // maximum VGPRs + AGPRs in the unified register file case.
1328 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1329 // The maximum number of arch VGPR for both unified and non-unified register
1330 // file.
1331 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1332 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1333
1334 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1335 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1336 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1337 PressureAfter.getSGPRNum() > MaxSGPRs) {
1338 DAG.RegionsWithHighRP[RegionIdx] = true;
1339 DAG.RegionsWithExcessRP[RegionIdx] = true;
1340 }
1341
1342 // Revert if this region's schedule would cause a drop in occupancy or
1343 // spilling.
1344 if (shouldRevertScheduling(WavesAfter))
1346 else
1347 DAG.Pressure[RegionIdx] = PressureAfter;
1348}
1349
1350unsigned
1351GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1352 DenseMap<unsigned, unsigned> &ReadyCycles,
1353 const TargetSchedModel &SM) {
1354 unsigned ReadyCycle = CurrCycle;
1355 for (auto &D : SU.Preds) {
1356 if (D.isAssignedRegDep()) {
1357 MachineInstr *DefMI = D.getSUnit()->getInstr();
1358 unsigned Latency = SM.computeInstrLatency(DefMI);
1359 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1360 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1361 }
1362 }
1363 ReadyCycles[SU.NodeNum] = ReadyCycle;
1364 return ReadyCycle;
1365}
1366
1367#ifndef NDEBUG
1369 bool operator()(std::pair<MachineInstr *, unsigned> A,
1370 std::pair<MachineInstr *, unsigned> B) const {
1371 return A.second < B.second;
1372 }
1373};
1374
1375static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1376 EarlierIssuingCycle> &ReadyCycles) {
1377 if (ReadyCycles.empty())
1378 return;
1379 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1380 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1381 << " ##################\n# Cycle #\t\t\tInstruction "
1382 " "
1383 " \n";
1384 unsigned IPrev = 1;
1385 for (auto &I : ReadyCycles) {
1386 if (I.second > IPrev + 1)
1387 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1388 << " CYCLES DETECTED ******************************\n\n";
1389 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1390 IPrev = I.second;
1391 }
1392}
1393#endif
1394
1396GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1397#ifndef NDEBUG
1398 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1399 ReadyCyclesSorted;
1400#endif
1401 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1402 unsigned SumBubbles = 0;
1403 DenseMap<unsigned, unsigned> ReadyCycles;
1404 unsigned CurrCycle = 0;
1405 for (auto &SU : InputSchedule) {
1406 unsigned ReadyCycle =
1407 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1408 SumBubbles += ReadyCycle - CurrCycle;
1409#ifndef NDEBUG
1410 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1411#endif
1412 CurrCycle = ++ReadyCycle;
1413 }
1414#ifndef NDEBUG
1415 LLVM_DEBUG(
1416 printScheduleModel(ReadyCyclesSorted);
1417 dbgs() << "\n\t"
1418 << "Metric: "
1419 << (SumBubbles
1420 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1421 : 1)
1422 << "\n\n");
1423#endif
1424
1425 return ScheduleMetrics(CurrCycle, SumBubbles);
1426}
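// Worked example with illustrative numbers, following the formula in the
// LLVM_DEBUG output above: if the modelled schedule finishes at CurrCycle = 200
// with SumBubbles = 50 stall cycles, the reported metric is
// (50 * ScaleFactor) / 200 = (50 * 100) / 200 = 25; fewer bubbles per cycle
// yields a lower (better) metric.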
1427
1430#ifndef NDEBUG
1431 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1432 ReadyCyclesSorted;
1433#endif
1434 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1435 unsigned SumBubbles = 0;
1436 DenseMap<unsigned, unsigned> ReadyCycles;
1437 unsigned CurrCycle = 0;
1438 for (auto &MI : DAG) {
1439 SUnit *SU = DAG.getSUnit(&MI);
1440 if (!SU)
1441 continue;
1442 unsigned ReadyCycle =
1443 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1444 SumBubbles += ReadyCycle - CurrCycle;
1445#ifndef NDEBUG
1446 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1447#endif
1448 CurrCycle = ++ReadyCycle;
1449 }
1450#ifndef NDEBUG
1451 LLVM_DEBUG(
1452 printScheduleModel(ReadyCyclesSorted);
1453 dbgs() << "\n\t"
1454 << "Metric: "
1455 << (SumBubbles
1456 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1457 : 1)
1458 << "\n\n");
1459#endif
1460
1461 return ScheduleMetrics(CurrCycle, SumBubbles);
1462}
1463
1464bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1465 if (WavesAfter < DAG.MinOccupancy)
1466 return true;
1467
1468 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1469 if (DAG.MFI.isDynamicVGPREnabled()) {
1470 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1471 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1472 PressureBefore.getVGPRNum(false));
1473 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1474 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1475 PressureAfter.getVGPRNum(false));
1476 if (BlocksAfter > BlocksBefore)
1477 return true;
1478 }
1479
1480 return false;
1481}
1482
1485 return false;
1486
1488 return true;
1489
1490 if (mayCauseSpilling(WavesAfter))
1491 return true;
1492
1493 return false;
1494}
1495
1497 // If RP is not reduced in the unclustered reschedule stage, revert to the
1498 // old schedule.
1499 if ((WavesAfter <=
1500 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
1501 mayCauseSpilling(WavesAfter)) ||
1503 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1504 return true;
1505 }
1506
1507 // Do not attempt to relax schedule even more if we are already spilling.
1509 return false;
1510
1511 LLVM_DEBUG(
1512 dbgs()
1513 << "\n\t *** In shouldRevertScheduling ***\n"
1514 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
1515 ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
1516 LLVM_DEBUG(
1517 dbgs()
1518 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
1520 unsigned OldMetric = MBefore.getMetric();
1521 unsigned NewMetric = MAfter.getMetric();
1522 unsigned WavesBefore = std::min(
1523 S.getTargetOccupancy(),
1524 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
1525 unsigned Profit =
1526 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1527 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1528 NewMetric) /
1529 ScheduleMetrics::ScaleFactor;
1530 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1531 << MAfter << "Profit: " << Profit << "\n");
1532 return Profit < ScheduleMetrics::ScaleFactor;
1533}
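// Worked example with illustrative numbers and the formula as written above:
// with WavesAfter = 8, WavesBefore = 7, OldMetric = 40, NewMetric = 60, and the
// default metric bias of 10, integer evaluation gives
//   (((8 * 100) / 7) * ((40 + 10) * 100) / 60) / 100 = (114 * 5000 / 60) / 100 = 95,
// which is below ScaleFactor (100), so the unclustered reschedule is reverted.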
1534
1537 return false;
1538
1540 return true;
1541
1542 if (mayCauseSpilling(WavesAfter))
1543 return true;
1544
1545 return false;
1546}
1547
1549 return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
1550 mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
1551}
1552
1554 if (mayCauseSpilling(WavesAfter))
1555 return true;
1556
1557 return false;
1558}
1559
1561 unsigned WavesAfter) {
1562 return mayCauseSpilling(WavesAfter);
1563}
1564
1565bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1566 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1568 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1569 return true;
1570 }
1571
1572 return false;
1573}
1574
1576 LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1577 DAG.RegionEnd = DAG.RegionBegin;
1578 int SkippedDebugInstr = 0;
1579 for (MachineInstr *MI : Unsched) {
1580 if (MI->isDebugInstr()) {
1581 ++SkippedDebugInstr;
1582 continue;
1583 }
1584
1585 if (MI->getIterator() != DAG.RegionEnd) {
1586 DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI);
1587 if (!MI->isDebugInstr())
1588 DAG.LIS->handleMove(*MI, true);
1589 }
1590
1591 // Reset read-undef flags and update them later.
1592 for (auto &Op : MI->all_defs())
1593 Op.setIsUndef(false);
1594 RegisterOperands RegOpers;
1595 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1596 if (!MI->isDebugInstr()) {
1597 if (DAG.ShouldTrackLaneMasks) {
1598 // Adjust liveness and add missing dead+read-undef flags.
1599 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1600 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1601 } else {
1602 // Adjust for missing dead-def flags.
1603 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1604 }
1605 }
1606 DAG.RegionEnd = MI->getIterator();
1607 ++DAG.RegionEnd;
1608 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1609 }
1610
1611 // After reverting schedule, debug instrs will now be at the end of the block
1612 // and RegionEnd will point to the first debug instr. Increment RegionEnd
1613 // past debug instrs to the actual end of the scheduling region.
1614 while (SkippedDebugInstr-- > 0)
1615 ++DAG.RegionEnd;
1616
1617 // If Unsched.front() instruction is a debug instruction, this will actually
1618 // shrink the region since we moved all debug instructions to the end of the
1619 // block. Find the first instruction that is not a debug instruction.
1620 DAG.RegionBegin = Unsched.front()->getIterator();
1621 if (DAG.RegionBegin->isDebugInstr()) {
1622 for (MachineInstr *MI : Unsched) {
1623 if (MI->isDebugInstr())
1624 continue;
1625 DAG.RegionBegin = MI->getIterator();
1626 break;
1627 }
1628 }
1629
1630 // Then move the debug instructions back into their correct place and set
1631 // RegionBegin and RegionEnd if needed.
1632 DAG.placeDebugValues();
1633
1634 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1635}
1636
1637bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
1638 const Function &F = MF.getFunction();
1639
1640 // Maps optimizable regions (i.e., regions at minimum and register-limited
1641 // occupancy, or regions with spilling) to the target RP we would like to
1642 // reach.
1644 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
1645 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
1646 auto ResetTargetRegions = [&]() {
1647 OptRegions.clear();
1648 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1649 const GCNRegPressure &RP = DAG.Pressure[I];
1650 GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
1651 if (!Target.satisfied())
1652 OptRegions.insert({I, Target});
1653 }
1654 };
1655
1656 ResetTargetRegions();
1657 if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
1658 // In addition to register usage being above addressable limits, occupancy
1659 // below the minimum is considered like "spilling" as well.
1660 TargetOcc = std::nullopt;
1661 } else {
1662 // There is no spilling and room to improve occupancy; set up "increased
1663 // occupancy targets" for all regions.
1664 TargetOcc = DAG.MinOccupancy + 1;
1665 unsigned VGPRBlockSize =
1666 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
1667 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
1668 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
1669 ResetTargetRegions();
1670 }
1671 REMAT_DEBUG({
1672 dbgs() << "Analyzing ";
1673 MF.getFunction().printAsOperand(dbgs(), false);
1674 dbgs() << ": ";
1675 if (OptRegions.empty()) {
1676 dbgs() << "no objective to achieve, occupancy is maximal at "
1677 << MFI.getMaxWavesPerEU();
1678 } else if (!TargetOcc) {
1679 dbgs() << "reduce spilling (minimum target occupancy is "
1680 << MFI.getMinWavesPerEU() << ')';
1681 } else {
1682 dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
1683 << TargetOcc;
1684 }
1685 dbgs() << '\n';
1686 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1687 if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
1688 dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
1689 << '\n';
1690 }
1691 }
1692 });
1693 if (OptRegions.empty())
1694 return false;
1695
1696 // Accounts for a reduction in RP in an optimizable region. Returns whether we
1697 // estimate that we have identified enough rematerialization opportunities to
1698 // achieve our goal, and sets Progress to true when this particular reduction
1699 // in pressure was helpful toward that goal.
1700 auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
1701 bool &Progress) -> bool {
1702 GCNRPTarget &Target = OptIt->getSecond();
1703 if (!Target.isSaveBeneficial(Reg))
1704 return false;
1705 Progress = true;
1706 Target.saveReg(Reg, Mask, DAG.MRI);
1707 if (Target.satisfied())
1708 OptRegions.erase(OptIt->getFirst());
1709 return OptRegions.empty();
1710 };
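// Regions whose target becomes satisfied are dropped from OptRegions, so the
// candidate search below can stop as soon as every target has been met.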
1711
1712 // We need up-to-date live-out info to query live-out register masks in
1713 // regions containing rematerializable instructions.
1714 DAG.RegionLiveOuts.buildLiveRegMap();
1715
1716 // Cache set of registers that are going to be rematerialized.
1717 DenseSet<unsigned> RematRegs;
1718
1719 // Identify rematerializable instructions in the function.
1720 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1721 auto Region = DAG.Regions[I];
1722 for (auto MI = Region.first; MI != Region.second; ++MI) {
1723 // The instruction must be rematerializable.
1724 MachineInstr &DefMI = *MI;
1725 if (!isReMaterializable(DefMI))
1726 continue;
1727
1728 // We only support rematerializing virtual registers with one definition.
1729 Register Reg = DefMI.getOperand(0).getReg();
1730 if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
1731 continue;
1732
1733 // We only care to rematerialize the instruction if it has a single
1734 // non-debug user in a different region. The using MI may not belong to a
1735 // region if it is a lone region terminator.
1736 MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
1737 if (!UseMI)
1738 continue;
1739 auto UseRegion = MIRegion.find(UseMI);
1740 if (UseRegion != MIRegion.end() && UseRegion->second == I)
1741 continue;
1742
1743 // Do not rematerialize an instruction if it uses or is used by an
1744 // instruction that we have designated for rematerialization.
1745 // FIXME: Allow for rematerialization chains: this requires 1. updating
1746 // remat points to account for uses that are rematerialized, and 2. either
1747 // rematerializing the candidates in careful ordering, or deferring the
1748 // MBB RP walk until the entire chain has been rematerialized.
1749 if (Rematerializations.contains(UseMI) ||
1750 llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
1751 return MO.isReg() && RematRegs.contains(MO.getReg());
1752 }))
1753 continue;
1754
1755 // Do not rematerialize an instruction if it uses registers that aren't
1756 // available at its use. This ensures that we are not extending any live
1757 // range while rematerializing.
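// For example, if one of the candidate's source operands no longer holds the
// same value (or is dead) at the use point, rematerializing there would
// require extending that operand's live range.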
1758 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
1759 if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
1760 *DAG.TII))
1761 continue;
1762
1763 REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
1764 RematInstruction &Remat =
1765 Rematerializations.try_emplace(&DefMI, UseMI).first->second;
1766
1767 bool RematUseful = false;
1768 if (auto It = OptRegions.find(I); It != OptRegions.end()) {
1769 // Optimistically consider that moving the instruction out of its
1770 // defining region will reduce RP in the latter; this assumes that
1771 // maximum RP in the region is reached somewhere between the defining
1772 // instruction and the end of the region.
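// This optimism is verified later: rematerialize() recomputes RP where needed
// and finalizeGCNSchedStage() rolls everything back if the occupancy target
// was not actually reached.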
1773 REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
1774 LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
1775 if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
1776 return true;
1777 }
1778
1779 for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
1780 // We are only collecting regions in which the register is a live-in
1781 // (and may be live-through).
1782 auto It = DAG.LiveIns[LIRegion].find(Reg);
1783 if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
1784 continue;
1785 Remat.LiveInRegions.insert(LIRegion);
1786
1787 // Account for the reduction in RP due to the rematerialization in an
1788 // optimizable region in which the defined register is a live-in. This
1789 // is exact for live-through regions but optimistic in the using region,
1790 // where RP is actually reduced only if maximum RP is reached somewhere
1791 // between the beginning of the region and the rematerializable
1792 // instruction's use.
1793 if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
1794 REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
1795 if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
1796 RematUseful))
1797 return true;
1798 }
1799 }
1800
1801 // If rematerializing the instruction would not reduce RP in any optimizable
1802 // region then there is no point in doing it.
1803 if (!RematUseful) {
1804 Rematerializations.pop_back();
1805 REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
1806 } else {
1807 RematRegs.insert(Reg);
1808 }
1809 }
1810 }
1811
1812 if (TargetOcc) {
1813 // We were trying to increase occupancy but failed, abort the stage.
1814 REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
1815 Rematerializations.clear();
1816 return false;
1817 }
1818 REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
1819 return !Rematerializations.empty();
1820}
1821
1822void PreRARematStage::rematerialize() {
1823 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
1824
1825 // Collect regions whose RP changes in an unpredictable way; we will have to
1826 // fully recompute their RP after all rematerializations.
1827 DenseSet<unsigned> RecomputeRP;
1828
1829 // Rematerialize all instructions.
1830 for (auto &[DefMI, Remat] : Rematerializations) {
1831 MachineBasicBlock::iterator InsertPos(Remat.UseMI);
1832 Register Reg = DefMI->getOperand(0).getReg();
1833 unsigned DefRegion = MIRegion.at(DefMI);
1834
1835 // Rematerialize DefMI to its use block.
1836 TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
1837 AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
1838 Remat.RematMI = &*std::prev(InsertPos);
1839 DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
1840
1841 // Update region boundaries in regions we sank from (remove defining MI)
1842 // and to (insert MI rematerialized in use block). Only then can we erase
1843 // the original MI.
1844 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
1845 auto UseRegion = MIRegion.find(Remat.UseMI);
1846 if (UseRegion != MIRegion.end()) {
1847 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
1848 Remat.RematMI);
1849 }
1850 DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
1851 DefMI->eraseFromParent();
1852
1853 // Collect all regions impacted by the rematerialization and update their
1854 // live-in/RP information.
1855 for (unsigned I : Remat.LiveInRegions) {
1856 ImpactedRegions.insert({I, DAG.Pressure[I]});
1857 GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
1858
1859#ifdef EXPENSIVE_CHECKS
1860 // All uses are known to be available / live at the remat point. Thus, the
1861 // uses should already be live-in to the region.
1862 for (MachineOperand &MO : DefMI->operands()) {
1863 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1864 continue;
1865
1866 Register UseReg = MO.getReg();
1867 if (!UseReg.isVirtual())
1868 continue;
1869
1870 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
1871 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
1872 if (LI.hasSubRanges() && MO.getSubReg())
1873 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
1874
1875 LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
1876 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
1877 // If this register has lanes not covered by the LiveIns, be sure they
1878 // do not map to any subrange. ref:
1879 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
1880 if (UncoveredLanes.any()) {
1881 assert(LI.hasSubRanges());
1882 for (LiveInterval::SubRange &SR : LI.subranges())
1883 assert((SR.LaneMask & UncoveredLanes).none());
1884 }
1885 }
1886#endif
1887
1888 // The register is no longer a live-in in any region except the one that
1889 // contains the single use. In live-through regions, maximum register
1890 // pressure decreases predictably so we can directly update it. In the
1891 // using region, maximum RP may or may not decrease, so we will mark it
1892 // for re-computation after all materializations have taken place.
1893 LaneBitmask PrevMask = RegionLiveIns[Reg];
1894 RegionLiveIns.erase(Reg);
1895 RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
1896 if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
1897 DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
1898 else
1899 RecomputeRP.insert(I);
1900 }
1901 // RP in the region from which the instruction was rematerialized may or may
1902 // not decrease.
1903 ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
1904 RecomputeRP.insert(DefRegion);
1905
1906 // Recompute live interval to reflect the register's rematerialization.
1907 Register RematReg = Remat.RematMI->getOperand(0).getReg();
1908 DAG.LIS->removeInterval(RematReg);
1909 DAG.LIS->createAndComputeVirtRegInterval(RematReg);
1910 }
1911
1912 // All regions impacted by at least one rematerialization must be rescheduled.
1913 // Maximum pressure must also be recomputed for all regions where it changed
1914 // unpredictably and checked against the target occupancy.
1915 unsigned DynamicVGPRBlockSize =
1916 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
1917 AchievedOcc = MFI.getMaxWavesPerEU();
1918 for (auto &[I, OriginalRP] : ImpactedRegions) {
1919 bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
1920 RescheduleRegions[I] = !IsEmptyRegion;
1921 if (!RecomputeRP.contains(I))
1922 continue;
1923
1924 GCNRegPressure RP;
1925 if (IsEmptyRegion) {
1926 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
1927 } else {
1928 GCNDownwardRPTracker RPT(*DAG.LIS);
1929 auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
1930 DAG.Regions[I].second);
1931 if (NonDbgMI == DAG.Regions[I].second) {
1932 // Region is non-empty but contains only debug instructions.
1933 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
1934 } else {
1935 RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
1936 RPT.advance(DAG.Regions[I].second);
1937 RP = RPT.moveMaxPressure();
1938 }
1939 }
1940 DAG.Pressure[I] = RP;
1941 AchievedOcc =
1942 std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
1943 }
1944 REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
1945}
1946
1947// Copied from MachineLICM
1948bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
1949 if (!DAG.TII->isReMaterializable(MI))
1950 return false;
1951
1952 for (const MachineOperand &MO : MI.all_uses()) {
1953 // We can't remat physreg uses, unless the register is a constant or the
1954 // use is ignorable (e.g. the implicit exec use on VALU instructions).
1955 if (MO.getReg().isPhysical()) {
1956 if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
1957 continue;
1958 return false;
1959 }
1960 }
1961
1962 return true;
1963}
1964
1965 void PreRARematStage::finalizeGCNSchedStage() {
1966 // We consider that reducing spilling is always beneficial, so we never
1967 // roll back rematerializations in such cases. It's also possible that
1968 // rescheduling lowers occupancy below the one achieved just through remats,
1969 // in which case we do not want to roll back either (the rescheduling was
1970 // already reverted in PreRARematStage::shouldRevertScheduling in such cases).
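// In other words, rematerializations are only rolled back when the stage was
// trying to gain a wave of occupancy and neither the remats nor the
// rescheduling achieved it.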
1971 unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
1972 if (!TargetOcc || MaxOcc >= *TargetOcc)
1973 return;
1974
1975 REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
1976 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
1977
1978 // Rollback the rematerializations.
1979 for (const auto &[DefMI, Remat] : Rematerializations) {
1980 MachineInstr &RematMI = *Remat.RematMI;
1981 unsigned DefRegion = MIRegion.at(DefMI);
1982 MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
1983 MachineBasicBlock *MBB = RegionBB[DefRegion];
1984 Register Reg = RematMI.getOperand(0).getReg();
1985
1986 // Re-rematerialize MI at the end of its original region. Note that it may
1987 // not be placed at exactly its original position within the region, but
1988 // this should not matter much.
1989 TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI,
1990 *DAG.TRI);
1991 MachineInstr *NewMI = &*std::prev(InsertPos);
1992 DAG.LIS->InsertMachineInstrInMaps(*NewMI);
1993
1994 auto UseRegion = MIRegion.find(Remat.UseMI);
1995 if (UseRegion != MIRegion.end()) {
1996 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
1997 nullptr);
1998 }
1999 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
2000
2001 // Erase rematerialized MI.
2002 DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
2003 RematMI.eraseFromParent();
2004
2005 // Recompute live interval for the re-rematerialized register
2006 DAG.LIS->removeInterval(Reg);
2007 DAG.LIS->createAndComputeVirtRegInterval(Reg);
2008
2009 // Re-add the register as a live-in in all regions it used to be one in.
2010 for (unsigned LIRegion : Remat.LiveInRegions)
2011 DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
2012 }
2013
2014 // Reset RP in all impacted regions.
2015 for (auto &[I, OriginalRP] : ImpactedRegions)
2016 DAG.Pressure[I] = OriginalRP;
2017
2018 GCNSchedStage::finalizeGCNSchedStage();
2019}
2020
2021void GCNScheduleDAGMILive::updateRegionBoundaries(
2022 RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
2023 MachineInstr *NewMI) {
2024 assert((!NewMI || NewMI != RegionBounds.second) &&
2025 "cannot remove at region end");
2026
2027 if (RegionBounds.first == RegionBounds.second) {
2028 assert(NewMI && "cannot remove from an empty region");
2029 RegionBounds.first = NewMI;
2030 return;
2031 }
2032
2033 // We only care about modifications at the beginning of a non-empty region since
2034 // the upper region boundary is exclusive.
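// E.g., for a region [A, B): erasing A shifts the lower bound to the
// instruction after A, and inserting NewMI immediately before A makes NewMI
// the new lower bound; B never needs adjusting since it is exclusive.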
2035 if (MI != RegionBounds.first)
2036 return;
2037 if (!NewMI)
2038 RegionBounds.first = std::next(MI); // Removal
2039 else
2040 RegionBounds.first = NewMI; // Insertion
2041}
2042
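// Returns whether the DAG's current range contains IGLP-mutation-only
// instructions (e.g. SCHED_GROUP_BARRIER); if so, the IGroupLP mutation must
// be applied when scheduling the region.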
2043 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
2044 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
2045 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
2046 return SII->isIGLPMutationOnly(MI->getOpcode());
2047 });
2048}
2049
2050 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
2051 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
2052 bool RemoveKillFlags)
2053 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
2054
2055 void GCNPostScheduleDAGMILive::schedule() {
2056 HasIGLPInstrs = hasIGLPInstrs(this);
2057 if (HasIGLPInstrs) {
2058 SavedMutations.clear();
2059 SavedMutations.swap(Mutations);
2060 addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
2061 }
2062
2063 ScheduleDAGMI::schedule();
2064}
2065
2066 void GCNPostScheduleDAGMILive::finalizeSchedule() {
2067 if (HasIGLPInstrs)
2068 SavedMutations.swap(Mutations);
2069
2070 ScheduleDAGMI::finalizeSchedule();
2071}