Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit cf60af8

Browse files
authored
[AMDGPU][Scheduler] Revert all regions when remat fails to increase occ. (llvm#177205)
When the rematerialization stage fails to increase occupancy in all regions, the current implementation only reverts the effect of re-scheduling in regions in which the increased occupancy target could not be achieved. However, given that re-scheduling with a higher occupancy target puts more pressure on the scheduler to achieve lower maximum RP at the cost of potentially lower ILP as well, region schedules made with higher occupancy targets are generally less desirable if the whole function is not able to meet that target. Therefore, if at least one region cannot reach its target, it makes sense to revert re-scheduling in all affected regions to go back to a schedule that was made with a lower occupancy target. This implements such logic for the rematerialization stage, and adds a test to showcase that re-scheduling is indeed interrupted/reverted as soon as a re-scheduled region that does not meet the increased target occupancy is encountered. As a minor improvement, this also sets higher occupancy targets for re-scheduling at the end of stage initialization in some cases. In cases where rematerializations alone are not able to achieve the target, this can push the scheduler to be more aggressive in reducing RP and achieve the target.
1 parent 9e5deb9 commit cf60af8

4 files changed

Lines changed: 218 additions & 33 deletions

File tree

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 58 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "llvm/ADT/STLExtras.h"
3232
#include "llvm/CodeGen/CalcSpillWeights.h"
3333
#include "llvm/CodeGen/MachineCycleAnalysis.h"
34+
#include "llvm/CodeGen/MachineOperand.h"
3435
#include "llvm/CodeGen/RegisterClassInfo.h"
3536
#include "llvm/MC/LaneBitmask.h"
3637
#include "llvm/Support/ErrorHandling.h"
@@ -1422,11 +1423,7 @@ bool PreRARematStage::initGCNSchedStage() {
14221423
dbgs() << ")\n";
14231424
});
14241425

1425-
if (AchievedOcc > DAG.MinOccupancy) {
1426-
DAG.MinOccupancy = AchievedOcc;
1427-
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
1428-
MFI.increaseOccupancy(MF, DAG.MinOccupancy);
1429-
}
1426+
DAG.setTargetOccupancy(getStageTargetOccupancy());
14301427
return true;
14311428
}
14321429

@@ -1537,10 +1534,8 @@ bool UnclusteredHighRPStage::initGCNRegion() {
15371534
// occupancy changes in the DAG and MFI.
15381535
if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
15391536
IsAnyRegionScheduled = true;
1540-
if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) {
1541-
DAG.MinOccupancy = TempTargetOccupancy;
1542-
MFI.increaseOccupancy(MF, TempTargetOccupancy);
1543-
}
1537+
if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1538+
DAG.setTargetOccupancy(TempTargetOccupancy);
15441539
}
15451540
return IsSchedulingThisRegion;
15461541
}
@@ -1589,6 +1584,23 @@ void GCNSchedStage::finalizeGCNRegion() {
15891584
SavedMutations.swap(DAG.Mutations);
15901585
}
15911586

1587+
/// Finalizes the current region for the rematerialization stage. Runs the
/// generic GCNSchedStage finalization first; then, only when the stage has an
/// occupancy target (TargetOcc is set), saves the region's Unsched order and
/// PressureBefore in RegionReverts and clears RescheduleRegions if
/// DAG.MinOccupancy is below that target.
void PreRARematStage::finalizeGCNRegion() {
1588+
GCNSchedStage::finalizeGCNRegion();
1589+
// When the goal is to increase occupancy, all regions must reach the target
1590+
// occupancy for rematerializations to be possibly useful, otherwise we will
1591+
// just hurt latency for no benefit. If minimum occupancy drops below the
1592+
// target there is no point in trying to re-schedule further regions.
1593+
if (!TargetOcc)
1594+
return;
// Keep what is needed to undo this region's re-scheduling later:
// PreRARematStage::finalizeGCNSchedStage walks RegionReverts, restores the
// saved pressure and replays the saved order through modifyRegionSchedule.
1595+
RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore);
1596+
if (DAG.MinOccupancy < *TargetOcc) {
1597+
REMAT_DEBUG(dbgs() << "Region " << RegionIdx
1598+
<< " cannot meet occupancy target, interrupting "
1599+
"re-scheduling in all regions\n");
// NOTE(review): presumably clearing RescheduleRegions is what stops the
// remaining regions from being re-scheduled, as the debug message states;
// confirm against the code that consumes RescheduleRegions.
1600+
RescheduleRegions.reset();
1601+
}
1602+
}
1603+
15921604
void GCNSchedStage::checkScheduling() {
15931605
// Check the results of scheduling.
15941606
PressureAfter = DAG.getRealRegPressure(RegionIdx);
@@ -1862,8 +1874,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
18621874
}
18631875

18641876
/// Tells whether the schedule just produced for a region must be reverted,
/// given the \p WavesAfter value passed in by the caller.
///
/// In this diff view the two lines marked `-` are the removed implementation
/// and the line marked `+` is its replacement: the result now depends only on
/// mayCauseSpilling(WavesAfter). The base-class check and the per-region
/// comparison against TargetOcc are gone; a missed occupancy target is handled
/// for all regions at once in PreRARematStage::finalizeGCNSchedStage.
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1865-
return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
1866-
mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
1877+
return mayCauseSpilling(WavesAfter);
18671878
}
18681879

18691880
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -2487,6 +2498,10 @@ bool RewriteMFMAFormStage::rewrite(
24872498
return true;
24882499
}
24892500

2501+
/// Returns the occupancy the stage is trying to achieve: the value stored in
/// TargetOcc when it is set, otherwise MFI.getMinWavesPerEU().
unsigned PreRARematStage::getStageTargetOccupancy() const {
2502+
return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
2503+
}
2504+
24902505
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
24912506
const Function &F = MF.getFunction();
24922507

@@ -2828,13 +2843,31 @@ bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
28282843

28292844
void PreRARematStage::finalizeGCNSchedStage() {
28302845
// We consider that reducing spilling is always beneficial so we never
2831-
// rollback rematerializations in such cases. It's also possible that
2832-
// rescheduling lowers occupancy over the one achieved just through remats, in
2833-
// which case we do not want to rollback either (the rescheduling was already
2834-
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
2835-
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
2836-
if (!TargetOcc || MaxOcc >= *TargetOcc)
2846+
// rollback rematerializations or revert scheduling in such cases.
2847+
if (!TargetOcc)
2848+
return;
2849+
2850+
// When increasing occupancy, it is possible that re-scheduling is not able to
2851+
// achieve the target occupancy in all regions, in which case re-scheduling in
2852+
// all regions should be reverted.
2853+
if (DAG.MinOccupancy >= *TargetOcc)
2854+
return;
2855+
for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
2856+
REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
2857+
<< '\n');
2858+
DAG.Pressure[RegionIdx] = MaxPressure;
2859+
modifyRegionSchedule(RegionIdx, RegionBB[RegionIdx], OrigMIOrder);
2860+
}
2861+
2862+
// It is possible that re-scheduling lowers occupancy over the one achieved
2863+
// just through rematerializations, in which case we revert re-scheduling in
2864+
// all regions but do not roll back rematerializations.
2865+
if (AchievedOcc >= *TargetOcc) {
2866+
DAG.setTargetOccupancy(AchievedOcc);
28372867
return;
2868+
}
2869+
// Reset the target occupancy to what it was pre-rematerialization.
2870+
DAG.setTargetOccupancy(*TargetOcc - 1);
28382871

28392872
REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
28402873
const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
@@ -2903,6 +2936,14 @@ void GCNScheduleDAGMILive::updateRegionBoundaries(
29032936
RegionBounds.first = NewMI; // Insertion
29042937
}
29052938

2939+
/// Makes the scheduler aim for \p TargetOccupancy: stores it in MinOccupancy
/// and then updates the occupancy tracked by MFI to match.
void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
2940+
MinOccupancy = TargetOccupancy;
// MFI is updated through increaseOccupancy when its current occupancy is
// below the new target and through limitOccupancy otherwise, so the same
// entry point serves both raising and lowering the target.
2941+
if (MFI.getOccupancy() < TargetOccupancy)
2942+
MFI.increaseOccupancy(MF, MinOccupancy);
2943+
else
2944+
MFI.limitOccupancy(MinOccupancy);
2945+
}
2946+
29062947
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
29072948
const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
29082949
return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,9 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
313313
MachineBasicBlock::iterator MI,
314314
MachineInstr *NewMI);
315315

316+
/// Makes the scheduler try to achieve an occupancy of \p TargetOccupancy.
317+
void setTargetOccupancy(unsigned TargetOccupancy);
318+
316319
void runSchedStages();
317320

318321
std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
@@ -555,9 +558,30 @@ class PreRARematStage : public GCNSchedStage {
555558
/// objective is spilling reduction.
556559
std::optional<unsigned> TargetOcc;
557560
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
558-
/// Smaller than or equal to the target occupancy.
559561
unsigned AchievedOcc;
560562

563+
/// State of a region pre-re-scheduling but post-rematerializations that we
564+
/// must keep to be able to revert re-scheduling effects.
565+
struct RegionSchedRevert {
566+
/// Region number.
567+
unsigned RegionIdx;
568+
/// Original instruction order (both debug and non-debug MIs).
569+
std::vector<MachineInstr *> OrigMIOrder;
570+
/// Maximum pressure recorded in the region.
571+
GCNRegPressure MaxPressure;
572+
/// Builds a revert record from the given region index, instruction order and
/// pressure. The order is stored in the struct's own std::vector member
/// rather than kept as the ArrayRef that was passed in.
573+
RegionSchedRevert(unsigned RegionIdx, ArrayRef<MachineInstr *> OrigMIOrder,
574+
const GCNRegPressure &MaxPressure)
575+
: RegionIdx(RegionIdx), OrigMIOrder(OrigMIOrder),
576+
MaxPressure(MaxPressure) {}
577+
};
578+
/// After re-scheduling, contains pre-re-scheduling data for all re-scheduled
579+
/// regions.
580+
SmallVector<RegionSchedRevert> RegionReverts;
581+
582+
/// Returns the occupancy the stage is trying to achieve.
583+
unsigned getStageTargetOccupancy() const;
584+
561585
/// Returns whether remat can reduce spilling or increase function occupancy
562586
/// by 1 through rematerialization. If it can do one, collects instructions in
563587
/// PreRARematStage::Rematerializations and sets the target occupancy in
@@ -582,6 +606,8 @@ class PreRARematStage : public GCNSchedStage {
582606

583607
bool initGCNRegion() override;
584608

609+
void finalizeGCNRegion() override;
610+
585611
bool shouldRevertScheduling(unsigned WavesAfter) override;
586612

587613
PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)

llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
define void @sink_and_inc_idx_when_skipping_small_regions_2() "amdgpu-flat-work-group-size"="1,64" {
1111
ret void
1212
}
13+
14+
define void @test_occ_inc_revert_all_regions() {
15+
ret void
16+
}
1317
---
1418
name: sink_and_inc_idx_when_skipping_small_region_1
1519
tracksRegLiveness: true
@@ -154,3 +158,117 @@ body: |
154158
S_NOP 0, implicit %22
155159
S_ENDPGM 0
156160
...
161+
# bb.1 cannot meet the occupancy target even by rematerializing %64 into it
162+
# even though rematerialization heuristics believe it can; scheduling should
163+
# be interrupted and reverted in all re-scheduled regions.
164+
---
165+
name: test_occ_inc_revert_all_regions
166+
tracksRegLiveness: true
167+
machineFunctionInfo:
168+
isEntryFunction: true
169+
body: |
170+
; DEBUG: Machine code for function test_occ_inc_revert_all_regions: IsSSA, NoPHIs, TracksLiveness
171+
; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 7 from rematerializing (original was 7, target was 8)
172+
; DEBUG: Region 1 cannot meet occupancy target, interrupting re-scheduling in all regions
173+
; DEBUG: Reverting re-scheduling in region 0
174+
; DEBUG: Reverting re-scheduling in region 1
175+
; DEBUG-NOT: Reverting re-scheduling in region 3
176+
; DEBUG-NOT: Reverting re-scheduling in region 4
177+
bb.0:
178+
successors: %bb.1
179+
180+
%0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
181+
%1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
182+
%2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
183+
%3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
184+
%4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
185+
%5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
186+
%6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
187+
%7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
188+
%8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
189+
%9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
190+
%10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
191+
%11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
192+
%12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
193+
%13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
194+
%14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
195+
%15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
196+
%16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
197+
%17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
198+
%18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
199+
%19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
200+
%20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
201+
%21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
202+
%22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
203+
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
204+
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
205+
%25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
206+
%26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
207+
%27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
208+
%28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
209+
%29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
210+
%30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
211+
%31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
212+
213+
%64:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode
214+
215+
bb.1:
216+
successors: %bb.2
217+
218+
S_NOP 0, implicit %64
219+
220+
bb.2:
221+
successors: %bb.3
222+
223+
S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
224+
S_NOP 0, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15
225+
S_NOP 0, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23
226+
S_NOP 0, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
227+
228+
bb.3:
229+
successors: %bb.4
230+
231+
%32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
232+
%33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
233+
%34:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0
234+
%35:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 35, implicit $exec, implicit $mode, implicit-def $m0
235+
%36:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 36, implicit $exec, implicit $mode, implicit-def $m0
236+
%37:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 37, implicit $exec, implicit $mode, implicit-def $m0
237+
%38:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 38, implicit $exec, implicit $mode, implicit-def $m0
238+
%39:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 39, implicit $exec, implicit $mode, implicit-def $m0
239+
%40:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 40, implicit $exec, implicit $mode, implicit-def $m0
240+
%41:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 41, implicit $exec, implicit $mode, implicit-def $m0
241+
%42:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 42, implicit $exec, implicit $mode, implicit-def $m0
242+
%43:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 43, implicit $exec, implicit $mode, implicit-def $m0
243+
%44:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 44, implicit $exec, implicit $mode, implicit-def $m0
244+
%45:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 45, implicit $exec, implicit $mode, implicit-def $m0
245+
%46:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 46, implicit $exec, implicit $mode, implicit-def $m0
246+
%47:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 47, implicit $exec, implicit $mode, implicit-def $m0
247+
%48:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 48, implicit $exec, implicit $mode, implicit-def $m0
248+
%49:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 49, implicit $exec, implicit $mode, implicit-def $m0
249+
%50:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 50, implicit $exec, implicit $mode, implicit-def $m0
250+
%51:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 51, implicit $exec, implicit $mode, implicit-def $m0
251+
%52:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 52, implicit $exec, implicit $mode, implicit-def $m0
252+
%53:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 53, implicit $exec, implicit $mode, implicit-def $m0
253+
%54:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 54, implicit $exec, implicit $mode, implicit-def $m0
254+
%55:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 55, implicit $exec, implicit $mode, implicit-def $m0
255+
%56:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 56, implicit $exec, implicit $mode, implicit-def $m0
256+
%57:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 57, implicit $exec, implicit $mode, implicit-def $m0
257+
%58:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 58, implicit $exec, implicit $mode, implicit-def $m0
258+
%59:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 59, implicit $exec, implicit $mode, implicit-def $m0
259+
%60:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 60, implicit $exec, implicit $mode, implicit-def $m0
260+
%61:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 61, implicit $exec, implicit $mode, implicit-def $m0
261+
%62:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 62, implicit $exec, implicit $mode, implicit-def $m0
262+
%63:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 63, implicit $exec, implicit $mode, implicit-def $m0
263+
264+
%65:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 65, implicit $exec, implicit $mode
265+
266+
bb.4:
267+
S_NOP 0, implicit %32, implicit %33, implicit %34, implicit %35, implicit %36, implicit %37, implicit %38, implicit %39
268+
S_NOP 0, implicit %40, implicit %41, implicit %42, implicit %43, implicit %44, implicit %45, implicit %46, implicit %47
269+
S_NOP 0, implicit %48, implicit %49, implicit %50, implicit %51, implicit %52, implicit %53, implicit %54, implicit %55
270+
S_NOP 0, implicit %56, implicit %57, implicit %58, implicit %59, implicit %60, implicit %61, implicit %62, implicit %63
271+
S_NOP 0, implicit %65
272+
273+
S_ENDPGM 0
274+
...

0 commit comments

Comments
 (0)