#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<bool> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> ForceEmitZeroLoadFlag(
    "amdgpu-waitcnt-load-forcezero",
    cl::desc("Force all waitcnt load counters to wait until 0"),
    cl::init(false), cl::Hidden);
enum InstCounterType {
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
  EXP_CNT,
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS,
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
  BVH_CNT,
  KM_CNT,
  X_CNT,
  NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};

auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
  return enum_seq(LOAD_CNT, MaxCounter);
}

using RegInterval = std::pair<int, int>;

struct HardwareLimits {
  unsigned LoadcntMax;  // Corresponds to Vmcnt prior to gfx12.
  unsigned ExpcntMax;
  unsigned DscntMax;    // Corresponds to Lgkmcnt prior to gfx12.
  unsigned StorecntMax; // Corresponds to Vscnt in gfx10/gfx11.
  unsigned SamplecntMax;
  unsigned BvhcntMax;
  unsigned KmcntMax;
  unsigned XcntMax;
};
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_READ_ACCESS)                                                       \
  DECL(VMEM_SAMPLER_READ_ACCESS)                                               \
  DECL(VMEM_BVH_READ_ACCESS)                                                   \
  DECL(VMEM_WRITE_ACCESS)                                                      \
  DECL(SCRATCH_WRITE_ACCESS)                                                   \
  DECL(EXP_POS_ACCESS)                                                         \
  DECL(EXP_PARAM_ACCESS)

enum WaitEventType {
#define AMDGPU_EVENT_ENUM(Name) Name,
  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
#undef AMDGPU_EVENT_ENUM
  NUM_WAIT_EVENTS
};

static constexpr StringLiteral WaitEventTypeName[] = {
#define AMDGPU_EVENT_NAME(Name) #Name,
    AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
#undef AMDGPU_EVENT_NAME
};
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 2048,
  SQ_MAX_PGM_SGPRS = 128,
  FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS,
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS,
  NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
  SCC = NUM_ALL_ALLOCATABLE
};

static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT,    AMDGPU::S_WAIT_XCNT};
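// Note: the entries above are indexed by the extended counter enum
// (LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT, SAMPLE_CNT, BVH_CNT, KM_CNT, X_CNT),
// so for a counter T the gfx12+ wait opcode is just
// instrsForExtendedCounterTypes[T], e.g.:
//   unsigned WaitOpc = instrsForExtendedCounterTypes[EXP_CNT]; // S_WAIT_EXPCNT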
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}

  assert(updateVMCntOnly(Inst));
  return VMEM_NOSAMPLER;

  return VMEM_NOSAMPLER;

  return Wait.StoreCnt;
  return Wait.SampleCnt;
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  unsigned &WC = getCounterRef(Wait, T);
  WC = std::min(WC, Count);
}

void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  getCounterRef(Wait, T) = ~0u;
}

unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  return getCounterRef(Wait, T);
}

InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
  for (auto T : inst_counter_types()) {
    if (masks[T] & (1 << E))
      return T;
  }
  llvm_unreachable("event type has no associated counter");
}
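// Example of the mask lookup above: with the gfx12+ table returned by
// WaitcntGeneratorGFX12Plus::getWaitEventMask() further below, an LDS_ACCESS
// event sets a bit only in the DS_CNT entry, so
// eventCounter(masks, LDS_ACCESS) == DS_CNT.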
class WaitcntBrackets;

class WaitcntGenerator {
protected:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  AMDGPU::IsaVersion IV;
  InstCounterType MaxCounter;
  bool OptNone;

public:
  WaitcntGenerator() = default;
  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),

  bool isOptNone() const { return OptNone; }

  virtual bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const = 0;

  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                MachineBasicBlock::instr_iterator It,
                                AMDGPU::Waitcnt Wait) = 0;

  virtual const unsigned *getWaitEventMask() const = 0;

  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

  virtual ~WaitcntGenerator() = default;

  // Create a mask value from an initializer list of wait event types.
  static constexpr unsigned
  eventMask(std::initializer_list<WaitEventType> Events) {
    unsigned Mask = 0;
    for (auto &E : Events)
      Mask |= 1 << E;
    return Mask;
  }
};
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
public:
  WaitcntGeneratorPreGFX12() = default;
  WaitcntGeneratorPreGFX12(const MachineFunction &MF)
      : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait) override;

  const unsigned *getWaitEventMask() const override {
    static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
                   VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
        0, // Counters below are not used before gfx12.
        0,
        0,
        0};

    return WaitEventMaskForInstPreGFX12;
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
public:
  WaitcntGeneratorGFX12Plus() = default;
  WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
                            InstCounterType MaxCounter)
      : WaitcntGenerator(MF, MaxCounter) {}

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) const override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait) override;

  const unsigned *getWaitEventMask() const override {
    static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
        eventMask({LDS_ACCESS, GDS_ACCESS}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
        eventMask({VMEM_SAMPLER_READ_ACCESS}),
        eventMask({VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
        eventMask({VMEM_GROUP, SMEM_GROUP})};

    return WaitEventMaskForInstGFX12Plus;
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
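// The two generators differ mainly in these tables: each entry is the set of
// wait events charged to the counter with the same enum index. For example,
// on gfx12+ an SMEM_ACCESS event is charged to KM_CNT, whereas before gfx12
// it is charged to the combined LGKM/DS counter (index DS_CNT above).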
class SIInsertWaitcnts {
private:
  const GCNSubtarget *ST;
  InstCounterType SmemAccessCounter;
  InstCounterType MaxCounter;
  const unsigned *WaitEventMaskForInst;

  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;

  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
  MachineLoopInfo *MLI;
  MachinePostDominatorTree *PDT;
  AliasAnalysis *AA = nullptr;

  struct BlockInfo {
    std::unique_ptr<WaitcntBrackets> Incoming;
    bool Dirty = true;
  };

  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  bool ForceEmitWaitcnt[NUM_INST_CNTS];

  // In any given run of this pass, WCG will point to one of these two
  // generator objects, selected for the subtarget.
  WaitcntGeneratorPreGFX12 WCGPreGFX12;
  WaitcntGeneratorGFX12Plus WCGGFX12Plus;

  WaitcntGenerator *WCG = nullptr;

  // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
  // message.
  DenseSet<MachineInstr *> ReleaseVGPRInsts;

  HardwareLimits Limits;

public:
  SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
                   AliasAnalysis *AA)
      : MLI(MLI), PDT(PDT), AA(AA) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  unsigned getWaitCountMax(InstCounterType T) const {
    switch (T) {
    case LOAD_CNT:
      return Limits.LoadcntMax;
    case DS_CNT:
      return Limits.DscntMax;
    case EXP_CNT:
      return Limits.ExpcntMax;
    case STORE_CNT:
      return Limits.StorecntMax;
    case SAMPLE_CNT:
      return Limits.SamplecntMax;
    case BVH_CNT:
      return Limits.BvhcntMax;
    case KM_CNT:
      return Limits.KmcntMax;
    case X_CNT:
      return Limits.XcntMax;
    default:
      break;
    }
    return 0;
  }
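  // In effect this is a table lookup: the per-counter maxima come from the
  // HardwareLimits populated for the subtarget, and callers such as
  // WaitcntBrackets::determineWait() clamp any computed wait to
  // getWaitCountMax(T) - 1 so the encoded immediate never exceeds what the
  // hardware counter can hold.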
  bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
  bool isPreheaderToFlush(MachineBasicBlock &MBB,
                          const WaitcntBrackets &ScoreBrackets);
  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
  bool run(MachineFunction &MF);

  bool isForceEmitWaitcnt() const {
    for (auto T : inst_counter_types())
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[DS_CNT] = true;
      ForceEmitWaitcnt[KM_CNT] = true;
    } else {
      ForceEmitWaitcnt[DS_CNT] = false;
      ForceEmitWaitcnt[KM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[LOAD_CNT] = true;
      ForceEmitWaitcnt[SAMPLE_CNT] = true;
      ForceEmitWaitcnt[BVH_CNT] = true;
    } else {
      ForceEmitWaitcnt[LOAD_CNT] = false;
      ForceEmitWaitcnt[SAMPLE_CNT] = false;
      ForceEmitWaitcnt[BVH_CNT] = false;
    }
#endif // NDEBUG
  }
  // Return the appropriate VMEM_*_ACCESS event type for Inst.
  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
    switch (Inst.getOpcode()) {
    case AMDGPU::GLOBAL_INV:
      return VMEM_READ_ACCESS; // tracked using loadcnt
    case AMDGPU::GLOBAL_WB:
    case AMDGPU::GLOBAL_WBINV:
      return VMEM_WRITE_ACCESS; // tracked using storecnt
    default:
      break;
    }

    // Maps VMEM access types to their corresponding WaitEventType.
    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
        VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};

      if (TII->mayAccessScratchThroughFlat(Inst))
        return SCRATCH_WRITE_ACCESS;
      return VMEM_WRITE_ACCESS;

      return VMEM_READ_ACCESS;
    return VmemReadMapping[getVmemType(Inst)];
  }

  bool hasXcnt() const { return ST->hasWaitXCnt(); }
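  // Rough intent of the classification above: stores that might hit scratch
  // are charged to SCRATCH_WRITE_ACCESS, other VMEM writes to
  // VMEM_WRITE_ACCESS, and reads either to the generic VMEM_READ_ACCESS or,
  // via VmemReadMapping, to the sampler/BVH-specific events that gfx12+
  // tracks with separate counters.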
  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  bool isVmemAccess(const MachineInstr &MI) const;
  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr,
                                 bool FlushVmCnt);
  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                       MachineBasicBlock::instr_iterator It,
                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
                       MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
  bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
                    MachineBasicBlock *Block) const;
  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
  static bool asynchronouslyWritesSCC(unsigned Opcode);
};
class WaitcntBrackets {
public:
  WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}

  bool isSmemCounter(InstCounterType T) const {
    return T == Context->SmemAccessCounter || T == X_CNT;
  }

  unsigned getSgprScoresIdx(InstCounterType T) const {
    assert(isSmemCounter(T) && "Invalid SMEM counter");
    return T == X_CNT ? 1 : 0;
  }

  unsigned getScoreLB(InstCounterType T) const { return ScoreLBs[T]; }

  unsigned getScoreUB(InstCounterType T) const { return ScoreUBs[T]; }

  unsigned getScoreRange(InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);
  }

  unsigned getRegScore(int GprNo, InstCounterType T) const {
    if (GprNo < NUM_ALL_VGPRS)
      return VgprScores[T][GprNo];
    if (GprNo < NUM_ALL_ALLOCATABLE)
      return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
    return SCCScore;
  }
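  // Mental model for the scores: every counter T has a moving window
  // [ScoreLBs[T], ScoreUBs[T]]. Each new event bumps the upper bound and tags
  // the registers it touches with that score, so the wait needed for a
  // register is roughly "how far below the upper bound its score sits",
  // clamped to the hardware maximum (see determineWait below).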
  RegInterval getRegInterval(const MachineInstr *MI,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI,
                             const MachineOperand &Op) const;

  bool counterOutOfOrder(InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;

  void determineWait(InstCounterType T, RegInterval Interval,
                     AMDGPU::Waitcnt &Wait) const;
  void determineWait(InstCounterType T, int RegNo,
                     AMDGPU::Waitcnt &Wait) const {
    determineWait(T, {RegNo, RegNo + 1}, Wait);
  }

  void tryClearSCCWriteEvent(MachineInstr *Inst);

  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void applyXcnt(const AMDGPU::Waitcnt &Wait);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &Inst);
  unsigned hasPendingEvent() const { return PendingEvents; }
  unsigned hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }
  unsigned hasPendingEvent(InstCounterType T) const {
    unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
    assert((HasPending != 0) == (getScoreRange(T) != 0));
    return HasPending;
  }

  bool hasMixedPendingEvents(InstCounterType T) const {
    unsigned Events = hasPendingEvent(T);
    // Return true if more than one bit is set in Events.
    return Events & (Events - 1);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
  }

  void setPendingFlat() {
    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
  }

  bool hasPendingGDS() const {
    return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
  }

  unsigned getPendingGDSWait() const {
    return std::min(getScoreUB(DS_CNT) - LastGDS,
                    Context->getWaitCountMax(DS_CNT) - 1);
  }

  void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
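  // getPendingGDSWait() computes how many DS_CNT events separate "now" from
  // the last GDS operation: if the GDS op was the most recent event the
  // result is 0 (wait for everything up to and including it), and it is
  // capped at getWaitCountMax(DS_CNT) - 1 because larger values cannot be
  // encoded.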
  bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
    for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
      assert(RegNo < NUM_ALL_VGPRS);
      if (VgprVmemTypes[RegNo] & ~(1 << V))
        return true;
    }
    return false;
  }

  void clearVgprVmemTypes(RegInterval Interval) {
    for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
      assert(RegNo < NUM_ALL_VGPRS);
      VgprVmemTypes[RegNo] = 0;
    }
  }

  void setStateOnFunctionEntryOrReturn() {
    setScoreUB(STORE_CNT,
               getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
    PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
  }

  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
    return LDSDMAStores;
  }

  bool hasPointSampleAccel(const MachineInstr &MI) const;
  bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
                                      RegInterval Interval) const;

  void print(raw_ostream &) const;
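  // Note on setStateOnFunctionEntryOrReturn() above: at function entry and
  // after calls there may be up to the hardware maximum of outstanding
  // stores, so the STORE_CNT upper bound jumps by getWaitCountMax(STORE_CNT)
  // and the store events are marked pending, which forces conservative waits
  // on any later store-count dependence.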
private:
  struct MergeInfo {
    unsigned OldLB;
    unsigned OtherLB;
    unsigned MyShift;
    unsigned OtherShift;
  };
  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);

  void setScoreLB(InstCounterType T, unsigned Val) { ScoreLBs[T] = Val; }

  void setScoreUB(InstCounterType T, unsigned Val) {
    ScoreUBs[T] = Val;

    if (T != EXP_CNT)
      return;

    if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
  }

  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
    setScoreByInterval({GprNo, GprNo + 1}, T, Val);
  }

  void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
                          unsigned Score);

  void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
                         const MachineRegisterInfo *MRI,
                         const MachineOperand &Op, InstCounterType CntTy,
                         unsigned Score);

  const SIInsertWaitcnts *Context;

  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  unsigned PendingEvents = 0;
  // Remember the last flat memory operation.
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  // Remember the last GDS operation.
  unsigned LastGDS = 0;
  int VgprUB = -1;
  int SgprUB = -1;
  // Wait-count scores for every vgpr.
  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
  // Wait-count scores for every sgpr; only the SMEM access counter and X_CNT
  // are relevant, hence the two rows.
  unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
  unsigned SCCScore = 0;
  const MachineInstr *PendingSCCWrite = nullptr;
  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
  // write to each vgpr.
  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
  // Remembered LDS DMA stores, one per distinct alias info.
  SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
};
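// Note on the LDS DMA bookkeeping above: WaitcntBrackets keeps at most
// NUM_LDS_VGPRS - 1 representative LDS-DMA stores (one per distinct alias
// scope). FIRST_LDS_VGPR acts as the catch-all slot and the following slots
// map to the remembered stores, so a later LDS access only needs to wait on
// the DMA writes it may actually alias (see updateByEvent and
// generateWaitcntInstBefore below).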
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addUsedIfAvailable<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                            const MachineRegisterInfo *MRI,
                                            const SIRegisterInfo *TRI,
                                            const MachineOperand &Op) const {
  if (Op.getReg() == AMDGPU::SCC)
    return {SCC, SCC + 1};

  if (!TRI->isInAllocatableClass(Op.getReg()))
    return {-1, -1};

  RegInterval Result;

  unsigned RegIdx = TRI->getHWRegIndex(MCReg);

  const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
  unsigned Size = TRI->getRegSizeInBits(*RC);

  if (TRI->isVectorRegister(*MRI, Op.getReg())) {

    Result.first += AGPR_OFFSET;

    if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) {

  } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
    Result.first = RegIdx + NUM_ALL_VGPRS;
void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
                                         InstCounterType CntTy,
                                         unsigned Score) {
  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    if (RegNo < NUM_ALL_VGPRS) {
      VgprUB = std::max(VgprUB, RegNo);
      VgprScores[CntTy][RegNo] = Score;
    } else if (RegNo < NUM_ALL_ALLOCATABLE) {
      SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
      SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
    } else {
      SCCScore = Score;
    }
  }
}

void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
                                        const SIRegisterInfo *TRI,
                                        const MachineRegisterInfo *MRI,
                                        const MachineOperand &Op,
                                        InstCounterType CntTy, unsigned Score) {
  RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
  setScoreByInterval(Interval, CntTy, Score);
}
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =

bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
    const MachineInstr &MI, RegInterval Interval) const {
  if (!hasPointSampleAccel(MI))
    return false;

  return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
}
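// Why this check exists: when an image sample uses the point-sample
// acceleration path, its result may complete out of order relative to other
// VMEM writes to the same VGPRs, so a pending VMEM_NOSAMPLER write to the
// interval forces a conservative wait (see its use in
// generateWaitcntInstBefore).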
942void WaitcntBrackets::updateByEvent(
const SIInstrInfo *
TII,
943 const SIRegisterInfo *
TRI,
944 const MachineRegisterInfo *
MRI,
945 WaitEventType
E, MachineInstr &Inst) {
946 InstCounterType
T = eventCounter(
Context->WaitEventMaskForInst,
E);
948 unsigned UB = getScoreUB(
T);
949 unsigned CurrScore = UB + 1;
955 PendingEvents |= 1 <<
E;
956 setScoreUB(
T, CurrScore);
964 if (
const auto *AddrOp =
TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
965 setScoreByOperand(&Inst,
TRI,
MRI, *AddrOp, EXP_CNT, CurrScore);
968 if (
const auto *Data0 =
969 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
970 setScoreByOperand(&Inst,
TRI,
MRI, *Data0, EXP_CNT, CurrScore);
971 if (
const auto *Data1 =
972 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
973 setScoreByOperand(&Inst,
TRI,
MRI, *Data1, EXP_CNT, CurrScore);
976 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
977 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
978 for (
const MachineOperand &
Op : Inst.
all_uses()) {
979 if (
TRI->isVectorRegister(*
MRI,
Op.getReg()))
980 setScoreByOperand(&Inst,
TRI,
MRI,
Op, EXP_CNT, CurrScore);
983 }
else if (
TII->isFLAT(Inst)) {
985 setScoreByOperand(&Inst,
TRI,
MRI,
986 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
989 setScoreByOperand(&Inst,
TRI,
MRI,
990 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
993 }
else if (
TII->isMIMG(Inst)) {
998 setScoreByOperand(&Inst,
TRI,
MRI,
999 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1000 EXP_CNT, CurrScore);
1002 }
else if (
TII->isMTBUF(Inst)) {
1006 }
else if (
TII->isMUBUF(Inst)) {
1011 setScoreByOperand(&Inst,
TRI,
MRI,
1012 *
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1013 EXP_CNT, CurrScore);
1015 }
else if (
TII->isLDSDIR(Inst)) {
1017 setScoreByOperand(&Inst,
TRI,
MRI,
1018 *
TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1019 EXP_CNT, CurrScore);
1021 if (
TII->isEXP(Inst)) {
1026 for (MachineOperand &DefMO : Inst.
all_defs()) {
1027 if (
TRI->isVGPR(*
MRI, DefMO.getReg())) {
1028 setScoreByOperand(&Inst,
TRI,
MRI, DefMO, EXP_CNT, CurrScore);
1032 for (
const MachineOperand &
Op : Inst.
all_uses()) {
1033 if (
TRI->isVectorRegister(*
MRI,
Op.getReg()))
1034 setScoreByOperand(&Inst,
TRI,
MRI,
Op, EXP_CNT, CurrScore);
1037 }
else if (
T == X_CNT) {
1038 for (
const MachineOperand &
Op : Inst.
all_uses())
1039 setScoreByOperand(&Inst,
TRI,
MRI,
Op,
T, CurrScore);
1050 for (
const MachineOperand &
Op : Inst.
defs()) {
1052 if (
T == LOAD_CNT ||
T == SAMPLE_CNT ||
T == BVH_CNT) {
1053 if (
Interval.first >= NUM_ALL_VGPRS)
1055 if (updateVMCntOnly(Inst)) {
1060 VmemType
V = getVmemType(Inst);
1061 unsigned char TypesMask = 1 <<
V;
1064 if (hasPointSampleAccel(Inst))
1065 TypesMask |= 1 << VMEM_NOSAMPLER;
1067 VgprVmemTypes[RegNo] |= TypesMask;
1070 setScoreByInterval(
Interval,
T, CurrScore);
1073 (
TII->isDS(Inst) ||
TII->mayWriteLDSThroughDMA(Inst))) {
1078 if (!MemOp->isStore() ||
1083 auto AAI = MemOp->getAAInfo();
1091 if (!AAI || !AAI.Scope)
1093 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E && !Slot; ++
I) {
1094 for (
const auto *MemOp : LDSDMAStores[
I]->memoperands()) {
1095 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1101 if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
1103 LDSDMAStores.push_back(&Inst);
1104 Slot = LDSDMAStores.size();
1107 setRegScore(FIRST_LDS_VGPR + Slot,
T, CurrScore);
1109 setRegScore(FIRST_LDS_VGPR,
T, CurrScore);
1113 setRegScore(SCC,
T, CurrScore);
1114 PendingSCCWrite = &Inst;
1119void WaitcntBrackets::print(raw_ostream &OS)
const {
1123 for (
auto T : inst_counter_types(
Context->MaxCounter)) {
1124 unsigned SR = getScoreRange(
T);
1128 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
1132 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
1136 OS <<
" EXP_CNT(" << SR <<
"): ";
1139 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
1143 OS <<
" SAMPLE_CNT(" << SR <<
"): ";
1146 OS <<
" BVH_CNT(" << SR <<
"): ";
1149 OS <<
" KM_CNT(" << SR <<
"): ";
1152 OS <<
" X_CNT(" << SR <<
"): ";
1155 OS <<
" UNKNOWN(" << SR <<
"): ";
1161 unsigned LB = getScoreLB(
T);
1163 for (
int J = 0; J <= VgprUB; J++) {
1164 unsigned RegScore = getRegScore(J,
T);
1167 unsigned RelScore = RegScore - LB - 1;
1168 if (J < FIRST_LDS_VGPR) {
1169 OS << RelScore <<
":v" << J <<
" ";
1171 OS << RelScore <<
":ds ";
1175 if (isSmemCounter(
T)) {
1176 for (
int J = 0; J <= SgprUB; J++) {
1177 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS,
T);
1180 unsigned RelScore = RegScore - LB - 1;
1181 OS << RelScore <<
":s" << J <<
" ";
1184 if (
T == KM_CNT && SCCScore > 0)
1185 OS << SCCScore <<
":scc ";
1190 OS <<
"Pending Events: ";
1191 if (hasPendingEvent()) {
1193 for (
unsigned I = 0;
I != NUM_WAIT_EVENTS; ++
I) {
1194 if (hasPendingEvent((WaitEventType)
I)) {
1195 OS <<
LS << WaitEventTypeName[
I];
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
  simplifyWaitcnt(DS_CNT, Wait.DsCnt);
  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
  simplifyWaitcnt(KM_CNT, Wait.KmCnt);
  simplifyWaitcnt(X_CNT, Wait.XCnt);
}

void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  // The number of outstanding events of type T is (UB - LB). If the current
  // Count is at least that large, the wait is redundant and can be dropped.
  if (Count >= getScoreRange(T))
    Count = ~0u;
}
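// Example: if three VMEM loads are outstanding (getScoreRange(LOAD_CNT) == 3)
// and the computed wait is LoadCnt == 3, the wait is dropped (set to ~0u)
// because the hardware counter can never exceed 3 here anyway; a wait of 2 or
// less would be kept.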
void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
                                    AMDGPU::Waitcnt &Wait) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    unsigned ScoreToWait = getRegScore(RegNo, T);

    // If the score of the operand falls within the bracket, an s_waitcnt is
    // needed.
    if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
      if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
          !Context->ST->hasFlatLgkmVMemCountInOrder()) {
        // A pending FLAT operation may decrement either counter, so force a
        // wait to zero.
        addWait(Wait, T, 0);
      } else if (counterOutOfOrder(T)) {
        // The counter can be decremented out of order, so a precise wait is
        // not possible; be conservative and wait to zero.
        addWait(Wait, T, 0);
      } else {
        // If the counter has been maxed out, avoid overflow by waiting for
        // getWaitCountMax(T) - 1 instead.
        unsigned NeededWait =
            std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
        addWait(Wait, T, NeededWait);
      }
    }
  }
}
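// Numeric example: with UB == 10, LB == 4 and a register score of 7, the
// in-order case asks for a wait of UB - ScoreToWait == 3, i.e. "allow at most
// 3 younger operations of this type to remain outstanding"; the min() with
// getWaitCountMax(T) - 1 only kicks in when that distance exceeds what the
// immediate field can encode.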
void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
  // If the pending SCC write came from an S_BARRIER_SIGNAL_ISFIRST_IMM that
  // this barrier wait resolves, the SCC_WRITE event can be retired here.
  if (PendingSCCWrite &&
      PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&

    unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
    // If this SCC_WRITE is the only pending KM_CNT event, the KM_CNT bracket
    // can be collapsed as well.
    if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
        SCC_WRITE_PendingEvent) {
      setScoreLB(KM_CNT, getScoreUB(KM_CNT));
    }

    PendingEvents &= ~SCC_WRITE_PendingEvent;
    PendingSCCWrite = nullptr;
  }
}
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(DS_CNT, Wait.DsCnt);
  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
  applyWaitcnt(KM_CNT, Wait.KmCnt);
  applyXcnt(Wait);
}

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  if (Count >= UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
  } else {
    setScoreLB(T, UB);
    PendingEvents &= ~Context->WaitEventMaskForInst[T];
  }
}

void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
  // A wait on XCNT is redundant if we are already waiting on KM_CNT to reach
  // zero while SMEM transfers are the pending group events.
  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
    return applyWaitcnt(X_CNT, 0);

  // If only VMEM loads are pending (no stores), XCNT is decremented in step
  // with LOAD_CNT, so the effective wait is the smaller of the two.
  if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(STORE_CNT))
    return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));

  applyWaitcnt(X_CNT, Wait.XCnt);
}
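// In other words, an xcnt wait can piggyback on a stricter wait that is
// already being applied: kmcnt(0) covers pending SMEM transfers, and a
// loadcnt wait covers VMEM transfers as long as no stores are pending, since
// only the loads are known to retire the transfer counter in order here.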
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
      (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
    return true;
  return hasMixedPendingEvents(T);
}
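// A counter is treated as "out of order" when its pending events cannot be
// attributed to a single FIFO: either scalar-memory accesses are pending
// (SMEM results return out of order) or several different event kinds are
// mixed. In that case determineWait() falls back to waiting for the counter
// to reach zero rather than a precise value.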
char SIInsertWaitcntsLegacy::ID = 0;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcntsLegacy();
}

static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
                                     unsigned NewEnc) {
  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);

  MachineOperand &MO = MI.getOperand(OpIdx);

  if (NewEnc == MO.getImm())
    return false;

  MO.setImm(NewEnc);
  return true;
}

/// Determine if \p Opcode is a gfx12+ single-counter S_WAIT_*CNT instruction,
/// and if so, which counter it is waiting on.
static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_WAIT_LOADCNT:
    return LOAD_CNT;
  case AMDGPU::S_WAIT_EXPCNT:
    return EXP_CNT;
  case AMDGPU::S_WAIT_STORECNT:
    return STORE_CNT;
  case AMDGPU::S_WAIT_SAMPLECNT:
    return SAMPLE_CNT;
  case AMDGPU::S_WAIT_BVHCNT:
    return BVH_CNT;
  case AMDGPU::S_WAIT_DSCNT:
    return DS_CNT;
  case AMDGPU::S_WAIT_KMCNT:
    return KM_CNT;
  case AMDGPU::S_WAIT_XCNT:
    return X_CNT;
  default:
    return {};
  }
}

bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1398bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1399 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1402 assert(isNormalMode(MaxCounter));
1405 MachineInstr *WaitcntInstr =
nullptr;
1406 MachineInstr *WaitcntVsCntInstr =
nullptr;
1409 dbgs() <<
"PreGFX12::applyPreexistingWaitcnt at: ";
1411 dbgs() <<
"end of block\n";
1419 if (
II.isMetaInstruction()) {
1425 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1429 if (Opcode == AMDGPU::S_WAITCNT) {
1430 unsigned IEnc =
II.getOperand(0).getImm();
1433 ScoreBrackets.simplifyWaitcnt(OldWait);
1437 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1438 II.eraseFromParent();
1442 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1445 <<
"Before: " <<
Wait.LoadCnt <<
'\n';);
1446 ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR,
Wait);
1455 II.eraseFromParent();
1457 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1458 assert(
II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1461 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1463 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1464 Wait.StoreCnt = std::min(
Wait.StoreCnt, OldVSCnt);
1466 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && TrySimplify)) {
1467 II.eraseFromParent();
1470 WaitcntVsCntInstr = &
II;
1477 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1479 ScoreBrackets.applyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1480 ScoreBrackets.applyWaitcnt(EXP_CNT,
Wait.ExpCnt);
1481 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1488 <<
"applied pre-existing waitcnt\n"
1489 <<
"New Instr at block end: " << *WaitcntInstr <<
'\n'
1490 :
dbgs() <<
"applied pre-existing waitcnt\n"
1491 <<
"Old Instr: " << *It
1492 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1495 if (WaitcntVsCntInstr) {
1497 AMDGPU::OpName::simm16,
Wait.StoreCnt);
1498 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1500 ScoreBrackets.applyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1501 Wait.StoreCnt = ~0
u;
1504 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1505 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1507 :
dbgs() <<
"applied pre-existing waitcnt\n"
1508 <<
"Old Instr: " << *It
1509 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
1517bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1519 AMDGPU::Waitcnt
Wait) {
1521 assert(isNormalMode(MaxCounter));
1528 if (
Wait.hasWaitExceptStoreCnt()) {
1530 [[maybe_unused]]
auto SWaitInst =
1535 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1536 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1539 if (
Wait.hasWaitStoreCnt()) {
1542 [[maybe_unused]]
auto SWaitInst =
1549 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1550 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1557WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1558 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt &&
ST->hasVscnt() ? 0 : ~0u);
1562WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1563 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1571bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1572 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1575 assert(!isNormalMode(MaxCounter));
1578 MachineInstr *CombinedLoadDsCntInstr =
nullptr;
1579 MachineInstr *CombinedStoreDsCntInstr =
nullptr;
1580 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1583 dbgs() <<
"GFX12Plus::applyPreexistingWaitcnt at: ";
1585 dbgs() <<
"end of block\n";
1593 if (
II.isMetaInstruction()) {
1598 MachineInstr **UpdatableInstr;
1604 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1608 if (Opcode == AMDGPU::S_WAITCNT)
1611 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1613 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1616 ScoreBrackets.simplifyWaitcnt(OldWait);
1618 UpdatableInstr = &CombinedLoadDsCntInstr;
1619 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1621 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1624 ScoreBrackets.simplifyWaitcnt(OldWait);
1626 UpdatableInstr = &CombinedStoreDsCntInstr;
1627 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1630 II.eraseFromParent();
1636 TII->getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1638 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1639 addWait(
Wait, CT.value(), OldCnt);
1640 UpdatableInstr = &WaitInstrs[CT.value()];
1644 if (!*UpdatableInstr) {
1645 *UpdatableInstr = &
II;
1647 II.eraseFromParent();
1652 if (CombinedLoadDsCntInstr) {
1660 if (
Wait.LoadCnt != ~0u &&
Wait.DsCnt != ~0u) {
1663 AMDGPU::OpName::simm16, NewEnc);
1664 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1665 ScoreBrackets.applyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1666 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1671 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1672 <<
"New Instr at block end: "
1673 << *CombinedLoadDsCntInstr <<
'\n'
1674 :
dbgs() <<
"applied pre-existing waitcnt\n"
1675 <<
"Old Instr: " << *It <<
"New Instr: "
1676 << *CombinedLoadDsCntInstr <<
'\n');
1683 if (CombinedStoreDsCntInstr) {
1685 if (
Wait.StoreCnt != ~0u &&
Wait.DsCnt != ~0u) {
1688 AMDGPU::OpName::simm16, NewEnc);
1689 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1690 ScoreBrackets.applyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1691 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1692 Wait.StoreCnt = ~0
u;
1696 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1697 <<
"New Instr at block end: "
1698 << *CombinedStoreDsCntInstr <<
'\n'
1699 :
dbgs() <<
"applied pre-existing waitcnt\n"
1700 <<
"Old Instr: " << *It <<
"New Instr: "
1701 << *CombinedStoreDsCntInstr <<
'\n');
1714 if (
Wait.DsCnt != ~0u) {
1723 if (
Wait.LoadCnt != ~0u) {
1724 WaitsToErase.
push_back(&WaitInstrs[LOAD_CNT]);
1725 WaitsToErase.
push_back(&WaitInstrs[DS_CNT]);
1726 }
else if (
Wait.StoreCnt != ~0u) {
1727 WaitsToErase.
push_back(&WaitInstrs[STORE_CNT]);
1728 WaitsToErase.
push_back(&WaitInstrs[DS_CNT]);
1731 for (MachineInstr **WI : WaitsToErase) {
1735 (*WI)->eraseFromParent();
1741 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1742 if (!WaitInstrs[CT])
1745 unsigned NewCnt = getWait(
Wait, CT);
1746 if (NewCnt != ~0u) {
1748 AMDGPU::OpName::simm16, NewCnt);
1749 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1751 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1752 setNoWait(
Wait, CT);
1755 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1756 <<
"New Instr at block end: " << *WaitInstrs[CT]
1758 :
dbgs() <<
"applied pre-existing waitcnt\n"
1759 <<
"Old Instr: " << *It
1760 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
1771bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1773 AMDGPU::Waitcnt
Wait) {
1775 assert(!isNormalMode(MaxCounter));
1781 if (
Wait.DsCnt != ~0u) {
1782 MachineInstr *SWaitInst =
nullptr;
1784 if (
Wait.LoadCnt != ~0u) {
1792 }
else if (
Wait.StoreCnt != ~0u) {
1799 Wait.StoreCnt = ~0
u;
1807 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1808 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1815 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1820 [[maybe_unused]]
auto SWaitInst =
1827 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1828 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1835 unsigned Opc =
MI.getOpcode();
1836 return (
Opc == AMDGPU::S_CBRANCH_VCCNZ ||
Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1837 !
MI.getOperand(1).isUndef();
1865bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &
MI,
1866 WaitcntBrackets &ScoreBrackets,
1867 MachineInstr *OldWaitcntInstr,
1869 setForceEmitWaitcnt();
1873 AMDGPU::Waitcnt
Wait;
1879 if (
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1880 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1881 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1882 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1883 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1890 if (
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1891 MI.getOpcode() == AMDGPU::SI_RETURN ||
1892 MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
1893 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1895 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
false));
1905 else if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
1906 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1907 if (!WCG->isOptNone() &&
1908 (
MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
1909 (
ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1910 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1911 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
1915 else if ((
MI.getOpcode() == AMDGPU::S_SENDMSG ||
1916 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1917 ST->hasLegacyGeometry() &&
1928 if (
MI.modifiesRegister(AMDGPU::EXEC,
TRI)) {
1931 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1932 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1933 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1934 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1941 if (
TII->isAlwaysGDS(
MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
1942 addWait(
Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
1948 Wait = AMDGPU::Waitcnt();
1950 const auto &CallAddrOp = *
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
1951 if (CallAddrOp.isReg()) {
1952 RegInterval CallAddrOpInterval =
1953 ScoreBrackets.getRegInterval(&
MI,
MRI,
TRI, CallAddrOp);
1955 ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
1958 if (
const auto *RtnAddrOp =
1959 TII->getNamedOperand(
MI, AMDGPU::OpName::dst)) {
1960 RegInterval RtnAddrOpInterval =
1961 ScoreBrackets.getRegInterval(&
MI,
MRI,
TRI, *RtnAddrOp);
1963 ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
1967 }
else if (
MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
1968 ScoreBrackets.tryClearSCCWriteEvent(&
MI);
1984 for (
const MachineMemOperand *Memop :
MI.memoperands()) {
1985 const Value *
Ptr = Memop->getValue();
1986 if (Memop->isStore()) {
1987 if (
auto It = SLoadAddresses.
find(
Ptr); It != SLoadAddresses.
end()) {
1988 addWait(
Wait, SmemAccessCounter, 0);
1990 SLoadAddresses.
erase(It);
1993 unsigned AS = Memop->getAddrSpace();
1997 if (
TII->mayWriteLDSThroughDMA(
MI))
2001 unsigned RegNo = FIRST_LDS_VGPR;
2002 if (
Ptr && Memop->getAAInfo()) {
2003 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2004 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E; ++
I) {
2005 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true))
2006 ScoreBrackets.determineWait(LOAD_CNT, RegNo +
I + 1,
Wait);
2009 ScoreBrackets.determineWait(LOAD_CNT, RegNo,
Wait);
2011 if (Memop->isStore()) {
2012 ScoreBrackets.determineWait(EXP_CNT, RegNo,
Wait);
2017 for (
const MachineOperand &
Op :
MI.operands()) {
2022 if (
Op.isTied() &&
Op.isUse() &&
TII->doesNotReadTiedSource(
MI))
2027 const bool IsVGPR =
TRI->isVectorRegister(*
MRI,
Op.getReg());
2034 if (
Op.isImplicit() &&
MI.mayLoadOrStore())
2043 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
2044 ScoreBrackets.hasOtherPendingVmemTypes(
Interval,
2046 ScoreBrackets.hasPointSamplePendingVmemTypes(
MI,
Interval) ||
2047 !
ST->hasVmemWriteVgprInOrder()) {
2049 ScoreBrackets.determineWait(SAMPLE_CNT,
Interval,
Wait);
2051 ScoreBrackets.clearVgprVmemTypes(
Interval);
2054 if (
Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2058 }
else if (
Op.getReg() == AMDGPU::SCC) {
2061 ScoreBrackets.determineWait(SmemAccessCounter,
Interval,
Wait);
2064 if (hasXcnt() &&
Op.isDef())
2082 if (
MI.getOpcode() == AMDGPU::S_BARRIER &&
2083 !
ST->hasAutoWaitcntBeforeBarrier() && !
ST->supportsBackOffBarrier()) {
2084 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
2091 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2097 ScoreBrackets.simplifyWaitcnt(
Wait);
2102 Wait = WCG->getAllZeroWaitcnt(
false);
2104 if (ForceEmitWaitcnt[LOAD_CNT])
2106 if (ForceEmitWaitcnt[EXP_CNT])
2108 if (ForceEmitWaitcnt[DS_CNT])
2110 if (ForceEmitWaitcnt[SAMPLE_CNT])
2112 if (ForceEmitWaitcnt[BVH_CNT])
2114 if (ForceEmitWaitcnt[KM_CNT])
2116 if (ForceEmitWaitcnt[X_CNT])
2120 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2122 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2124 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2131 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
2135bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt
Wait,
2137 MachineBasicBlock &
Block,
2138 WaitcntBrackets &ScoreBrackets,
2139 MachineInstr *OldWaitcntInstr) {
2142 if (OldWaitcntInstr)
2146 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
2150 ScoreBrackets.applyWaitcnt(
Wait);
2153 if (
Wait.ExpCnt != ~0u && It !=
Block.instr_end() &&
2155 MachineOperand *WaitExp =
2156 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2164 <<
"Update Instr: " << *It);
2168 if (
Wait.KmCnt == 0 &&
Wait.XCnt != ~0u &&
2169 !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
2172 if (
Wait.LoadCnt == 0 &&
Wait.XCnt != ~0u &&
2173 !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
2179 if (
Wait.XCnt != ~0u && isVmemAccess(*It))
2182 if (WCG->createNewWaitcnt(
Block, It,
Wait))
2191bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(
const MachineInstr &
MI)
const {
2195 if (!
TII->usesVM_CNT(
MI))
2200 if (
MI.memoperands_empty())
2208 for (
const MachineMemOperand *Memop :
MI.memoperands()) {
2209 unsigned AS = Memop->getAddrSpace();
2220bool SIInsertWaitcnts::mayAccessLDSThroughFlat(
const MachineInstr &
MI)
const {
2224 if (!
TII->usesLGKM_CNT(
MI))
2228 if (
ST->isTgSplitEnabled())
2233 if (
MI.memoperands_empty())
2237 for (
const MachineMemOperand *Memop :
MI.memoperands()) {
2238 unsigned AS = Memop->getAddrSpace();
2246bool SIInsertWaitcnts::isVmemAccess(
const MachineInstr &
MI)
const {
2247 return (
TII->isFLAT(
MI) && mayAccessVMEMThroughFlat(
MI)) ||
2253 return Opc == AMDGPU::GLOBAL_INV ||
Opc == AMDGPU::GLOBAL_WB ||
2254 Opc == AMDGPU::GLOBAL_WBINV;
2260 MachineBasicBlock *
Block)
const {
2261 auto BlockEnd =
Block->getParent()->end();
2262 auto BlockIter =
Block->getIterator();
2266 if (++BlockIter != BlockEnd) {
2267 It = BlockIter->instr_begin();
2274 if (!It->isMetaInstruction())
2282 return It->getOpcode() == AMDGPU::S_ENDPGM;
2286bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2287 MachineBasicBlock &
Block,
2288 WaitcntBrackets &ScoreBrackets) {
2289 AMDGPU::Waitcnt
Wait;
2290 bool NeedsEndPGMCheck =
false;
2298 NeedsEndPGMCheck =
true;
2301 ScoreBrackets.simplifyWaitcnt(
Wait);
2304 bool Result = generateWaitcnt(
Wait, SuccessorIt,
Block, ScoreBrackets,
2307 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &
Block)) {
2315void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2316 WaitcntBrackets *ScoreBrackets) {
2322 bool IsVMEMAccess =
false;
2323 bool IsSMEMAccess =
false;
2324 if (
TII->isDS(Inst) &&
TII->usesLGKM_CNT(Inst)) {
2326 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2327 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, GDS_ACCESS, Inst);
2328 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, GDS_GPR_LOCK, Inst);
2329 ScoreBrackets->setPendingGDS();
2331 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, LDS_ACCESS, Inst);
2333 }
else if (
TII->isFLAT(Inst)) {
2335 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, getVmemWaitEventType(Inst),
2342 int FlatASCount = 0;
2344 if (mayAccessVMEMThroughFlat(Inst)) {
2346 IsVMEMAccess =
true;
2347 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, getVmemWaitEventType(Inst),
2351 if (mayAccessLDSThroughFlat(Inst)) {
2353 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, LDS_ACCESS, Inst);
2359 if (FlatASCount > 1)
2360 ScoreBrackets->setPendingFlat();
2363 IsVMEMAccess =
true;
2364 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, getVmemWaitEventType(Inst),
2367 if (
ST->vmemWriteNeedsExpWaitcnt() &&
2369 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, VMW_GPR_LOCK, Inst);
2371 }
else if (
TII->isSMRD(Inst)) {
2372 IsSMEMAccess =
true;
2373 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SMEM_ACCESS, Inst);
2374 }
else if (Inst.
isCall()) {
2377 ScoreBrackets->applyWaitcnt(
2378 WCG->getAllZeroWaitcnt(
false));
2379 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2382 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2385 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_LDS_ACCESS, Inst);
2386 }
else if (
TII->isVINTERP(Inst)) {
2387 int64_t
Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2388 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2390 unsigned Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2392 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_PARAM_ACCESS, Inst);
2394 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_POS_ACCESS, Inst);
2396 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_GPR_LOCK, Inst);
2397 }
else if (asynchronouslyWritesSCC(Inst.
getOpcode())) {
2398 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SCC_WRITE, Inst);
2401 case AMDGPU::S_SENDMSG:
2402 case AMDGPU::S_SENDMSG_RTN_B32:
2403 case AMDGPU::S_SENDMSG_RTN_B64:
2404 case AMDGPU::S_SENDMSGHALT:
2405 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SQ_MESSAGE, Inst);
2407 case AMDGPU::S_MEMTIME:
2408 case AMDGPU::S_MEMREALTIME:
2409 case AMDGPU::S_GET_BARRIER_STATE_M0:
2410 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2411 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SMEM_ACCESS, Inst);
2420 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, VMEM_GROUP, Inst);
2423 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SMEM_GROUP, Inst);
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
}
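// The shift normalizes both blocks' scores onto a common upper bound before
// taking the max: anything at or below the old lower bound counts as
// "already complete" (0). The return value reports whether the merged-in
// predecessor strictly raised this score, which is what marks a successor
// block as dirty and in need of another visit in the main run() loop.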
2440bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
2441 bool StrictDom =
false;
2443 VgprUB = std::max(VgprUB,
Other.VgprUB);
2444 SgprUB = std::max(SgprUB,
Other.SgprUB);
2446 for (
auto T : inst_counter_types(
Context->MaxCounter)) {
2448 const unsigned *WaitEventMaskForInst =
Context->WaitEventMaskForInst;
2449 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[
T];
2450 const unsigned OtherEvents =
Other.PendingEvents & WaitEventMaskForInst[
T];
2451 if (OtherEvents & ~OldEvents)
2453 PendingEvents |= OtherEvents;
2456 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
2457 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
2458 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
2459 if (NewUB < ScoreLBs[
T])
2463 M.OldLB = ScoreLBs[
T];
2464 M.OtherLB =
Other.ScoreLBs[
T];
2465 M.MyShift = NewUB - ScoreUBs[
T];
2466 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
2468 ScoreUBs[
T] = NewUB;
2470 StrictDom |= mergeScore(M, LastFlat[
T],
Other.LastFlat[
T]);
2473 StrictDom |= mergeScore(M, LastGDS,
Other.LastGDS);
2476 StrictDom |= mergeScore(M, SCCScore,
Other.SCCScore);
2477 if (
Other.hasPendingEvent(SCC_WRITE)) {
2478 unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2479 if (!OldEventsHasSCCWrite) {
2480 PendingSCCWrite =
Other.PendingSCCWrite;
2482 if (PendingSCCWrite !=
Other.PendingSCCWrite)
2483 PendingSCCWrite =
nullptr;
2488 for (
int J = 0; J <= VgprUB; J++)
2489 StrictDom |= mergeScore(M, VgprScores[
T][J],
Other.VgprScores[
T][J]);
2491 if (isSmemCounter(
T)) {
2492 unsigned Idx = getSgprScoresIdx(
T);
2493 for (
int J = 0; J <= SgprUB; J++)
2495 mergeScore(M, SgprScores[Idx][J],
Other.SgprScores[Idx][J]);
2499 for (
int J = 0; J <= VgprUB; J++) {
2500 unsigned char NewVmemTypes = VgprVmemTypes[J] |
Other.VgprVmemTypes[J];
2501 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2502 VgprVmemTypes[J] = NewVmemTypes;
  return Opcode == AMDGPU::S_WAITCNT ||
         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
         Opcode == AMDGPU::S_WAITCNT_lds_direct ||

bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) {
  return Opcode == AMDGPU::S_BARRIER_LEAVE ||
         Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
         Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
}
2526bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2527 MachineBasicBlock &
Block,
2528 WaitcntBrackets &ScoreBrackets) {
2532 dbgs() <<
"*** Begin Block: ";
2534 ScoreBrackets.dump();
2540 bool VCCZCorrect =
true;
2541 if (
ST->hasReadVCCZBug()) {
2544 VCCZCorrect =
false;
2545 }
else if (!
ST->partialVCCWritesUpdateVCCZ()) {
2548 VCCZCorrect =
false;
2552 MachineInstr *OldWaitcntInstr =
nullptr;
2557 MachineInstr &Inst = *Iter;
2566 if (!OldWaitcntInstr)
2567 OldWaitcntInstr = &Inst;
2572 bool FlushVmCnt =
Block.getFirstTerminator() == Inst &&
2573 isPreheaderToFlush(
Block, ScoreBrackets);
2576 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2578 OldWaitcntInstr =
nullptr;
2581 bool RestoreVCCZ = !VCCZCorrect &&
readsVCCZ(Inst);
2584 if (
ST->hasReadVCCZBug() || !
ST->partialVCCWritesUpdateVCCZ()) {
2588 if (!
ST->partialVCCWritesUpdateVCCZ())
2589 VCCZCorrect =
false;
2598 if (
ST->hasReadVCCZBug() &&
2599 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2602 VCCZCorrect =
false;
2610 if (
TII->isSMRD(Inst)) {
2611 for (
const MachineMemOperand *Memop : Inst.
memoperands()) {
2614 if (!Memop->isInvariant()) {
2615 const Value *
Ptr = Memop->getValue();
2619 if (
ST->hasReadVCCZBug()) {
2621 VCCZCorrect =
false;
2625 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2627 Modified |= insertForcedWaitAfter(Inst,
Block, ScoreBrackets);
2631 ScoreBrackets.dump();
2641 TII->get(
ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2653 AMDGPU::Waitcnt
Wait;
2654 if (
Block.getFirstTerminator() ==
Block.end() &&
2655 isPreheaderToFlush(
Block, ScoreBrackets)) {
2656 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2658 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2660 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2669 dbgs() <<
"*** End Block: ";
2671 ScoreBrackets.dump();
2679bool SIInsertWaitcnts::isPreheaderToFlush(
2680 MachineBasicBlock &
MBB,
const WaitcntBrackets &ScoreBrackets) {
2681 auto [Iterator, IsInserted] = PreheadersToFlush.
try_emplace(&
MBB,
false);
2683 return Iterator->second;
2694 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2695 Iterator->second =
true;
2702bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
2704 return mayAccessVMEMThroughFlat(
MI);
2716bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *
ML,
2717 const WaitcntBrackets &Brackets) {
2718 bool HasVMemLoad =
false;
2719 bool HasVMemStore =
false;
2720 bool UsesVgprLoadedOutside =
false;
2721 DenseSet<Register> VgprUse;
2722 DenseSet<Register> VgprDef;
2724 for (MachineBasicBlock *
MBB :
ML->blocks()) {
2725 for (MachineInstr &
MI : *
MBB) {
2726 if (isVMEMOrFlatVMEM(
MI)) {
2730 HasVMemStore =
true;
2732 for (
const MachineOperand &
Op :
MI.all_uses()) {
2733 if (!
TRI->isVectorRegister(*
MRI,
Op.getReg()))
2745 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2746 Brackets.getScoreLB(LOAD_CNT) ||
2747 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2748 Brackets.getScoreLB(SAMPLE_CNT) ||
2749 Brackets.getRegScore(RegNo, BVH_CNT) >
2750 Brackets.getScoreLB(BVH_CNT)) {
2751 UsesVgprLoadedOutside =
true;
2758 if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad()) {
2759 for (
const MachineOperand &
Op :
MI.all_defs()) {
2772 if (!
ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2774 return HasVMemLoad && UsesVgprLoadedOutside &&
ST->hasVmemWriteVgprInOrder();
2777bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2778 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2780 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2782 if (
auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2783 AA = &AAR->getAAResults();
2785 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2797 if (!SIInsertWaitcnts(MLI, PDT,
AA).
run(MF))
2802 .preserve<AAManager>();
2807 TII = ST->getInstrInfo();
2808 TRI = &
TII->getRegisterInfo();
2814 if (ST->hasExtendedWaitCounts()) {
2815 MaxCounter = NUM_EXTENDED_INST_CNTS;
2816 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2817 WCG = &WCGGFX12Plus;
2819 MaxCounter = NUM_NORMAL_INST_CNTS;
2820 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2824 for (
auto T : inst_counter_types())
2825 ForceEmitWaitcnt[
T] =
false;
2827 WaitEventMaskForInst = WCG->getWaitEventMask();
2829 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2831 if (
ST->hasExtendedWaitCounts()) {
2845 [[maybe_unused]]
unsigned NumVGPRsMax =
2847 [[maybe_unused]]
unsigned NumSGPRsMax =
ST->getAddressableNumSGPRs();
2848 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2849 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2854 MachineBasicBlock &EntryBB = MF.
front();
2865 I !=
E && (
I->isPHI() ||
I->isMetaInstruction()); ++
I)
2868 if (
ST->hasExtendedWaitCounts()) {
2871 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2872 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2875 if (!
ST->hasImageInsts() &&
2876 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2880 TII->get(instrsForExtendedCounterTypes[CT]))
2887 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
this);
2888 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2889 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2896 for (
auto *
MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2899 std::unique_ptr<WaitcntBrackets> Brackets;
2904 for (
auto BII = BlockInfos.
begin(), BIE = BlockInfos.
end(); BII != BIE;
2906 MachineBasicBlock *
MBB = BII->first;
2907 BlockInfo &BI = BII->second;
2913 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2915 *Brackets = *BI.Incoming;
2918 Brackets = std::make_unique<WaitcntBrackets>(
this);
2923 Brackets->~WaitcntBrackets();
2924 new (Brackets.get()) WaitcntBrackets(
this);
2928 Modified |= insertWaitcntInBlock(MF, *
MBB, *Brackets);
2931 if (Brackets->hasPendingEvent()) {
2932 BlockInfo *MoveBracketsToSucc =
nullptr;
2934 auto *SuccBII = BlockInfos.
find(Succ);
2935 BlockInfo &SuccBI = SuccBII->second;
2936 if (!SuccBI.Incoming) {
2937 SuccBI.Dirty =
true;
2938 if (SuccBII <= BII) {
2942 if (!MoveBracketsToSucc) {
2943 MoveBracketsToSucc = &SuccBI;
2945 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2947 }
else if (SuccBI.Incoming->merge(*Brackets)) {
2948 SuccBI.Dirty =
true;
2949 if (SuccBII <= BII) {
2955 if (MoveBracketsToSucc)
2956 MoveBracketsToSucc->Incoming = std::move(Brackets);
2961 if (
ST->hasScalarStores()) {
2962 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2963 bool HaveScalarStores =
false;
2965 for (MachineBasicBlock &
MBB : MF) {
2966 for (MachineInstr &
MI :
MBB) {
2967 if (!HaveScalarStores &&
TII->isScalarStore(
MI))
2968 HaveScalarStores =
true;
2970 if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
2971 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2976 if (HaveScalarStores) {
2985 for (MachineBasicBlock *
MBB : EndPgmBlocks) {
2986 bool SeenDCacheWB =
false;
2990 if (
I->getOpcode() == AMDGPU::S_DCACHE_WB)
2991 SeenDCacheWB =
true;
2992 else if (
TII->isScalarStore(*
I))
2993 SeenDCacheWB =
false;
2996 if ((
I->getOpcode() == AMDGPU::S_ENDPGM ||
2997 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3013 for (MachineInstr *
MI : ReleaseVGPRInsts) {
3015 TII->get(AMDGPU::S_ALLOC_VGPR))
3020 if (!ReleaseVGPRInsts.empty() &&
3021 (MF.getFrameInfo().hasCalls() ||
3022 ST->getOccupancyWithNumVGPRs(
3023 TRI->getNumUsedPhysRegs(*
MRI, AMDGPU::VGPR_32RegClass),
3026 for (MachineInstr *
MI : ReleaseVGPRInsts) {
3027 if (
ST->requiresNopBeforeDeallocVGPRs()) {
3029 TII->get(AMDGPU::S_NOP))
3033 TII->get(AMDGPU::S_SENDMSG))
3039 ReleaseVGPRInsts.clear();
3040 PreheadersToFlush.
clear();
3041 SLoadAddresses.
clear();
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
#define AMDGPU_EVENT_NAME(Name)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static bool readsVCCZ(const MachineInstr &MI)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
Provides some synthesis utilities to produce sequences of values.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
A manager for alias analyses.
bool isEntryFunction() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Represents analyses that only rely on functions' control flow.
static bool isCounterSet(unsigned ID)
static bool shouldExecute(unsigned CounterName)
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
instr_iterator instr_end()
iterator_range< succ_iterator > successors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
Register getReg() const
getReg - Returns the register number.
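Put together with setDesc and getNonSoftWaitcntOpcode above, updating an existing wait rather than creating a new one typically looks like this sketch (II is an assumed existing s_waitcnt-style instruction, NewEnc an assumed replacement encoding already combining old and new counts):

// Sketch: promote a "soft" placeholder wait to the real opcode, then rewrite its immediate.
unsigned Opc = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
if (Opc != II.getOpcode())
  II.setDesc(TII->get(Opc));
MachineOperand &Imm = II.getOperand(0);
Imm.setImm(NewEnc);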
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
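A representative sketch of such a run() entry point under the new pass manager (the pass name and worker call are stand-ins, not this file's actual body):

PreservedAnalyses ExampleWaitcntPass::run(MachineFunction &MF,
                                          MachineFunctionAnalysisManager &MFAM) {
  auto &MLI = MFAM.getResult<MachineLoopAnalysis>(MF); // obtain a required analysis result
  if (!insertWaitsSketch(MF, MLI))                     // hypothetical worker; returns "changed"
    return PreservedAnalyses::all();
  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();                       // inserting waits never changes the CFG
  return PA;
}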
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
unsigned getDynamicVGPRBlockSize() const
bool isDynamicVGPREnabled() const
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
@ ID_DEALLOC_VGPRS_GFX11Plus
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from the given Waitcnt for the given isa Version, and writes the decoded values into Vmcnt, Expcnt and Lgkmcnt.
MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI)
If Reg is a pseudo reg, return the correct hardware register given STI otherwise return Reg.
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
unsigned getStorecntBitMask(const IsaVersion &Version)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getXcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
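A short sketch of how the encode/decode helpers and the per-ISA bit masks fit together (the target name is only an example):

// Sketch: build a legacy s_waitcnt immediate that waits only on vmcnt.
static unsigned waitcntRoundTripSketch() {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion("gfx900");            // example target
  unsigned Enc = AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0,
                                       AMDGPU::getExpcntBitMask(IV),  // max field value = "no wait"
                                       AMDGPU::getLgkmcntBitMask(IV));
  unsigned Vmcnt, Expcnt, Lgkmcnt;
  AMDGPU::decodeWaitcnt(IV, Enc, Vmcnt, Expcnt, Lgkmcnt);             // recovers the three fields
  return Enc;
}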
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
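enum_seq only accepts enums that opt in through enum_iteration_traits (whose is_iterable flag appears at the end of this list); a minimal sketch with a hypothetical counter enum:

#include "llvm/ADT/Sequence.h"
enum ExampleCounter { FIRST_CNT, SECOND_CNT, THIRD_CNT, NUM_CNTS };
namespace llvm {
// Opt the enum into iteration; without this specialization enum_seq() will not compile.
template <> struct enum_iteration_traits<ExampleCounter> {
  static constexpr bool is_iterable = true;
};
} // namespace llvm
static unsigned countCountersSketch() {
  unsigned N = 0;
  for ([[maybe_unused]] ExampleCounter C : llvm::enum_seq(FIRST_CNT, NUM_CNTS))
    ++N;    // visits FIRST_CNT, SECOND_CNT, THIRD_CNT; NUM_CNTS itself is excluded
  return N; // 3
}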
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
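The classic use is deleting instructions while walking a block; a sketch where MBB is an assumed block and isRedundantWait a hypothetical predicate:

// Sketch: early-inc iteration stays valid even though eraseFromParent()
// unlinks the instruction the loop is currently visiting.
for (MachineInstr &MI : llvm::make_early_inc_range(MBB.instrs()))
  if (isRedundantWait(MI))
    MI.eraseFromParent();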
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
CodeGenOptLevel
Code generation optimization level.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
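For example, rounding a size in bits up to a count of 32-bit registers:

unsigned NumDWords = llvm::divideCeil(/*Numerator=*/96u, /*Denominator=*/32u); // == 3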
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.
static constexpr bool is_iterable