LLVM 22.0.0git
SIInsertWaitcnts.cpp
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45
46#define DEBUG_TYPE "si-insert-waitcnts"
47
48DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
49 "Force emit s_waitcnt expcnt(0) instrs");
50DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
51 "Force emit s_waitcnt lgkmcnt(0) instrs");
52DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
53 "Force emit s_waitcnt vmcnt(0) instrs");
54
55static cl::opt<bool>
56 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
57 cl::desc("Force all waitcnt instrs to be emitted as "
58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
59 cl::init(false), cl::Hidden);
60
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc("Force all waitcnt load counters to wait until 0"),
64 cl::init(false), cl::Hidden);
65
66namespace {
67// Class of object that encapsulates the latest instruction counter score
68// associated with an operand. Used for determining whether an
69// s_waitcnt instruction needs to be emitted.
70
71enum InstCounterType {
72 LOAD_CNT = 0, // VMcnt prior to gfx12.
73 DS_CNT, // LKGMcnt prior to gfx12.
74 EXP_CNT, //
75 STORE_CNT, // VScnt in gfx10/gfx11.
76 NUM_NORMAL_INST_CNTS,
77 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
78 BVH_CNT, // gfx12+ only.
79 KM_CNT, // gfx12+ only.
80 X_CNT, // gfx1250.
81 NUM_EXTENDED_INST_CNTS,
82 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
83};
84} // namespace
85
86namespace llvm {
87template <> struct enum_iteration_traits<InstCounterType> {
88 static constexpr bool is_iterable = true;
89};
90} // namespace llvm
91
92namespace {
93// Return an iterator over all counters between LOAD_CNT (the first counter)
94// and \c MaxCounter (exclusive, default value yields an enumeration over
95// all counters).
96auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
97 return enum_seq(LOAD_CNT, MaxCounter);
98}
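// Illustrative usage (not part of the original source): with the default
// argument this yields every counter, e.g.
//   for (InstCounterType T : inst_counter_types(NUM_NORMAL_INST_CNTS))
//     ;  // visits LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT in that order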
99
100using RegInterval = std::pair<int, int>;
101
102struct HardwareLimits {
103 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
104 unsigned ExpcntMax;
105 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
106 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
107 unsigned SamplecntMax; // gfx12+ only.
108 unsigned BvhcntMax; // gfx12+ only.
109 unsigned KmcntMax; // gfx12+ only.
110 unsigned XcntMax; // gfx1250.
111};
112
113#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
114 DECL(VMEM_ACCESS) /* vmem read & write */ \
115 DECL(VMEM_READ_ACCESS) /* vmem read */ \
116 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
117 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
118 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
119 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
120 DECL(VMEM_GROUP) /* vmem group */ \
121 DECL(LDS_ACCESS) /* lds read & write */ \
122 DECL(GDS_ACCESS) /* gds read & write */ \
123 DECL(SQ_MESSAGE) /* send message */ \
124 DECL(SCC_WRITE) /* write to SCC from barrier */ \
125 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
126 DECL(SMEM_GROUP) /* scalar-memory group */ \
127 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
128 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
129 DECL(EXP_POS_ACCESS) /* write to export position */ \
130 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
131 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
132 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */
133
134// clang-format off
135#define AMDGPU_EVENT_ENUM(Name) Name,
136enum WaitEventType {
137 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
138 NUM_WAIT_EVENTS
139};
140#undef AMDGPU_EVENT_ENUM
141
142#define AMDGPU_EVENT_NAME(Name) #Name,
143static constexpr StringLiteral WaitEventTypeName[] = {
145};
146#undef AMDGPU_EVENT_NAME
147// clang-format on
148
149// The mapping is:
150// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
151// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
152// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
153// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC
154// We reserve a fixed number of VGPR slots in the scoring tables for
155// special tokens like SCMEM_LDS (needed for buffer load to LDS).
156enum RegisterMapping {
157 SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets.
158 AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets.
159 SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets.
160 // Artificial register slots to track LDS writes into specific LDS locations
161 // if a location is known. When slots are exhausted or location is
162 // unknown use the first slot. The first slot is also always updated in
163 // addition to known location's slot to properly generate waits if dependent
164 // instruction's location is unknown.
165 FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
166 NUM_LDS_VGPRS = 9, // One more than the stores we track.
167 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
168 NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
169 // Remaining non-allocatable registers
170 SCC = NUM_ALL_ALLOCATABLE
171};
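// Illustrative layout (derived from the constants above, not part of the
// original source): real VGPR slots occupy [0, 2048), the extra LDS tracking
// slots occupy [2048, 2057), SGPR slots occupy [2057, 2185), and SCC is the
// single slot 2185 (== NUM_ALL_ALLOCATABLE).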
172
173// Enumerate different types of result-returning VMEM operations. Although
174// s_waitcnt orders them all with a single vmcnt counter, in the absence of
175// s_waitcnt only instructions of the same VmemType are guaranteed to write
176// their results in order -- so there is no need to insert an s_waitcnt between
177// two instructions of the same type that write the same vgpr.
178enum VmemType {
179 // BUF instructions and MIMG instructions without a sampler.
180 VMEM_NOSAMPLER,
181 // MIMG instructions with a sampler.
182 VMEM_SAMPLER,
183 // BVH instructions
184 VMEM_BVH,
185 NUM_VMEM_TYPES
186};
187
188// Maps values of InstCounterType to the instruction that waits on that
189// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
190// returns true.
191static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
192 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
193 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
194 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
195
196static bool updateVMCntOnly(const MachineInstr &Inst) {
197 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
198 SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
199}
200
201#ifndef NDEBUG
202static bool isNormalMode(InstCounterType MaxCounter) {
203 return MaxCounter == NUM_NORMAL_INST_CNTS;
204}
205#endif // NDEBUG
206
207VmemType getVmemType(const MachineInstr &Inst) {
208 assert(updateVMCntOnly(Inst));
209 if (!SIInstrInfo::isImage(Inst))
210 return VMEM_NOSAMPLER;
211 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
212 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
213 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
214
215 if (BaseInfo->BVH)
216 return VMEM_BVH;
217
218 // We have to make an additional check for isVSAMPLE here since some
219 // instructions don't have a sampler, but are still classified as sampler
220 // instructions for the purposes of e.g. waitcnt.
221 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
222 return VMEM_SAMPLER;
223
224 return VMEM_NOSAMPLER;
225}
226
227unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
228 switch (T) {
229 case LOAD_CNT:
230 return Wait.LoadCnt;
231 case EXP_CNT:
232 return Wait.ExpCnt;
233 case DS_CNT:
234 return Wait.DsCnt;
235 case STORE_CNT:
236 return Wait.StoreCnt;
237 case SAMPLE_CNT:
238 return Wait.SampleCnt;
239 case BVH_CNT:
240 return Wait.BvhCnt;
241 case KM_CNT:
242 return Wait.KmCnt;
243 case X_CNT:
244 return Wait.XCnt;
245 default:
246 llvm_unreachable("bad InstCounterType");
247 }
248}
249
250void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
251 unsigned &WC = getCounterRef(Wait, T);
252 WC = std::min(WC, Count);
253}
254
255void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
256 getCounterRef(Wait, T) = ~0u;
257}
258
259unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
260 return getCounterRef(Wait, T);
261}
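// Illustrative example (not part of the original source), assuming
// AMDGPU::Waitcnt default-initializes every counter to ~0u ("no wait"):
//   AMDGPU::Waitcnt W;
//   addWait(W, LOAD_CNT, 3);  // W.LoadCnt == 3
//   addWait(W, LOAD_CNT, 1);  // W.LoadCnt == 1, the stricter (smaller) count
//   setNoWait(W, LOAD_CNT);   // W.LoadCnt == ~0u again, no wait emitted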
262
263// Mapping from event to counter according to the table masks.
264InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
265 for (auto T : inst_counter_types()) {
266 if (masks[T] & (1 << E))
267 return T;
268 }
269 llvm_unreachable("event type has no associated counter");
270}
271
272class WaitcntBrackets;
273
274// This abstracts the logic for generating and updating S_WAIT* instructions
275// away from the analysis that determines where they are needed. This was
276// done because the set of counters and instructions for waiting on them
277// underwent a major shift with gfx12, sufficiently so that having this
278// abstraction allows the main analysis logic to be simpler than it would
279// otherwise have had to become.
280class WaitcntGenerator {
281protected:
282 const GCNSubtarget *ST = nullptr;
283 const SIInstrInfo *TII = nullptr;
284 AMDGPU::IsaVersion IV;
285 InstCounterType MaxCounter;
286 bool OptNone;
287
288public:
289 WaitcntGenerator() = default;
290 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
291 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
292 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
293 OptNone(MF.getFunction().hasOptNone() ||
294 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
295
296 // Return true if the current function should be compiled with no
297 // optimization.
298 bool isOptNone() const { return OptNone; }
299
300 // Edits an existing sequence of wait count instructions according
301 // to an incoming Waitcnt value, which is itself updated to reflect
302 // any new wait count instructions which may need to be generated by
303 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
304 // were made.
305 //
306 // This editing will usually merely update operands, but it may also
307 // delete instructions if the incoming Wait value indicates they are not
308 // needed. It may also remove existing instructions for which a wait
309 // is needed if it can be determined that it is better to generate new
310 // instructions later, as can happen on gfx12.
311 virtual bool
312 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
313 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
314 MachineBasicBlock::instr_iterator It) const = 0;
315
316 // Transform a soft waitcnt into a normal one.
317 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
318
319 // Generates new wait count instructions according to the value of
320 // Wait, returning true if any new instructions were created.
321 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
322 MachineBasicBlock::instr_iterator It,
323 AMDGPU::Waitcnt Wait) = 0;
324
325 // Returns an array of bit masks which can be used to map values in
326 // WaitEventType to corresponding counter values in InstCounterType.
327 virtual const unsigned *getWaitEventMask() const = 0;
328
329 // Returns a new waitcnt with all counters except VScnt set to 0. If
330 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
331 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
332
333 virtual ~WaitcntGenerator() = default;
334
335 // Create a mask value from the initializer list of wait event types.
336 static constexpr unsigned
337 eventMask(std::initializer_list<WaitEventType> Events) {
338 unsigned Mask = 0;
339 for (auto &E : Events)
340 Mask |= 1 << E;
341
342 return Mask;
343 }
344};
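// Illustrative example (not part of the original source): eventMask() simply
// ORs the event bits together, so
//   eventMask({LDS_ACCESS, GDS_ACCESS})
// equals (1u << LDS_ACCESS) | (1u << GDS_ACCESS); the per-counter arrays
// returned by getWaitEventMask() below are built from such masks.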
345
346class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
347public:
348 WaitcntGeneratorPreGFX12() = default;
349 WaitcntGeneratorPreGFX12(const MachineFunction &MF)
350 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
351
352 bool
353 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
354 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
355 MachineBasicBlock::instr_iterator It) const override;
356
357 bool createNewWaitcnt(MachineBasicBlock &Block,
358 MachineBasicBlock::instr_iterator It,
359 AMDGPU::Waitcnt Wait) override;
360
361 const unsigned *getWaitEventMask() const override {
362 assert(ST);
363
364 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
365 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
366 VMEM_BVH_READ_ACCESS}),
367 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
368 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
369 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
370 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
371 0,
372 0,
373 0,
374 0};
375
376 return WaitEventMaskForInstPreGFX12;
377 }
378
379 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
380};
381
382class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
383public:
384 WaitcntGeneratorGFX12Plus() = default;
385 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
386 InstCounterType MaxCounter)
387 : WaitcntGenerator(MF, MaxCounter) {}
388
389 bool
390 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
391 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
392 MachineBasicBlock::instr_iterator It) const override;
393
394 bool createNewWaitcnt(MachineBasicBlock &Block,
395 MachineBasicBlock::instr_iterator It,
396 AMDGPU::Waitcnt Wait) override;
397
398 const unsigned *getWaitEventMask() const override {
399 assert(ST);
400
401 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
402 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
403 eventMask({LDS_ACCESS, GDS_ACCESS}),
404 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
405 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
406 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
407 eventMask({VMEM_SAMPLER_READ_ACCESS}),
408 eventMask({VMEM_BVH_READ_ACCESS}),
409 eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
410 eventMask({VMEM_GROUP, SMEM_GROUP})};
411
412 return WaitEventMaskForInstGFX12Plus;
413 }
414
415 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
416};
417
418class SIInsertWaitcnts {
419public:
420 const GCNSubtarget *ST;
421 InstCounterType SmemAccessCounter;
422 InstCounterType MaxCounter;
423 const unsigned *WaitEventMaskForInst;
424
425private:
426 const SIInstrInfo *TII = nullptr;
427 const SIRegisterInfo *TRI = nullptr;
428 const MachineRegisterInfo *MRI = nullptr;
429
430 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
431 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
432 MachineLoopInfo *MLI;
433 MachinePostDominatorTree *PDT;
434 AliasAnalysis *AA = nullptr;
435
436 struct BlockInfo {
437 std::unique_ptr<WaitcntBrackets> Incoming;
438 bool Dirty = true;
439 };
440
441 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
442
443 bool ForceEmitWaitcnt[NUM_INST_CNTS];
444
445 // In any given run of this pass, WCG will point to one of these two
446 // generator objects, which must have been re-initialised before use
447 // from a value made using a subtarget constructor.
448 WaitcntGeneratorPreGFX12 WCGPreGFX12;
449 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
450
451 WaitcntGenerator *WCG = nullptr;
452
453 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
454 // message.
455 DenseSet<MachineInstr *> ReleaseVGPRInsts;
456
457 HardwareLimits Limits;
458
459public:
460 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
461 AliasAnalysis *AA)
462 : MLI(MLI), PDT(PDT), AA(AA) {
463 (void)ForceExpCounter;
464 (void)ForceLgkmCounter;
465 (void)ForceVMCounter;
466 }
467
468 unsigned getWaitCountMax(InstCounterType T) const {
469 switch (T) {
470 case LOAD_CNT:
471 return Limits.LoadcntMax;
472 case DS_CNT:
473 return Limits.DscntMax;
474 case EXP_CNT:
475 return Limits.ExpcntMax;
476 case STORE_CNT:
477 return Limits.StorecntMax;
478 case SAMPLE_CNT:
479 return Limits.SamplecntMax;
480 case BVH_CNT:
481 return Limits.BvhcntMax;
482 case KM_CNT:
483 return Limits.KmcntMax;
484 case X_CNT:
485 return Limits.XcntMax;
486 default:
487 break;
488 }
489 return 0;
490 }
491
492 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
493 bool isPreheaderToFlush(MachineBasicBlock &MBB,
494 const WaitcntBrackets &ScoreBrackets);
495 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
496 bool run(MachineFunction &MF);
497
498 bool isForceEmitWaitcnt() const {
499 for (auto T : inst_counter_types())
500 if (ForceEmitWaitcnt[T])
501 return true;
502 return false;
503 }
504
505 void setForceEmitWaitcnt() {
506// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
507// For debug builds, get the debug counter info and adjust if need be
508#ifndef NDEBUG
509 if (DebugCounter::isCounterSet(ForceExpCounter) &&
510 DebugCounter::shouldExecute(ForceExpCounter)) {
511 ForceEmitWaitcnt[EXP_CNT] = true;
512 } else {
513 ForceEmitWaitcnt[EXP_CNT] = false;
514 }
515
516 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
517 DebugCounter::shouldExecute(ForceLgkmCounter)) {
518 ForceEmitWaitcnt[DS_CNT] = true;
519 ForceEmitWaitcnt[KM_CNT] = true;
520 } else {
521 ForceEmitWaitcnt[DS_CNT] = false;
522 ForceEmitWaitcnt[KM_CNT] = false;
523 }
524
525 if (DebugCounter::isCounterSet(ForceVMCounter) &&
526 DebugCounter::shouldExecute(ForceVMCounter)) {
527 ForceEmitWaitcnt[LOAD_CNT] = true;
528 ForceEmitWaitcnt[SAMPLE_CNT] = true;
529 ForceEmitWaitcnt[BVH_CNT] = true;
530 } else {
531 ForceEmitWaitcnt[LOAD_CNT] = false;
532 ForceEmitWaitcnt[SAMPLE_CNT] = false;
533 ForceEmitWaitcnt[BVH_CNT] = false;
534 }
535#endif // NDEBUG
536 }
537
538 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
539 // instruction.
540 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
541 switch (Inst.getOpcode()) {
542 case AMDGPU::GLOBAL_INV:
543 return VMEM_READ_ACCESS; // tracked using loadcnt
544 case AMDGPU::GLOBAL_WB:
545 case AMDGPU::GLOBAL_WBINV:
546 return VMEM_WRITE_ACCESS; // tracked using storecnt
547 default:
548 break;
549 }
550
551 // Maps VMEM access types to their corresponding WaitEventType.
552 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
553 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
554
556 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
557 // these should use VM_CNT.
558 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
559 return VMEM_ACCESS;
560 if (Inst.mayStore() &&
561 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
562 // FLAT and SCRATCH instructions may access scratch. Other VMEM
563 // instructions do not.
564 if (TII->mayAccessScratchThroughFlat(Inst))
565 return SCRATCH_WRITE_ACCESS;
566 return VMEM_WRITE_ACCESS;
567 }
568 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
569 return VMEM_READ_ACCESS;
570 return VmemReadMapping[getVmemType(Inst)];
571 }
572
573 bool hasXcnt() const { return ST->hasWaitXCnt(); }
574
575 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
576 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
577 bool isVmemAccess(const MachineInstr &MI) const;
578 bool generateWaitcntInstBefore(MachineInstr &MI,
579 WaitcntBrackets &ScoreBrackets,
580 MachineInstr *OldWaitcntInstr,
581 bool FlushVmCnt);
582 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
583 MachineBasicBlock::instr_iterator It,
584 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
585 MachineInstr *OldWaitcntInstr);
586 void updateEventWaitcntAfter(MachineInstr &Inst,
587 WaitcntBrackets *ScoreBrackets);
588 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
589 MachineBasicBlock *Block) const;
590 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
591 WaitcntBrackets &ScoreBrackets);
592 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
593 WaitcntBrackets &ScoreBrackets);
594 static bool asynchronouslyWritesSCC(unsigned Opcode);
595};
596
597// This object maintains the current score brackets of each wait counter, and
598// a per-register scoreboard for each wait counter.
599//
600// We also maintain the latest score for every event type that can change the
601// waitcnt in order to know if there are multiple types of events within
602 // the brackets. When multiple types of event happen within the brackets,
603 // the wait count may get decremented out of order, so we need to insert an
604 // "s_waitcnt 0" before use.
605class WaitcntBrackets {
606public:
607 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}
608
609 bool isSmemCounter(InstCounterType T) const {
610 return T == Context->SmemAccessCounter || T == X_CNT;
611 }
612
613 unsigned getSgprScoresIdx(InstCounterType T) const {
614 assert(isSmemCounter(T) && "Invalid SMEM counter");
615 return T == X_CNT ? 1 : 0;
616 }
617
618 unsigned getScoreLB(InstCounterType T) const {
619 assert(T < NUM_INST_CNTS);
620 return ScoreLBs[T];
621 }
622
623 unsigned getScoreUB(InstCounterType T) const {
624 assert(T < NUM_INST_CNTS);
625 return ScoreUBs[T];
626 }
627
628 unsigned getScoreRange(InstCounterType T) const {
629 return getScoreUB(T) - getScoreLB(T);
630 }
631
632 unsigned getRegScore(int GprNo, InstCounterType T) const {
633 if (GprNo < NUM_ALL_VGPRS)
634 return VgprScores[T][GprNo];
635
636 if (GprNo < NUM_ALL_ALLOCATABLE)
637 return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
638
639 assert(GprNo == SCC);
640 return SCCScore;
641 }
642
643 bool merge(const WaitcntBrackets &Other);
644
645 RegInterval getRegInterval(const MachineInstr *MI,
646 const MachineRegisterInfo *MRI,
647 const SIRegisterInfo *TRI,
648 const MachineOperand &Op) const;
649
650 bool counterOutOfOrder(InstCounterType T) const;
651 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
652 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
653
654 void determineWait(InstCounterType T, RegInterval Interval,
655 AMDGPU::Waitcnt &Wait) const;
656 void determineWait(InstCounterType T, int RegNo,
657 AMDGPU::Waitcnt &Wait) const {
658 determineWait(T, {RegNo, RegNo + 1}, Wait);
659 }
660 void tryClearSCCWriteEvent(MachineInstr *Inst);
661
662 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
663 void applyWaitcnt(InstCounterType T, unsigned Count);
664 void applyXcnt(const AMDGPU::Waitcnt &Wait);
665 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
666 const MachineRegisterInfo *MRI, WaitEventType E,
667 MachineInstr &MI);
668
669 unsigned hasPendingEvent() const { return PendingEvents; }
670 unsigned hasPendingEvent(WaitEventType E) const {
671 return PendingEvents & (1 << E);
672 }
673 unsigned hasPendingEvent(InstCounterType T) const {
674 unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
675 assert((HasPending != 0) == (getScoreRange(T) != 0));
676 return HasPending;
677 }
678
679 bool hasMixedPendingEvents(InstCounterType T) const {
680 unsigned Events = hasPendingEvent(T);
681 // Return true if more than one bit is set in Events.
682 return Events & (Events - 1);
683 }
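  // Illustrative note (not part of the original source): the expression above
  // is the usual "clear lowest set bit" trick, e.g. for Events == 0b0100 it
  // yields 0 (a single event type), while for Events == 0b0110 it yields
  // 0b0100 != 0 (mixed event types).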
684
685 bool hasPendingFlat() const {
686 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
687 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
688 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
689 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
690 }
691
692 void setPendingFlat() {
693 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
694 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
695 }
696
697 bool hasPendingGDS() const {
698 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
699 }
700
701 unsigned getPendingGDSWait() const {
702 return std::min(getScoreUB(DS_CNT) - LastGDS,
703 Context->getWaitCountMax(DS_CNT) - 1);
704 }
705
706 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
707
708 // Return true if there might be pending writes to the vgpr-interval by VMEM
709 // instructions with types different from V.
710 bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
711 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
712 assert(RegNo < NUM_ALL_VGPRS);
713 if (VgprVmemTypes[RegNo] & ~(1 << V))
714 return true;
715 }
716 return false;
717 }
718
719 void clearVgprVmemTypes(RegInterval Interval) {
720 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
721 assert(RegNo < NUM_ALL_VGPRS);
722 VgprVmemTypes[RegNo] = 0;
723 }
724 }
725
726 void setStateOnFunctionEntryOrReturn() {
727 setScoreUB(STORE_CNT,
728 getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
729 PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
730 }
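  // Illustrative note (not part of the original source): bumping the
  // STORE_CNT upper bound by the full counter width conservatively models a
  // caller or callee that may still have the hardware-maximum number of
  // stores outstanding, so storecnt waits get emitted before dependent
  // accesses.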
731
732 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
733 return LDSDMAStores;
734 }
735
736 bool hasPointSampleAccel(const MachineInstr &MI) const;
737 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
738 RegInterval Interval) const;
739
740 void print(raw_ostream &) const;
741 void dump() const { print(dbgs()); }
742
743private:
744 struct MergeInfo {
745 unsigned OldLB;
746 unsigned OtherLB;
747 unsigned MyShift;
748 unsigned OtherShift;
749 };
750 static bool mergeScore(const MergeInfo &M, unsigned &Score,
751 unsigned OtherScore);
752
753 void setScoreLB(InstCounterType T, unsigned Val) {
754 assert(T < NUM_INST_CNTS);
755 ScoreLBs[T] = Val;
756 }
757
758 void setScoreUB(InstCounterType T, unsigned Val) {
759 assert(T < NUM_INST_CNTS);
760 ScoreUBs[T] = Val;
761
762 if (T != EXP_CNT)
763 return;
764
765 if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
766 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
767 }
768
769 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
770 setScoreByInterval({GprNo, GprNo + 1}, T, Val);
771 }
772
773 void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
774 unsigned Score);
775
776 void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
777 const MachineRegisterInfo *MRI,
778 const MachineOperand &Op, InstCounterType CntTy,
779 unsigned Val);
780
781 const SIInsertWaitcnts *Context;
782
783 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
784 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
785 unsigned PendingEvents = 0;
786 // Remember the last flat memory operation.
787 unsigned LastFlat[NUM_INST_CNTS] = {0};
788 // Remember the last GDS operation.
789 unsigned LastGDS = 0;
790 // wait_cnt scores for every vgpr.
791 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
792 int VgprUB = -1;
793 int SgprUB = -1;
794 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
795 // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
796 // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
797 // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
798 // X_CNT score.
799 unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
800 // Reg score for SCC.
801 unsigned SCCScore = 0;
802 // The unique instruction that has an SCC write pending, if there is one.
803 const MachineInstr *PendingSCCWrite = nullptr;
804 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
805 // write to each vgpr.
806 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
807 // Store representative LDS DMA operations. The only useful info here is
808 // alias info. One store is kept per unique AAInfo.
809 SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
810};
811
812class SIInsertWaitcntsLegacy : public MachineFunctionPass {
813public:
814 static char ID;
815 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
816
817 bool runOnMachineFunction(MachineFunction &MF) override;
818
819 StringRef getPassName() const override {
820 return "SI insert wait instructions";
821 }
822
823 void getAnalysisUsage(AnalysisUsage &AU) const override {
824 AU.setPreservesCFG();
825 AU.addRequired<MachineLoopInfoWrapperPass>();
826 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
827 AU.addUsedIfAvailable<AAResultsWrapperPass>();
828 AU.addPreserved<AAResultsWrapperPass>();
829 MachineFunctionPass::getAnalysisUsage(AU);
830 }
831};
832
833} // end anonymous namespace
834
835RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
836 const MachineRegisterInfo *MRI,
837 const SIRegisterInfo *TRI,
838 const MachineOperand &Op) const {
839 if (Op.getReg() == AMDGPU::SCC)
840 return {SCC, SCC + 1};
841
842 if (!TRI->isInAllocatableClass(Op.getReg()))
843 return {-1, -1};
844
845 // A use via a PW operand does not need a waitcnt.
846 // A partial write is not a WAW.
847 assert(!Op.getSubReg() || !Op.isUndef());
848
849 RegInterval Result;
850
851 MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
852 unsigned RegIdx = TRI->getHWRegIndex(MCReg);
853
854 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
855 unsigned Size = TRI->getRegSizeInBits(*RC);
856
857 // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits
858 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
859 unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0);
860 assert(!Context->ST->hasMAIInsts() || Reg < AGPR_OFFSET);
861 Result.first = Reg;
862 if (TRI->isAGPR(*MRI, Op.getReg()))
863 Result.first += AGPR_OFFSET;
864 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
865 assert(Size % 16 == 0);
866 Result.second = Result.first + (Size / 16);
867
868 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) {
869 // Regardless of which lo16/hi16 is used, consider the full 32-bit
870 // register used.
871 if (AMDGPU::isHi16Reg(MCReg, *TRI))
872 Result.first -= 1;
873 else
874 Result.second += 1;
875 }
876 } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
877 // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
878 // sources like SRC_PRIVATE_BASE.
879 Result.first = RegIdx + NUM_ALL_VGPRS;
880 Result.second = Result.first + divideCeil(Size, 32);
881 } else {
882 return {-1, -1};
883 }
884
885 return Result;
886}
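// Illustrative example (not part of the original source): with 16-bit VGPR
// tracking, a 64-bit VGPR operand at hardware index 4 maps to the interval
// [8, 12), while a 64-bit SGPR pair at index 4 maps to
// [NUM_ALL_VGPRS + 4, NUM_ALL_VGPRS + 6).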
887
888void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
889 InstCounterType CntTy,
890 unsigned Score) {
891 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
892 if (RegNo < NUM_ALL_VGPRS) {
893 VgprUB = std::max(VgprUB, RegNo);
894 VgprScores[CntTy][RegNo] = Score;
895 } else if (RegNo < NUM_ALL_ALLOCATABLE) {
896 SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
897 SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
898 } else {
899 assert(RegNo == SCC);
900 SCCScore = Score;
901 }
902 }
903}
904
905void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
906 const SIRegisterInfo *TRI,
907 const MachineRegisterInfo *MRI,
908 const MachineOperand &Op,
909 InstCounterType CntTy, unsigned Score) {
910 RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
911 setScoreByInterval(Interval, CntTy, Score);
912}
913
914// Return true if the subtarget is one that enables Point Sample Acceleration
915// and the MachineInstr passed in is one to which it might be applied (the
916// hardware makes this decision based on several factors, but we can't determine
917// this at compile time, so we have to assume it might be applied if the
918// instruction supports it).
919bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
920 if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
921 return false;
922
923 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
924 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
925 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
926 return BaseInfo->PointSampleAccel;
927}
928
929// Return true if the subtarget enables Point Sample Acceleration, the supplied
930// MachineInstr is one to which it might be applied and the supplied interval is
931// one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER
932// (this is the type that a point sample accelerated instruction effectively
933// becomes)
934bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
935 const MachineInstr &MI, RegInterval Interval) const {
936 if (!hasPointSampleAccel(MI))
937 return false;
938
939 return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
940}
941
942void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
943 const SIRegisterInfo *TRI,
944 const MachineRegisterInfo *MRI,
945 WaitEventType E, MachineInstr &Inst) {
946 InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
947
948 unsigned UB = getScoreUB(T);
949 unsigned CurrScore = UB + 1;
950 if (CurrScore == 0)
951 report_fatal_error("InsertWaitcnt score wraparound");
952 // PendingEvents and ScoreUB need to be updated regardless of whether this
953 // event changes the score of a register or not.
954 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
955 PendingEvents |= 1 << E;
956 setScoreUB(T, CurrScore);
957
958 if (T == EXP_CNT) {
959 // Put score on the source vgprs. If this is a store, just use those
960 // specific register(s).
961 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
962 // All GDS operations must protect their address register (same as
963 // export.)
964 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
965 setScoreByOperand(&Inst, TRI, MRI, *AddrOp, EXP_CNT, CurrScore);
966
967 if (Inst.mayStore()) {
968 if (const auto *Data0 =
969 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
970 setScoreByOperand(&Inst, TRI, MRI, *Data0, EXP_CNT, CurrScore);
971 if (const auto *Data1 =
972 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
973 setScoreByOperand(&Inst, TRI, MRI, *Data1, EXP_CNT, CurrScore);
974 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
975 Inst.getOpcode() != AMDGPU::DS_APPEND &&
976 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
977 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
978 for (const MachineOperand &Op : Inst.all_uses()) {
979 if (TRI->isVectorRegister(*MRI, Op.getReg()))
980 setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
981 }
982 }
983 } else if (TII->isFLAT(Inst)) {
984 if (Inst.mayStore()) {
985 setScoreByOperand(&Inst, TRI, MRI,
986 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
987 EXP_CNT, CurrScore);
988 } else if (SIInstrInfo::isAtomicRet(Inst)) {
989 setScoreByOperand(&Inst, TRI, MRI,
990 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
991 EXP_CNT, CurrScore);
992 }
993 } else if (TII->isMIMG(Inst)) {
994 if (Inst.mayStore()) {
995 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
996 CurrScore);
997 } else if (SIInstrInfo::isAtomicRet(Inst)) {
998 setScoreByOperand(&Inst, TRI, MRI,
999 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1000 EXP_CNT, CurrScore);
1001 }
1002 } else if (TII->isMTBUF(Inst)) {
1003 if (Inst.mayStore())
1004 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
1005 CurrScore);
1006 } else if (TII->isMUBUF(Inst)) {
1007 if (Inst.mayStore()) {
1008 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
1009 CurrScore);
1010 } else if (SIInstrInfo::isAtomicRet(Inst)) {
1011 setScoreByOperand(&Inst, TRI, MRI,
1012 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1013 EXP_CNT, CurrScore);
1014 }
1015 } else if (TII->isLDSDIR(Inst)) {
1016 // LDSDIR instructions attach the score to the destination.
1017 setScoreByOperand(&Inst, TRI, MRI,
1018 *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1019 EXP_CNT, CurrScore);
1020 } else {
1021 if (TII->isEXP(Inst)) {
1022 // For export the destination registers are really temps that
1023 // can be used as the actual source after export patching, so
1024 // we need to treat them like sources and set the EXP_CNT
1025 // score.
1026 for (MachineOperand &DefMO : Inst.all_defs()) {
1027 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
1028 setScoreByOperand(&Inst, TRI, MRI, DefMO, EXP_CNT, CurrScore);
1029 }
1030 }
1031 }
1032 for (const MachineOperand &Op : Inst.all_uses()) {
1033 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1034 setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
1035 }
1036 }
1037 } else if (T == X_CNT) {
1038 for (const MachineOperand &Op : Inst.all_uses())
1039 setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore);
1040 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1041 // Match the score to the destination registers.
1042 //
1043 // Check only explicit operands. Stores, especially spill stores, include
1044 // implicit uses and defs of their super registers which would create an
1045 // artificial dependency, while these are there only for register liveness
1046 // accounting purposes.
1047 //
1048 // Special cases where implicit register defs exist, such as M0 or VCC,
1049 // but none occur with memory instructions.
1050 for (const MachineOperand &Op : Inst.defs()) {
1051 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
1052 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1053 if (Interval.first >= NUM_ALL_VGPRS)
1054 continue;
1055 if (updateVMCntOnly(Inst)) {
1056 // updateVMCntOnly should only leave us with VGPRs
1057 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1058 // defs. That's required for a sane index into `VgprMemTypes` below
1059 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1060 VmemType V = getVmemType(Inst);
1061 unsigned char TypesMask = 1 << V;
1062 // If instruction can have Point Sample Accel applied, we have to flag
1063 // this with another potential dependency
1064 if (hasPointSampleAccel(Inst))
1065 TypesMask |= 1 << VMEM_NOSAMPLER;
1066 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
1067 VgprVmemTypes[RegNo] |= TypesMask;
1068 }
1069 }
1070 setScoreByInterval(Interval, T, CurrScore);
1071 }
1072 if (Inst.mayStore() &&
1073 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1074 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the data
1075 // written to LDS can be accessed. A load from LDS to VMEM does not need a wait.
1076 unsigned Slot = 0;
1077 for (const auto *MemOp : Inst.memoperands()) {
1078 if (!MemOp->isStore() ||
1079 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1080 continue;
1081 // Comparing just AA info does not guarantee memoperands are equal
1082 // in general, but this is so for LDS DMA in practice.
1083 auto AAI = MemOp->getAAInfo();
1084 // Alias scope information gives a way to definitively identify an
1085 // original memory object and is in practice produced by the module LDS
1086 // lowering pass. If there is no scope available we will not be able
1087 // to disambiguate LDS aliasing because, after module lowering, all LDS
1088 // is squashed into a single big object. Do not attempt to use one of
1089 // the limited LDSDMAStores for something we will not be able to use
1090 // anyway.
1091 if (!AAI || !AAI.Scope)
1092 break;
1093 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1094 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1095 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1096 Slot = I + 1;
1097 break;
1098 }
1099 }
1100 }
1101 if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
1102 break;
1103 LDSDMAStores.push_back(&Inst);
1104 Slot = LDSDMAStores.size();
1105 break;
1106 }
1107 setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
1108 if (Slot)
1109 setRegScore(FIRST_LDS_VGPR, T, CurrScore);
1110 }
1111
1112 if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
1113 setRegScore(SCC, T, CurrScore);
1114 PendingSCCWrite = &Inst;
1115 }
1116 }
1117}
1118
1119void WaitcntBrackets::print(raw_ostream &OS) const {
1120 const GCNSubtarget *ST = Context->ST;
1121
1122 OS << '\n';
1123 for (auto T : inst_counter_types(Context->MaxCounter)) {
1124 unsigned SR = getScoreRange(T);
1125
1126 switch (T) {
1127 case LOAD_CNT:
1128 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1129 << SR << "): ";
1130 break;
1131 case DS_CNT:
1132 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1133 << SR << "): ";
1134 break;
1135 case EXP_CNT:
1136 OS << " EXP_CNT(" << SR << "): ";
1137 break;
1138 case STORE_CNT:
1139 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1140 << SR << "): ";
1141 break;
1142 case SAMPLE_CNT:
1143 OS << " SAMPLE_CNT(" << SR << "): ";
1144 break;
1145 case BVH_CNT:
1146 OS << " BVH_CNT(" << SR << "): ";
1147 break;
1148 case KM_CNT:
1149 OS << " KM_CNT(" << SR << "): ";
1150 break;
1151 case X_CNT:
1152 OS << " X_CNT(" << SR << "): ";
1153 break;
1154 default:
1155 OS << " UNKNOWN(" << SR << "): ";
1156 break;
1157 }
1158
1159 if (SR != 0) {
1160 // Print vgpr scores.
1161 unsigned LB = getScoreLB(T);
1162
1163 for (int J = 0; J <= VgprUB; J++) {
1164 unsigned RegScore = getRegScore(J, T);
1165 if (RegScore <= LB)
1166 continue;
1167 unsigned RelScore = RegScore - LB - 1;
1168 if (J < FIRST_LDS_VGPR) {
1169 OS << RelScore << ":v" << J << " ";
1170 } else {
1171 OS << RelScore << ":ds ";
1172 }
1173 }
1174 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1175 if (isSmemCounter(T)) {
1176 for (int J = 0; J <= SgprUB; J++) {
1177 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
1178 if (RegScore <= LB)
1179 continue;
1180 unsigned RelScore = RegScore - LB - 1;
1181 OS << RelScore << ":s" << J << " ";
1182 }
1183 }
1184 if (T == KM_CNT && SCCScore > 0)
1185 OS << SCCScore << ":scc ";
1186 }
1187 OS << '\n';
1188 }
1189
1190 OS << "Pending Events: ";
1191 if (hasPendingEvent()) {
1192 ListSeparator LS;
1193 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1194 if (hasPendingEvent((WaitEventType)I)) {
1195 OS << LS << WaitEventTypeName[I];
1196 }
1197 }
1198 } else {
1199 OS << "none";
1200 }
1201 OS << '\n';
1202
1203 OS << '\n';
1204}
1205
1206/// Simplify the waitcnt, in the sense of removing redundant counts, and return
1207/// whether a waitcnt instruction is needed at all.
1208void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1209 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1210 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1211 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1212 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1213 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1214 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1215 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1216 simplifyWaitcnt(X_CNT, Wait.XCnt);
1217}
1218
1219void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1220 unsigned &Count) const {
1221 // The number of outstanding events for this type, T, can be calculated
1222 // as (UB - LB). If the current Count is greater than or equal to the number
1223 // of outstanding events, then the wait for this counter is redundant.
1224 if (Count >= getScoreRange(T))
1225 Count = ~0u;
1226}
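// Illustrative example (not part of the original source): if only two
// LOAD_CNT events are outstanding (getScoreRange(LOAD_CNT) == 2), a requested
// wait of loadcnt(2) or larger cannot be stricter than doing nothing, so it
// is reset to ~0u; loadcnt(1) and loadcnt(0) are kept.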
1227
1228void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
1229 AMDGPU::Waitcnt &Wait) const {
1230 const unsigned LB = getScoreLB(T);
1231 const unsigned UB = getScoreUB(T);
1232 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1233 unsigned ScoreToWait = getRegScore(RegNo, T);
1234
1235 // If the score of src_operand falls within the bracket, we need an
1236 // s_waitcnt instruction.
1237 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1238 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1239 !Context->ST->hasFlatLgkmVMemCountInOrder()) {
1240 // If there is a pending FLAT operation, and this is a VMem or LGKM
1241 // waitcnt and the target can report early completion, then we need
1242 // to force a waitcnt 0.
1243 addWait(Wait, T, 0);
1244 } else if (counterOutOfOrder(T)) {
1245 // The counter can get decremented out of order when there are multiple
1246 // event types in the bracket, so emit an s_wait with a conservative
1247 // value of 0 for the counter.
1248 addWait(Wait, T, 0);
1249 } else {
1250 // If a counter has been maxed out avoid overflow by waiting for
1251 // MAX(CounterType) - 1 instead.
1252 unsigned NeededWait =
1253 std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
1254 addWait(Wait, T, NeededWait);
1255 }
1256 }
1257 }
1258}
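// Illustrative example (not part of the original source): if the LOAD_CNT
// bracket is LB == 5, UB == 8 and the register of interest was last written
// at score 6, the in-order case needs UB - ScoreToWait == 2, i.e. a wait that
// still allows the two younger loads to remain outstanding.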
1259
1260void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1261 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1262 // SCC has landed
1263 if (PendingSCCWrite &&
1264 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1265 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1266 unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
1267 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1268 if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
1269 SCC_WRITE_PendingEvent) {
1270 setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1271 }
1272
1273 PendingEvents &= ~SCC_WRITE_PendingEvent;
1274 PendingSCCWrite = nullptr;
1275 }
1276}
1277
1278void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1279 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1280 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1281 applyWaitcnt(DS_CNT, Wait.DsCnt);
1282 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1283 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1284 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1285 applyWaitcnt(KM_CNT, Wait.KmCnt);
1286 applyXcnt(Wait);
1287}
1288
1289void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1290 const unsigned UB = getScoreUB(T);
1291 if (Count >= UB)
1292 return;
1293 if (Count != 0) {
1294 if (counterOutOfOrder(T))
1295 return;
1296 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1297 } else {
1298 setScoreLB(T, UB);
1299 PendingEvents &= ~Context->WaitEventMaskForInst[T];
1300 }
1301}
1302
1303void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
1304 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1305 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1306 // zero.
1307 if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
1308 return applyWaitcnt(X_CNT, 0);
1309
1310 // If we have a pending store we cannot optimize XCnt because we do not
1311 // wait for stores. VMEM loads return in order, so if we only have loads,
1312 // XCnt is decremented to the same number as LOADCnt.
1313 if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1314 !hasPendingEvent(STORE_CNT))
1315 return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1316
1317 applyWaitcnt(X_CNT, Wait.XCnt);
1318}
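// Illustrative example (not part of the original source): if only VMEM loads
// are outstanding (VMEM_GROUP pending, no stores) and we already wait for
// loadcnt(1), the transfer counter is bounded by the same value, so X_CNT is
// applied as xcnt(min(XCnt, 1)) instead of being left untouched.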
1319
1320// Where there are multiple types of event in the bracket of a counter,
1321// the decrement may go out of order.
1322bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1323 // Scalar memory reads can always complete out of order.
1324 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1325 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1326 return true;
1327 return hasMixedPendingEvents(T);
1328}
1329
1330INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1331 false, false)
1334INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1335 false, false)
1336
1337char SIInsertWaitcntsLegacy::ID = 0;
1338
1339char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1340
1342 return new SIInsertWaitcntsLegacy();
1343}
1344
1345static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1346 unsigned NewEnc) {
1347 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1348 assert(OpIdx >= 0);
1349
1350 MachineOperand &MO = MI.getOperand(OpIdx);
1351
1352 if (NewEnc == MO.getImm())
1353 return false;
1354
1355 MO.setImm(NewEnc);
1356 return true;
1357}
1358
1359/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1360/// and if so, which counter it is waiting on.
1361static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1362 switch (Opcode) {
1363 case AMDGPU::S_WAIT_LOADCNT:
1364 return LOAD_CNT;
1365 case AMDGPU::S_WAIT_EXPCNT:
1366 return EXP_CNT;
1367 case AMDGPU::S_WAIT_STORECNT:
1368 return STORE_CNT;
1369 case AMDGPU::S_WAIT_SAMPLECNT:
1370 return SAMPLE_CNT;
1371 case AMDGPU::S_WAIT_BVHCNT:
1372 return BVH_CNT;
1373 case AMDGPU::S_WAIT_DSCNT:
1374 return DS_CNT;
1375 case AMDGPU::S_WAIT_KMCNT:
1376 return KM_CNT;
1377 case AMDGPU::S_WAIT_XCNT:
1378 return X_CNT;
1379 default:
1380 return {};
1381 }
1382}
1383
1384bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1385 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1386 if (Opcode == Waitcnt->getOpcode())
1387 return false;
1388
1389 Waitcnt->setDesc(TII->get(Opcode));
1390 return true;
1391}
1392
1393/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1394/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1395/// from \p Wait that were added by previous passes. Currently this pass
1396/// conservatively assumes that these preexisting waits are required for
1397/// correctness.
1398bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1399 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1400 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1401 assert(ST);
1402 assert(isNormalMode(MaxCounter));
1403
1404 bool Modified = false;
1405 MachineInstr *WaitcntInstr = nullptr;
1406 MachineInstr *WaitcntVsCntInstr = nullptr;
1407
1408 LLVM_DEBUG({
1409 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1410 if (It == OldWaitcntInstr.getParent()->instr_end())
1411 dbgs() << "end of block\n";
1412 else
1413 dbgs() << *It;
1414 });
1415
1416 for (auto &II :
1417 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1418 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1419 if (II.isMetaInstruction()) {
1420 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1421 continue;
1422 }
1423
1424 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1425 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1426
1427 // Update required wait count. If this is a soft waitcnt (= it was added
1428 // by an earlier pass), it may be entirely removed.
1429 if (Opcode == AMDGPU::S_WAITCNT) {
1430 unsigned IEnc = II.getOperand(0).getImm();
1431 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1432 if (TrySimplify)
1433 ScoreBrackets.simplifyWaitcnt(OldWait);
1434 Wait = Wait.combined(OldWait);
1435
1436 // Merge consecutive waitcnt of the same type by erasing multiples.
1437 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1438 II.eraseFromParent();
1439 Modified = true;
1440 } else
1441 WaitcntInstr = &II;
1442 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1443 assert(ST->hasVMemToLDSLoad());
1444 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1445 << "Before: " << Wait.LoadCnt << '\n';);
1446 ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
1447 LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
1448
1449 // It is possible (but unlikely) that this is the only wait instruction,
1450 // in which case, we exit this loop without a WaitcntInstr to consume
1451 // `Wait`. But that works because `Wait` was passed in by reference, and
1452 // the callee eventually calls createNewWaitcnt on it. We test this
1453 // possibility in an artificial MIR test since such a situation cannot be
1454 // recreated by running the memory legalizer.
1455 II.eraseFromParent();
1456 } else {
1457 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1458 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1459
1460 unsigned OldVSCnt =
1461 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1462 if (TrySimplify)
1463 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1464 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1465
1466 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1467 II.eraseFromParent();
1468 Modified = true;
1469 } else
1470 WaitcntVsCntInstr = &II;
1471 }
1472 }
1473
1474 if (WaitcntInstr) {
1475 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1476 AMDGPU::encodeWaitcnt(IV, Wait));
1477 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1478
1479 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1480 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1481 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1482 Wait.LoadCnt = ~0u;
1483 Wait.ExpCnt = ~0u;
1484 Wait.DsCnt = ~0u;
1485
1486 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1487 ? dbgs()
1488 << "applied pre-existing waitcnt\n"
1489 << "New Instr at block end: " << *WaitcntInstr << '\n'
1490 : dbgs() << "applied pre-existing waitcnt\n"
1491 << "Old Instr: " << *It
1492 << "New Instr: " << *WaitcntInstr << '\n');
1493 }
1494
1495 if (WaitcntVsCntInstr) {
1496 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1497 AMDGPU::OpName::simm16, Wait.StoreCnt);
1498 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1499
1500 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1501 Wait.StoreCnt = ~0u;
1502
1503 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1504 ? dbgs() << "applied pre-existing waitcnt\n"
1505 << "New Instr at block end: " << *WaitcntVsCntInstr
1506 << '\n'
1507 : dbgs() << "applied pre-existing waitcnt\n"
1508 << "Old Instr: " << *It
1509 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1510 }
1511
1512 return Modified;
1513}
1514
1515/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1516/// required counters in \p Wait
1517bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1518 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1519 AMDGPU::Waitcnt Wait) {
1520 assert(ST);
1521 assert(isNormalMode(MaxCounter));
1522
1523 bool Modified = false;
1524 const DebugLoc &DL = Block.findDebugLoc(It);
1525
1526 // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
1527 // single instruction while VScnt has its own instruction.
1528 if (Wait.hasWaitExceptStoreCnt()) {
1529 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1530 [[maybe_unused]] auto SWaitInst =
1531 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1532 Modified = true;
1533
1534 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1535 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1536 dbgs() << "New Instr: " << *SWaitInst << '\n');
1537 }
1538
1539 if (Wait.hasWaitStoreCnt()) {
1540 assert(ST->hasVscnt());
1541
1542 [[maybe_unused]] auto SWaitInst =
1543 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1544 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1545 .addImm(Wait.StoreCnt);
1546 Modified = true;
1547
1548 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1549 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1550 dbgs() << "New Instr: " << *SWaitInst << '\n');
1551 }
1552
1553 return Modified;
1554}
1555
1556AMDGPU::Waitcnt
1557WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1558 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1559}
1560
1561AMDGPU::Waitcnt
1562WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1563 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1564 ~0u /* XCNT */);
1565}
1566
1567/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1568/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1569/// were added by previous passes. Currently this pass conservatively
1570/// assumes that these preexisting waits are required for correctness.
1571bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1572 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1573 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1574 assert(ST);
1575 assert(!isNormalMode(MaxCounter));
1576
1577 bool Modified = false;
1578 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1579 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1580 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1581
1582 LLVM_DEBUG({
1583 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1584 if (It == OldWaitcntInstr.getParent()->instr_end())
1585 dbgs() << "end of block\n";
1586 else
1587 dbgs() << *It;
1588 });
1589
1590 for (auto &II :
1591 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1592 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1593 if (II.isMetaInstruction()) {
1594 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1595 continue;
1596 }
1597
1598 MachineInstr **UpdatableInstr;
1599
1600 // Update required wait count. If this is a soft waitcnt (= it was added
1601 // by an earlier pass), it may be entirely removed.
1602
1603 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1604 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1605
1606 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1607 // attempt to do more than that either.
1608 if (Opcode == AMDGPU::S_WAITCNT)
1609 continue;
1610
1611 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1612 unsigned OldEnc =
1613 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1614 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1615 if (TrySimplify)
1616 ScoreBrackets.simplifyWaitcnt(OldWait);
1617 Wait = Wait.combined(OldWait);
1618 UpdatableInstr = &CombinedLoadDsCntInstr;
1619 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1620 unsigned OldEnc =
1621 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1622 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1623 if (TrySimplify)
1624 ScoreBrackets.simplifyWaitcnt(OldWait);
1625 Wait = Wait.combined(OldWait);
1626 UpdatableInstr = &CombinedStoreDsCntInstr;
1627 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1628 // Architectures higher than GFX10 do not have direct loads to
1629 // LDS, so no work required here yet.
1630 II.eraseFromParent();
1631 continue;
1632 } else {
1633 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1634 assert(CT.has_value());
1635 unsigned OldCnt =
1636 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1637 if (TrySimplify)
1638 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1639 addWait(Wait, CT.value(), OldCnt);
1640 UpdatableInstr = &WaitInstrs[CT.value()];
1641 }
1642
1643 // Merge consecutive waitcnt of the same type by erasing multiples.
1644 if (!*UpdatableInstr) {
1645 *UpdatableInstr = &II;
1646 } else {
1647 II.eraseFromParent();
1648 Modified = true;
1649 }
1650 }
1651
1652 if (CombinedLoadDsCntInstr) {
1653 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1654 // to be waited for. Otherwise, let the instruction be deleted so
1655 // the appropriate single counter wait instruction can be inserted
1656 // instead, when new S_WAIT_*CNT instructions are inserted by
1657 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1658 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1659 // the loop below that deals with single counter instructions.
1660 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1661 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1662 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1663 AMDGPU::OpName::simm16, NewEnc);
1664 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1665 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1666 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1667 Wait.LoadCnt = ~0u;
1668 Wait.DsCnt = ~0u;
1669
1670 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1671 ? dbgs() << "applied pre-existing waitcnt\n"
1672 << "New Instr at block end: "
1673 << *CombinedLoadDsCntInstr << '\n'
1674 : dbgs() << "applied pre-existing waitcnt\n"
1675 << "Old Instr: " << *It << "New Instr: "
1676 << *CombinedLoadDsCntInstr << '\n');
1677 } else {
1678 CombinedLoadDsCntInstr->eraseFromParent();
1679 Modified = true;
1680 }
1681 }
1682
1683 if (CombinedStoreDsCntInstr) {
1684 // Similarly for S_WAIT_STORECNT_DSCNT.
1685 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1686 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1687 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1688 AMDGPU::OpName::simm16, NewEnc);
1689 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1690 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1691 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1692 Wait.StoreCnt = ~0u;
1693 Wait.DsCnt = ~0u;
1694
1695 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1696 ? dbgs() << "applied pre-existing waitcnt\n"
1697 << "New Instr at block end: "
1698 << *CombinedStoreDsCntInstr << '\n'
1699 : dbgs() << "applied pre-existing waitcnt\n"
1700 << "Old Instr: " << *It << "New Instr: "
1701 << *CombinedStoreDsCntInstr << '\n');
1702 } else {
1703 CombinedStoreDsCntInstr->eraseFromParent();
1704 Modified = true;
1705 }
1706 }
1707
1708 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1709 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1710 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1711 // instructions so that createNewWaitcnt() will create new combined
1712 // instructions to replace them.
1713
1714 if (Wait.DsCnt != ~0u) {
1715 // This is a vector of addresses in WaitInstrs pointing to instructions
1716 // that should be removed if they are present.
1717 SmallVector<MachineInstr **, 3> WaitsToErase;
1718
1719 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1720 // both) need to be waited for, ensure that there are no existing
1721 // individual wait count instructions for these.
1722
1723 if (Wait.LoadCnt != ~0u) {
1724 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1725 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1726 } else if (Wait.StoreCnt != ~0u) {
1727 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1728 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1729 }
1730
1731 for (MachineInstr **WI : WaitsToErase) {
1732 if (!*WI)
1733 continue;
1734
1735 (*WI)->eraseFromParent();
1736 *WI = nullptr;
1737 Modified = true;
1738 }
1739 }
1740
1741 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1742 if (!WaitInstrs[CT])
1743 continue;
1744
1745 unsigned NewCnt = getWait(Wait, CT);
1746 if (NewCnt != ~0u) {
1747 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1748 AMDGPU::OpName::simm16, NewCnt);
1749 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1750
1751 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1752 setNoWait(Wait, CT);
1753
1754 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1755 ? dbgs() << "applied pre-existing waitcnt\n"
1756 << "New Instr at block end: " << *WaitInstrs[CT]
1757 << '\n'
1758 : dbgs() << "applied pre-existing waitcnt\n"
1759 << "Old Instr: " << *It
1760 << "New Instr: " << *WaitInstrs[CT] << '\n');
1761 } else {
1762 WaitInstrs[CT]->eraseFromParent();
1763 Modified = true;
1764 }
1765 }
1766
1767 return Modified;
1768}
1769
1770/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1771bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1772 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1773 AMDGPU::Waitcnt Wait) {
1774 assert(ST);
1775 assert(!isNormalMode(MaxCounter));
1776
1777 bool Modified = false;
1778 const DebugLoc &DL = Block.findDebugLoc(It);
1779
1780 // Check for opportunities to use combined wait instructions.
1781 if (Wait.DsCnt != ~0u) {
1782 MachineInstr *SWaitInst = nullptr;
1783
1784 if (Wait.LoadCnt != ~0u) {
1785 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1786
1787 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1788 .addImm(Enc);
1789
1790 Wait.LoadCnt = ~0u;
1791 Wait.DsCnt = ~0u;
1792 } else if (Wait.StoreCnt != ~0u) {
1793 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1794
1795 SWaitInst =
1796 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1797 .addImm(Enc);
1798
1799 Wait.StoreCnt = ~0u;
1800 Wait.DsCnt = ~0u;
1801 }
1802
1803 if (SWaitInst) {
1804 Modified = true;
1805
1806 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1807 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1808 dbgs() << "New Instr: " << *SWaitInst << '\n');
1809 }
1810 }
1811
1812 // Generate an instruction for any remaining counter that needs
1813 // waiting for.
1814
1815 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1816 unsigned Count = getWait(Wait, CT);
1817 if (Count == ~0u)
1818 continue;
1819
1820 [[maybe_unused]] auto SWaitInst =
1821 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1822 .addImm(Count);
1823
1824 Modified = true;
1825
1826 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1827 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1828 dbgs() << "New Instr: " << *SWaitInst << '\n');
1829 }
1830
1831 return Modified;
1832}
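// Illustrative example of the output above: when both DSCNT and LOADCNT must
// reach zero, a single combined instruction is emitted, and any remaining
// counter gets its own S_WAIT_*CNT:
//
//   s_wait_loadcnt_dscnt 0x0   ; loadcnt(0) and dscnt(0) in one encoding
//   s_wait_storecnt 0x0        ; storecnt handled separately
//
// (Exact disassembly formatting depends on the assembler version.)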
1833
1834static bool readsVCCZ(const MachineInstr &MI) {
1835 unsigned Opc = MI.getOpcode();
1836 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1837 !MI.getOperand(1).isUndef();
1838}
1839
1840/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1841static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1842 // Currently all conventions wait, but this may not always be the case.
1843 //
1844 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1845 // sense to omit the wait and do it in the caller.
1846 return true;
1847}
1848
1849/// \returns true if the callee is expected to wait for any outstanding waits
1850/// before returning.
1851static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
1852
1853 /// Generate an s_waitcnt instruction to be placed before \p MI.
1854/// Instructions of a given type are returned in order,
1855/// but instructions of different types can complete out of order.
1856/// We rely on this in-order completion
1857/// and simply assign a score to the memory access instructions.
1858/// We keep track of the active "score bracket" to determine
1859/// if an access of a memory read requires an s_waitcnt
1860/// and if so what the value of each counter is.
1861/// The "score bracket" is bound by the lower bound and upper bound
1862/// scores (*_score_LB and *_score_ub respectively).
1863/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1864/// flush the vmcnt counter here.
1865bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1866 WaitcntBrackets &ScoreBrackets,
1867 MachineInstr *OldWaitcntInstr,
1868 bool FlushVmCnt) {
1869 setForceEmitWaitcnt();
1870
1871 assert(!MI.isMetaInstruction());
1872
1873 AMDGPU::Waitcnt Wait;
1874
1875 // FIXME: This should have already been handled by the memory legalizer.
1876 // Removing this currently doesn't affect any lit tests, but we need to
1877 // verify that nothing was relying on this. The number of buffer invalidates
1878 // being handled here should not be expanded.
1879 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1880 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1881 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1882 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1883 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1884 Wait.LoadCnt = 0;
1885 }
1886
1887 // All waits must be resolved at call return.
1888 // NOTE: this could be improved with knowledge of all call sites or
1889 // with knowledge of the called routines.
1890 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1891 MI.getOpcode() == AMDGPU::SI_RETURN ||
1892 MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
1893 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1894 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1895 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1896 }
1897 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1898 // Technically the hardware will do this on its own if we don't, but that
1899 // might cost extra cycles compared to doing it explicitly.
1900 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1901 // have to wait for outstanding VMEM stores. In this case it can be useful to
1902 // send a message to explicitly release all VGPRs before the stores have
1903 // completed, but it is only safe to do this if there are no outstanding
1904 // scratch stores.
1905 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1906 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1907 if (!WCG->isOptNone() &&
1908 (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
1909 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1910 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1911 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
1912 ReleaseVGPRInsts.insert(&MI);
1913 }
1914 // Resolve vm waits before gs-done.
1915 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1916 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1917 ST->hasLegacyGeometry() &&
1918 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1919 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1920 Wait.LoadCnt = 0;
1921 }
1922
1923 // Export & GDS instructions do not read the EXEC mask until after the export
1924 // is granted (which can occur well after the instruction is issued).
1925 // The shader program must flush all EXP operations on the export-count
1926 // before overwriting the EXEC mask.
1927 else {
1928 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1929 // Export and GDS are tracked individually, either may trigger a waitcnt
1930 // for EXEC.
1931 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1932 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1933 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1934 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1935 Wait.ExpCnt = 0;
1936 }
1937 }
1938
1939 // Wait for any pending GDS instruction to complete before any
1940 // "Always GDS" instruction.
1941 if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
1942 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
1943
1944 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1945 // The function is going to insert a wait on everything in its prolog.
1946 // This still needs to be careful if the call target is a load (e.g. a GOT
1947 // load). We also need to check WAW dependency with saved PC.
1948 Wait = AMDGPU::Waitcnt();
1949
1950 const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1951 if (CallAddrOp.isReg()) {
1952 RegInterval CallAddrOpInterval =
1953 ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOp);
1954
1955 ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
1956 Wait);
1957
1958 if (const auto *RtnAddrOp =
1959 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
1960 RegInterval RtnAddrOpInterval =
1961 ScoreBrackets.getRegInterval(&MI, MRI, TRI, *RtnAddrOp);
1962
1963 ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
1964 Wait);
1965 }
1966 }
1967 } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) {
1968 ScoreBrackets.tryClearSCCWriteEvent(&MI);
1969 } else {
1970 // FIXME: Should not be relying on memoperands.
1971 // Look at the source operands of every instruction to see if
1972 // any of them results from a previous memory operation that affects
1973 // its current usage. If so, an s_waitcnt instruction needs to be
1974 // emitted.
1975 // If the source operand was defined by a load, add the s_waitcnt
1976 // instruction.
1977 //
1978 // Two cases are handled for destination operands:
1979 // 1) If the destination operand was defined by a load, add the s_waitcnt
1980 // instruction to guarantee the right WAW order.
1981 // 2) If a destination operand was used by a recent export/store instruction,
1982 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1983
1984 for (const MachineMemOperand *Memop : MI.memoperands()) {
1985 const Value *Ptr = Memop->getValue();
1986 if (Memop->isStore()) {
1987 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
1988 addWait(Wait, SmemAccessCounter, 0);
1989 if (PDT->dominates(MI.getParent(), It->second))
1990 SLoadAddresses.erase(It);
1991 }
1992 }
1993 unsigned AS = Memop->getAddrSpace();
1994 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1995 continue;
1996 // No need to wait before load from VMEM to LDS.
1997 if (TII->mayWriteLDSThroughDMA(MI))
1998 continue;
1999
2000 // LOAD_CNT is only relevant to vgpr or LDS.
2001 unsigned RegNo = FIRST_LDS_VGPR;
2002 if (Ptr && Memop->getAAInfo()) {
2003 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2004 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2005 if (MI.mayAlias(AA, *LDSDMAStores[I], true))
2006 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
2007 }
2008 } else {
2009 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
2010 }
2011 if (Memop->isStore()) {
2012 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
2013 }
2014 }
2015
2016 // Loop over use and def operands.
2017 for (const MachineOperand &Op : MI.operands()) {
2018 if (!Op.isReg())
2019 continue;
2020
2021 // If the instruction does not read tied source, skip the operand.
2022 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2023 continue;
2024
2025 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op);
2026
2027 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
2028 if (IsVGPR) {
2029 // Implicit VGPR defs and uses are never a part of the memory
2030 // instructions description and usually present to account for
2031 // super-register liveness.
2032 // TODO: Most of the other instructions also have implicit uses
2033 // for the liveness accounting only.
2034 if (Op.isImplicit() && MI.mayLoadOrStore())
2035 continue;
2036
2037 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2038 // previous write and this write are the same type of VMEM
2039 // instruction, in which case they are (in some architectures)
2040 // guaranteed to write their results in order anyway.
2041 // Additionally check instructions where Point Sample Acceleration
2042 // might be applied.
2043 if (Op.isUse() || !updateVMCntOnly(MI) ||
2044 ScoreBrackets.hasOtherPendingVmemTypes(Interval,
2045 getVmemType(MI)) ||
2046 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
2047 !ST->hasVmemWriteVgprInOrder()) {
2048 ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
2049 ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);
2050 ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
2051 ScoreBrackets.clearVgprVmemTypes(Interval);
2052 }
2053
2054 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2055 ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
2056 }
2057 ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
2058 } else if (Op.getReg() == AMDGPU::SCC) {
2059 ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
2060 } else {
2061 ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
2062 }
2063
2064 if (hasXcnt() && Op.isDef())
2065 ScoreBrackets.determineWait(X_CNT, Interval, Wait);
2066 }
2067 }
2068 }
2069
2070 // Ensure safety against exceptions from outstanding memory operations while
2071 // waiting for a barrier:
2072 //
2073 // * Some subtargets safely handle backing off the barrier in hardware
2074 // when an exception occurs.
2075 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2076 // there can be no outstanding memory operations during the wait.
2077 // * Subtargets with split barriers don't need to back off the barrier; it
2078 // is up to the trap handler to preserve the user barrier state correctly.
2079 //
2080 // In all other cases, ensure safety by ensuring that there are no outstanding
2081 // memory operations.
2082 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
2083 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
2084 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2085 }
2086
2087 // TODO: Remove this work-around, enable the assert for Bug 457939
2088 // after fixing the scheduler. Also, the Shader Compiler code is
2089 // independent of target.
2090 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
2091 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2092 Wait.DsCnt = 0;
2093 }
2094 }
2095
2096 // Verify that the wait is actually needed.
2097 ScoreBrackets.simplifyWaitcnt(Wait);
2098
2099 // When forcing emit, skip terminators: emitting a waitcnt between
2100 // terminators would break the MBB's terminator sequence.
2101 if (ForceEmitZeroFlag && !MI.isTerminator())
2102 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2103
2104 if (ForceEmitWaitcnt[LOAD_CNT])
2105 Wait.LoadCnt = 0;
2106 if (ForceEmitWaitcnt[EXP_CNT])
2107 Wait.ExpCnt = 0;
2108 if (ForceEmitWaitcnt[DS_CNT])
2109 Wait.DsCnt = 0;
2110 if (ForceEmitWaitcnt[SAMPLE_CNT])
2111 Wait.SampleCnt = 0;
2112 if (ForceEmitWaitcnt[BVH_CNT])
2113 Wait.BvhCnt = 0;
2114 if (ForceEmitWaitcnt[KM_CNT])
2115 Wait.KmCnt = 0;
2116 if (ForceEmitWaitcnt[X_CNT])
2117 Wait.XCnt = 0;
2118
2119 if (FlushVmCnt) {
2120 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2121 Wait.LoadCnt = 0;
2122 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2123 Wait.SampleCnt = 0;
2124 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2125 Wait.BvhCnt = 0;
2126 }
2127
2128 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2129 Wait.LoadCnt = 0;
2130
2131 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2132 OldWaitcntInstr);
2133}
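// Illustrative end-to-end example (pre-GFX12 syntax, hand-written): a VMEM
// load followed by a use of its result gets a wait inserted just before the
// use by the logic above:
//
//   global_load_dword v1, v2, s[0:1]
//   ...                            ; independent work, no wait needed yet
//   s_waitcnt vmcnt(0)             ; inserted before the first use of v1
//   v_add_u32 v3, v1, v1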
2134
2135bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2136 MachineBasicBlock::instr_iterator It,
2137 MachineBasicBlock &Block,
2138 WaitcntBrackets &ScoreBrackets,
2139 MachineInstr *OldWaitcntInstr) {
2140 bool Modified = false;
2141
2142 if (OldWaitcntInstr)
2143 // Try to merge the required wait with preexisting waitcnt instructions.
2144 // Also erase redundant waitcnt.
2145 Modified =
2146 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2147
2148 // Any counts that could have been applied to any existing waitcnt
2149 // instructions will have been done so, now deal with any remaining.
2150 ScoreBrackets.applyWaitcnt(Wait);
2151
2152 // ExpCnt can be merged into VINTERP.
2153 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2154 SIInstrInfo::isVINTERP(*It)) {
2155 MachineOperand *WaitExp =
2156 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2157 if (Wait.ExpCnt < WaitExp->getImm()) {
2158 WaitExp->setImm(Wait.ExpCnt);
2159 Modified = true;
2160 }
2161 Wait.ExpCnt = ~0u;
2162
2163 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2164 << "Update Instr: " << *It);
2165 }
2166
2167 // XCnt may be already consumed by a load wait.
2168 if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
2169 !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
2170 Wait.XCnt = ~0u;
2171
2172 if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
2173 !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
2174 Wait.XCnt = ~0u;
2175
2176 // Since the translation for VMEM addresses occurs in-order, we can skip the
2177 // XCnt if the current instruction is of VMEM type and has a memory dependency
2178 // with another VMEM instruction in flight.
2179 if (Wait.XCnt != ~0u && isVmemAccess(*It))
2180 Wait.XCnt = ~0u;
2181
2182 if (WCG->createNewWaitcnt(Block, It, Wait))
2183 Modified = true;
2184
2185 return Modified;
2186}
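// Illustrative example of the ExpCnt folding above: rather than emitting a
// separate wait in front of a VINTERP instruction, the required export count
// is folded into its waitexp operand, e.g.
//
//   v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0
//
// (The operand spelling is shown as an assumption; the exact syntax is
// whatever the VINTERP assembly printer uses for OpName::waitexp.)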
2187
2188// This is a flat memory operation. Check to see if it has memory tokens other
2189// than LDS. Other address spaces supported by flat memory operations involve
2190// global memory.
2191bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
2192 assert(TII->isFLAT(MI));
2193
2194 // All flat instructions use the VMEM counter except prefetch.
2195 if (!TII->usesVM_CNT(MI))
2196 return false;
2197
2198 // If there are no memory operands then conservatively assume the flat
2199 // operation may access VMEM.
2200 if (MI.memoperands_empty())
2201 return true;
2202
2203 // See if any memory operand specifies an address space that involves VMEM.
2204 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
2205 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
2206 // (GDS) address space is not supported by flat operations. Therefore, simply
2207 // return true unless only the LDS address space is found.
2208 for (const MachineMemOperand *Memop : MI.memoperands()) {
2209 unsigned AS = Memop->getAddrSpace();
2211 if (AS != AMDGPUAS::LOCAL_ADDRESS)
2212 return true;
2213 }
2214
2215 return false;
2216}
2217
2218// This is a flat memory operation. Check to see if it has memory tokens for
2219// either LDS or FLAT.
2220bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
2221 assert(TII->isFLAT(MI));
2222
2223 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
2224 if (!TII->usesLGKM_CNT(MI))
2225 return false;
2226
2227 // If in tgsplit mode then there can be no use of LDS.
2228 if (ST->isTgSplitEnabled())
2229 return false;
2230
2231 // If there are no memory operands then conservatively assume the flat
2232 // operation may access LDS.
2233 if (MI.memoperands_empty())
2234 return true;
2235
2236 // See if any memory operand specifies an address space that involves LDS.
2237 for (const MachineMemOperand *Memop : MI.memoperands()) {
2238 unsigned AS = Memop->getAddrSpace();
2239 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2240 return true;
2241 }
2242
2243 return false;
2244}
2245
2246bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2247 return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
2248 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2249}
2250
2251static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
2252 auto Opc = Inst.getOpcode();
2253 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
2254 Opc == AMDGPU::GLOBAL_WBINV;
2255}
2256
2257// Return true if the next instruction is S_ENDPGM, following fallthrough
2258// blocks if necessary.
2259bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2260 MachineBasicBlock *Block) const {
2261 auto BlockEnd = Block->getParent()->end();
2262 auto BlockIter = Block->getIterator();
2263
2264 while (true) {
2265 if (It.isEnd()) {
2266 if (++BlockIter != BlockEnd) {
2267 It = BlockIter->instr_begin();
2268 continue;
2269 }
2270
2271 return false;
2272 }
2273
2274 if (!It->isMetaInstruction())
2275 break;
2276
2277 It++;
2278 }
2279
2280 assert(!It.isEnd());
2281
2282 return It->getOpcode() == AMDGPU::S_ENDPGM;
2283}
2284
2285// Add a wait after an instruction if architecture requirements mandate one.
2286bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2287 MachineBasicBlock &Block,
2288 WaitcntBrackets &ScoreBrackets) {
2289 AMDGPU::Waitcnt Wait;
2290 bool NeedsEndPGMCheck = false;
2291
2292 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2293 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2294 !SIInstrInfo::isAtomicRet(Inst));
2295
2296 if (TII->isAlwaysGDS(Inst.getOpcode())) {
2297 Wait.DsCnt = 0;
2298 NeedsEndPGMCheck = true;
2299 }
2300
2301 ScoreBrackets.simplifyWaitcnt(Wait);
2302
2303 auto SuccessorIt = std::next(Inst.getIterator());
2304 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2305 /*OldWaitcntInstr=*/nullptr);
2306
2307 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2308 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
2309 .addImm(0);
2310 }
2311
2312 return Result;
2313}
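// Illustrative sequence (assuming a GWS instruction, which isAlwaysGDS): the
// forced wait is placed after the instruction, and an S_NOP is added when the
// next instruction is S_ENDPGM:
//
//   ds_gws_barrier v0 gds
//   s_waitcnt lgkmcnt(0)
//   s_nop 0
//   s_endpgm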
2314
2315void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2316 WaitcntBrackets *ScoreBrackets) {
2317 // Now look at the instruction opcode. If it is a memory access
2318 // instruction, update the upper-bound of the appropriate counter's
2319 // bracket and the destination operand scores.
2320 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2321
2322 bool IsVMEMAccess = false;
2323 bool IsSMEMAccess = false;
2324 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2325 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2326 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2327 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
2328 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
2329 ScoreBrackets->setPendingGDS();
2330 } else {
2331 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2332 }
2333 } else if (TII->isFLAT(Inst)) {
2334 if (isGFX12CacheInvOrWBInst(Inst)) {
2335 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2336 Inst);
2337 return;
2338 }
2339
2340 assert(Inst.mayLoadOrStore());
2341
2342 int FlatASCount = 0;
2343
2344 if (mayAccessVMEMThroughFlat(Inst)) {
2345 ++FlatASCount;
2346 IsVMEMAccess = true;
2347 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2348 Inst);
2349 }
2350
2351 if (mayAccessLDSThroughFlat(Inst)) {
2352 ++FlatASCount;
2353 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2354 }
2355
2356 // This is a flat memory operation that accesses both VMEM and LDS, so note it
2357 // - it will require that both the VM and LGKM be flushed to zero if it is
2358 // pending when a VM or LGKM dependency occurs.
2359 if (FlatASCount > 1)
2360 ScoreBrackets->setPendingFlat();
2361 } else if (SIInstrInfo::isVMEM(Inst) &&
2362 !AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2363 IsVMEMAccess = true;
2364 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2365 Inst);
2366
2367 if (ST->vmemWriteNeedsExpWaitcnt() &&
2368 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2369 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
2370 }
2371 } else if (TII->isSMRD(Inst)) {
2372 IsSMEMAccess = true;
2373 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2374 } else if (Inst.isCall()) {
2375 if (callWaitsOnFunctionReturn(Inst)) {
2376 // Act as a wait on everything
2377 ScoreBrackets->applyWaitcnt(
2378 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2379 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2380 } else {
2381 // May need to wait for anything.
2382 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2383 }
2384 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2385 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
2386 } else if (TII->isVINTERP(Inst)) {
2387 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2388 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2389 } else if (SIInstrInfo::isEXP(Inst)) {
2390 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2391 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2392 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
2393 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2394 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
2395 else
2396 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2397 } else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
2398 ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
2399 } else {
2400 switch (Inst.getOpcode()) {
2401 case AMDGPU::S_SENDMSG:
2402 case AMDGPU::S_SENDMSG_RTN_B32:
2403 case AMDGPU::S_SENDMSG_RTN_B64:
2404 case AMDGPU::S_SENDMSGHALT:
2405 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
2406 break;
2407 case AMDGPU::S_MEMTIME:
2408 case AMDGPU::S_MEMREALTIME:
2409 case AMDGPU::S_GET_BARRIER_STATE_M0:
2410 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2411 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2412 break;
2413 }
2414 }
2415
2416 if (!hasXcnt())
2417 return;
2418
2419 if (IsVMEMAccess)
2420 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst);
2421
2422 if (IsSMEMAccess)
2423 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst);
2424}
2425
2426bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2427 unsigned OtherScore) {
2428 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2429 unsigned OtherShifted =
2430 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2431 Score = std::max(MyShifted, OtherShifted);
2432 return OtherShifted > MyShifted;
2433}
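// Worked example of the arithmetic above (values chosen for illustration):
// with M.OldLB = 2, M.MyShift = 3, M.OtherLB = 0 and M.OtherShift = 5, a
// local score of 4 (above its old lower bound) is rebased to 4 + 3 = 7, while
// an incoming score of 1 is rebased to 1 + 5 = 6. The merged score is
// max(7, 6) = 7, and since the incoming side did not win, the merge does not
// force a tighter wait. A score at or below its old lower bound maps to 0,
// i.e. "already waited for".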
2434
2435 /// Merge the pending events and associated score brackets of \p Other into
2436/// this brackets status.
2437///
2438/// Returns whether the merge resulted in a change that requires tighter waits
2439/// (i.e. the merged brackets strictly dominate the original brackets).
2440bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2441 bool StrictDom = false;
2442
2443 VgprUB = std::max(VgprUB, Other.VgprUB);
2444 SgprUB = std::max(SgprUB, Other.SgprUB);
2445
2446 for (auto T : inst_counter_types(Context->MaxCounter)) {
2447 // Merge event flags for this counter
2448 const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
2449 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2450 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2451 if (OtherEvents & ~OldEvents)
2452 StrictDom = true;
2453 PendingEvents |= OtherEvents;
2454
2455 // Merge scores for this counter
2456 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2457 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2458 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2459 if (NewUB < ScoreLBs[T])
2460 report_fatal_error("waitcnt score overflow");
2461
2462 MergeInfo M;
2463 M.OldLB = ScoreLBs[T];
2464 M.OtherLB = Other.ScoreLBs[T];
2465 M.MyShift = NewUB - ScoreUBs[T];
2466 M.OtherShift = NewUB - Other.ScoreUBs[T];
2467
2468 ScoreUBs[T] = NewUB;
2469
2470 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2471
2472 if (T == DS_CNT)
2473 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2474
2475 if (T == KM_CNT) {
2476 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2477 if (Other.hasPendingEvent(SCC_WRITE)) {
2478 unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2479 if (!OldEventsHasSCCWrite) {
2480 PendingSCCWrite = Other.PendingSCCWrite;
2481 } else {
2482 if (PendingSCCWrite != Other.PendingSCCWrite)
2483 PendingSCCWrite = nullptr;
2484 }
2485 }
2486 }
2487
2488 for (int J = 0; J <= VgprUB; J++)
2489 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
2490
2491 if (isSmemCounter(T)) {
2492 unsigned Idx = getSgprScoresIdx(T);
2493 for (int J = 0; J <= SgprUB; J++)
2494 StrictDom |=
2495 mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
2496 }
2497 }
2498
2499 for (int J = 0; J <= VgprUB; J++) {
2500 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2501 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2502 VgprVmemTypes[J] = NewVmemTypes;
2503 }
2504
2505 return StrictDom;
2506}
2507
2508static bool isWaitInstr(MachineInstr &Inst) {
2509 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2510 return Opcode == AMDGPU::S_WAITCNT ||
2511 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2512 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2513 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2514 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2515 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2516 counterTypeForInstr(Opcode).has_value();
2517}
2518
2519bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) {
2520 return Opcode == AMDGPU::S_BARRIER_LEAVE ||
2521 Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
2522 Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
2523}
2524
2525// Generate s_waitcnt instructions where needed.
2526bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2527 MachineBasicBlock &Block,
2528 WaitcntBrackets &ScoreBrackets) {
2529 bool Modified = false;
2530
2531 LLVM_DEBUG({
2532 dbgs() << "*** Begin Block: ";
2533 Block.printName(dbgs());
2534 ScoreBrackets.dump();
2535 });
2536
2537 // Track the correctness of vccz through this basic block. There are two
2538 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2539 // ST->partialVCCWritesUpdateVCCZ().
2540 bool VCCZCorrect = true;
2541 if (ST->hasReadVCCZBug()) {
2542 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2543 // to vcc and then issued an smem load.
2544 VCCZCorrect = false;
2545 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2546 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2547 // to vcc_lo or vcc_hi.
2548 VCCZCorrect = false;
2549 }
2550
2551 // Walk over the instructions.
2552 MachineInstr *OldWaitcntInstr = nullptr;
2553
2554 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2555 E = Block.instr_end();
2556 Iter != E;) {
2557 MachineInstr &Inst = *Iter;
2558 if (Inst.isMetaInstruction()) {
2559 ++Iter;
2560 continue;
2561 }
2562
2563 // Track pre-existing waitcnts that were added in earlier iterations or by
2564 // the memory legalizer.
2565 if (isWaitInstr(Inst)) {
2566 if (!OldWaitcntInstr)
2567 OldWaitcntInstr = &Inst;
2568 ++Iter;
2569 continue;
2570 }
2571
2572 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2573 isPreheaderToFlush(Block, ScoreBrackets);
2574
2575 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2576 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2577 FlushVmCnt);
2578 OldWaitcntInstr = nullptr;
2579
2580 // Restore vccz if it's not known to be correct already.
2581 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
2582
2583 // Don't examine operands unless we need to track vccz correctness.
2584 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2585 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2586 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2587 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2588 if (!ST->partialVCCWritesUpdateVCCZ())
2589 VCCZCorrect = false;
2590 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2591 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2592 // vccz bit, so when we detect that an instruction may read from a
2593 // corrupt vccz bit, we need to:
2594 // 1. Insert s_waitcnt lgkmcnt(0) to wait for all outstanding SMRD
2595 // operations to complete.
2596 // 2. Restore the correct value of vccz by writing the current value
2597 // of vcc back to vcc.
2598 if (ST->hasReadVCCZBug() &&
2599 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2600 // Writes to vcc while there's an outstanding smem read may get
2601 // clobbered as soon as any read completes.
2602 VCCZCorrect = false;
2603 } else {
2604 // Writes to vcc will fix any incorrect value in vccz.
2605 VCCZCorrect = true;
2606 }
2607 }
2608 }
2609
2610 if (TII->isSMRD(Inst)) {
2611 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2612 // No need to handle invariant loads when avoiding WAR conflicts, as
2613 // there cannot be a vector store to the same memory location.
2614 if (!Memop->isInvariant()) {
2615 const Value *Ptr = Memop->getValue();
2616 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2617 }
2618 }
2619 if (ST->hasReadVCCZBug()) {
2620 // This smem read could complete and clobber vccz at any time.
2621 VCCZCorrect = false;
2622 }
2623 }
2624
2625 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2626
2627 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2628
2629 LLVM_DEBUG({
2630 Inst.print(dbgs());
2631 ScoreBrackets.dump();
2632 });
2633
2634 // TODO: Remove this work-around after fixing the scheduler and enable the
2635 // assert above.
2636 if (RestoreVCCZ) {
2637 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2638 // bit is updated, so we can restore the bit by reading the value of
2639 // vcc and then writing it back to the register.
2640 BuildMI(Block, Inst, Inst.getDebugLoc(),
2641 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2642 TRI->getVCC())
2643 .addReg(TRI->getVCC());
2644 VCCZCorrect = true;
2645 Modified = true;
2646 }
2647
2648 ++Iter;
2649 }
2650
2651 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2652 // needed.
2653 AMDGPU::Waitcnt Wait;
2654 if (Block.getFirstTerminator() == Block.end() &&
2655 isPreheaderToFlush(Block, ScoreBrackets)) {
2656 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2657 Wait.LoadCnt = 0;
2658 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2659 Wait.SampleCnt = 0;
2660 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2661 Wait.BvhCnt = 0;
2662 }
2663
2664 // Combine or remove any redundant waitcnts at the end of the block.
2665 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2666 OldWaitcntInstr);
2667
2668 LLVM_DEBUG({
2669 dbgs() << "*** End Block: ";
2670 Block.printName(dbgs());
2671 ScoreBrackets.dump();
2672 });
2673
2674 return Modified;
2675}
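// Illustrative example of the vccz restore above (wave64): writing vcc back
// to itself refreshes the vccz bit after it may have been clobbered by an
// in-flight SMEM load on affected subtargets:
//
//   s_waitcnt lgkmcnt(0)      ; wait for outstanding SMEM first
//   s_mov_b64 vcc, vcc        ; rewrite vcc to restore vccz
//   s_cbranch_vccz .LBB0_2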
2676
2677// Return true if the given machine basic block is a preheader of a loop in
2678// which we want to flush the vmcnt counter, and false otherwise.
2679bool SIInsertWaitcnts::isPreheaderToFlush(
2680 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
2681 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2682 if (!IsInserted)
2683 return Iterator->second;
2684
2685 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2686 if (!Succ)
2687 return false;
2688
2689 MachineLoop *Loop = MLI->getLoopFor(Succ);
2690 if (!Loop)
2691 return false;
2692
2693 if (Loop->getLoopPreheader() == &MBB &&
2694 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2695 Iterator->second = true;
2696 return true;
2697 }
2698
2699 return false;
2700}
2701
2702bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2703 if (TII->isFLAT(MI))
2704 return mayAccessVMEMThroughFlat(MI);
2705 return SIInstrInfo::isVMEM(MI);
2706}
2707
2708// Return true if it is better to flush the vmcnt counter in the preheader of
2709// the given loop. We currently decide to flush in two situations:
2710// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2711// vgpr containing a value that is loaded outside of the loop. (Only on
2712// targets with no vscnt counter).
2713// 2. The loop contains vmem load(s), but the loaded values are not used in the
2714// loop, and at least one use of a vgpr containing a value that is loaded
2715// outside of the loop.
2716bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2717 const WaitcntBrackets &Brackets) {
2718 bool HasVMemLoad = false;
2719 bool HasVMemStore = false;
2720 bool UsesVgprLoadedOutside = false;
2721 DenseSet<Register> VgprUse;
2722 DenseSet<Register> VgprDef;
2723
2724 for (MachineBasicBlock *MBB : ML->blocks()) {
2725 for (MachineInstr &MI : *MBB) {
2726 if (isVMEMOrFlatVMEM(MI)) {
2727 if (MI.mayLoad())
2728 HasVMemLoad = true;
2729 if (MI.mayStore())
2730 HasVMemStore = true;
2731 }
2732 for (const MachineOperand &Op : MI.all_uses()) {
2733 if (!TRI->isVectorRegister(*MRI, Op.getReg()))
2734 continue;
2735 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
2736 // Vgpr use
2737 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2738 // If we find a register that is loaded inside the loop, 1. and 2.
2739 // are invalidated and we can exit.
2740 if (VgprDef.contains(RegNo))
2741 return false;
2742 VgprUse.insert(RegNo);
2743 // If at least one of Op's registers is in the score brackets, the
2744 // value is likely loaded outside of the loop.
2745 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2746 Brackets.getScoreLB(LOAD_CNT) ||
2747 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2748 Brackets.getScoreLB(SAMPLE_CNT) ||
2749 Brackets.getRegScore(RegNo, BVH_CNT) >
2750 Brackets.getScoreLB(BVH_CNT)) {
2751 UsesVgprLoadedOutside = true;
2752 break;
2753 }
2754 }
2755 }
2756
2757 // VMem load vgpr def
2758 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
2759 for (const MachineOperand &Op : MI.all_defs()) {
2760 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
2761 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2762 // If we find a register that is loaded inside the loop, 1. and 2.
2763 // are invalidated and we can exit.
2764 if (VgprUse.contains(RegNo))
2765 return false;
2766 VgprDef.insert(RegNo);
2767 }
2768 }
2769 }
2770 }
2771 }
2772 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2773 return true;
2774 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2775}
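// Illustrative scenario for the heuristic above (hand-written sketch): a
// value is loaded before the loop and only used inside it. Flushing vmcnt in
// the preheader makes the in-loop wait trivially satisfied on every
// iteration instead of being re-evaluated against in-flight loads:
//
//   global_load_dword v1, v2, s[0:1]   ; load outside the loop
//   s_waitcnt vmcnt(0)                 ; flushed in the preheader
// loop:
//   v_add_u32 v3, v3, v1               ; uses the preloaded value
//   s_cbranch_scc1 loop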
2776
2777bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2778 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2779 auto *PDT =
2780 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2781 AliasAnalysis *AA = nullptr;
2782 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2783 AA = &AAR->getAAResults();
2784
2785 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2786}
2787
2788 PreservedAnalyses
2789SIInsertWaitcntsPass::run(MachineFunction &MF,
2790 MachineFunctionAnalysisManager &MFAM) {
2791 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
2792 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
2793 AliasAnalysis *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2794 .getManager()
2795 .getCachedResult<AAManager>(MF.getFunction());
2796
2797 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2798 return PreservedAnalyses::all();
2799
2800
2801 return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>()
2802 .preserve<AAManager>();
2803}
2804
2805bool SIInsertWaitcnts::run(MachineFunction &MF) {
2806 ST = &MF.getSubtarget<GCNSubtarget>();
2807 TII = ST->getInstrInfo();
2808 TRI = &TII->getRegisterInfo();
2809 MRI = &MF.getRegInfo();
2810 IV = AMDGPU::getIsaVersion(ST->getCPU());
2811
2812 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2813
2814 if (ST->hasExtendedWaitCounts()) {
2815 MaxCounter = NUM_EXTENDED_INST_CNTS;
2816 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2817 WCG = &WCGGFX12Plus;
2818 } else {
2819 MaxCounter = NUM_NORMAL_INST_CNTS;
2820 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2821 WCG = &WCGPreGFX12;
2822 }
2823
2824 for (auto T : inst_counter_types())
2825 ForceEmitWaitcnt[T] = false;
2826
2827 WaitEventMaskForInst = WCG->getWaitEventMask();
2828
2829 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2830
2831 if (ST->hasExtendedWaitCounts()) {
2832 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2833 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2834 } else {
2835 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2836 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2837 }
2838 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2839 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2840 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2841 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2842 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2843 Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
2844
2845 [[maybe_unused]] unsigned NumVGPRsMax =
2846 ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
2847 [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2848 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2849 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2850
2851 BlockInfos.clear();
2852 bool Modified = false;
2853
2854 MachineBasicBlock &EntryBB = MF.front();
2855 MachineBasicBlock::iterator I = EntryBB.begin();
2856
2857 if (!MFI->isEntryFunction()) {
2858 // Wait for any outstanding memory operations that the input registers may
2859 // depend on. We can't track them and it's better to do the wait after the
2860 // costly call sequence.
2861
2862 // TODO: Could insert earlier and schedule more liberally with operations
2863 // that only use caller preserved registers.
2864 for (MachineBasicBlock::iterator E = EntryBB.end();
2865 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2866 ;
2867
2868 if (ST->hasExtendedWaitCounts()) {
2869 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2870 .addImm(0);
2871 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2872 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2873 continue;
2874
2875 if (!ST->hasImageInsts() &&
2876 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2877 continue;
2878
2879 BuildMI(EntryBB, I, DebugLoc(),
2880 TII->get(instrsForExtendedCounterTypes[CT]))
2881 .addImm(0);
2882 }
2883 } else {
2884 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2885 }
2886
2887 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
2888 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2889 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2890
2891 Modified = true;
2892 }
2893
2894 // Keep iterating over the blocks in reverse post order, inserting and
2895 // updating s_waitcnt where needed, until a fix point is reached.
2896 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2897 BlockInfos.try_emplace(MBB);
2898
2899 std::unique_ptr<WaitcntBrackets> Brackets;
2900 bool Repeat;
2901 do {
2902 Repeat = false;
2903
2904 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2905 ++BII) {
2906 MachineBasicBlock *MBB = BII->first;
2907 BlockInfo &BI = BII->second;
2908 if (!BI.Dirty)
2909 continue;
2910
2911 if (BI.Incoming) {
2912 if (!Brackets)
2913 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2914 else
2915 *Brackets = *BI.Incoming;
2916 } else {
2917 if (!Brackets) {
2918 Brackets = std::make_unique<WaitcntBrackets>(this);
2919 } else {
2920 // Reinitialize in-place. N.B. do not do this by assigning from a
2921 // temporary because the WaitcntBrackets class is large and it could
2922 // cause this function to use an unreasonable amount of stack space.
2923 Brackets->~WaitcntBrackets();
2924 new (Brackets.get()) WaitcntBrackets(this);
2925 }
2926 }
2927
2928 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2929 BI.Dirty = false;
2930
2931 if (Brackets->hasPendingEvent()) {
2932 BlockInfo *MoveBracketsToSucc = nullptr;
2933 for (MachineBasicBlock *Succ : MBB->successors()) {
2934 auto *SuccBII = BlockInfos.find(Succ);
2935 BlockInfo &SuccBI = SuccBII->second;
2936 if (!SuccBI.Incoming) {
2937 SuccBI.Dirty = true;
2938 if (SuccBII <= BII) {
2939 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2940 Repeat = true;
2941 }
2942 if (!MoveBracketsToSucc) {
2943 MoveBracketsToSucc = &SuccBI;
2944 } else {
2945 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2946 }
2947 } else if (SuccBI.Incoming->merge(*Brackets)) {
2948 SuccBI.Dirty = true;
2949 if (SuccBII <= BII) {
2950 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2951 Repeat = true;
2952 }
2953 }
2954 }
2955 if (MoveBracketsToSucc)
2956 MoveBracketsToSucc->Incoming = std::move(Brackets);
2957 }
2958 }
2959 } while (Repeat);
2960
2961 if (ST->hasScalarStores()) {
2962 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2963 bool HaveScalarStores = false;
2964
2965 for (MachineBasicBlock &MBB : MF) {
2966 for (MachineInstr &MI : MBB) {
2967 if (!HaveScalarStores && TII->isScalarStore(MI))
2968 HaveScalarStores = true;
2969
2970 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2971 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2972 EndPgmBlocks.push_back(&MBB);
2973 }
2974 }
2975
2976 if (HaveScalarStores) {
2977 // If scalar writes are used, the cache must be flushed or else the next
2978 // wave to reuse the same scratch memory can be clobbered.
2979 //
2980 // Insert s_dcache_wb at wave termination points if there were any scalar
2981 // stores, and only if the cache hasn't already been flushed. This could
2982 // be improved by looking across blocks for flushes in postdominating
2983 // blocks from the stores but an explicitly requested flush is probably
2984 // very rare.
2985 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2986 bool SeenDCacheWB = false;
2987
2988 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2989 I != E; ++I) {
2990 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2991 SeenDCacheWB = true;
2992 else if (TII->isScalarStore(*I))
2993 SeenDCacheWB = false;
2994
2995 // FIXME: It would be better to insert this before a waitcnt if any.
2996 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2997 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2998 !SeenDCacheWB) {
2999 Modified = true;
3000 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
3001 }
3002 }
3003 }
3004 }
3005 }
3006
3007 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
3008 // This is done in different ways depending on how the VGPRs were allocated
3009 // (i.e. whether we're in dynamic VGPR mode or not).
3010 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
3011 // waveslot limited kernel runs slower with the deallocation.
3012 if (MFI->isDynamicVGPREnabled()) {
3013 for (MachineInstr *MI : ReleaseVGPRInsts) {
3014 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3015 TII->get(AMDGPU::S_ALLOC_VGPR))
3016 .addImm(0);
3017 Modified = true;
3018 }
3019 } else {
3020 if (!ReleaseVGPRInsts.empty() &&
3021 (MF.getFrameInfo().hasCalls() ||
3022 ST->getOccupancyWithNumVGPRs(
3023 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
3024 /*IsDynamicVGPR=*/false) <
3026 for (MachineInstr *MI : ReleaseVGPRInsts) {
3027 if (ST->requiresNopBeforeDeallocVGPRs()) {
3028 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3029 TII->get(AMDGPU::S_NOP))
3030 .addImm(0);
3031 }
3032 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3033 TII->get(AMDGPU::S_SENDMSG))
3034 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
3035 Modified = true;
3036 }
3037 }
3038 }
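// Illustrative result of the non-dynamic-VGPR path above (GFX11+): the
// deallocation message is placed immediately before the S_ENDPGM recorded in
// ReleaseVGPRInsts, with an S_NOP first on subtargets that require it:
//
//   s_nop 0
//   s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
//   s_endpgm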
3039 ReleaseVGPRInsts.clear();
3040 PreheadersToFlush.clear();
3041 SLoadAddresses.clear();
3042
3043 return Modified;
3044}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:58
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
#define AMDGPU_EVENT_NAME(Name)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static bool readsVCCZ(const MachineInstr &MI)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83