diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index aaefe27b1324f..ea2dc494dabd9 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -284,6 +284,39 @@ void GCNHazardRecognizer::processBundle() {
   CurrCycleInstr = nullptr;
 }
 
+void GCNHazardRecognizer::processBundleBottomUp() {
+  // Process the instructions in this bundle during bottom-up scheduling.
+  // We only use this during post-RA scheduling, so hazard recognizer mode
+  // should never be active here (it always runs top-down).
+  assert(!IsHazardRecognizerMode &&
+         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
+
+  // Step through each instruction in the bundle in program order.
+  MachineBasicBlock::instr_iterator MI =
+      std::next(CurrCycleInstr->getIterator());
+  MachineBasicBlock::instr_iterator End =
+      CurrCycleInstr->getParent()->instr_end();
+
+  // Evict stale entries to maintain a fixed lookahead window.
+  // TODO: Hazard detection is not yet implemented. This scheduling
+  // is intended for GFX11 and newer.
+  for (; MI != End && MI->isInsideBundle(); ++MI) {
+    CurrCycleInstr = &*MI;
+
+    // Remove up to (MaxLookAhead - 1) oldest entries.
+    for (unsigned I = 0, E = MaxLookAhead - 1; I < E && !EmittedInstrs.empty();
+         ++I)
+      EmittedInstrs.pop_back();
+
+    EmittedInstrs.push_front(CurrCycleInstr);
+
+    // Keep only the most recent MaxLookAhead entries.
+    EmittedInstrs.resize(MaxLookAhead);
+  }
+
+  CurrCycleInstr = nullptr;
+}
+
 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
   assert(IsHazardRecognizerMode);
 
@@ -423,7 +456,41 @@ void GCNHazardRecognizer::AdvanceCycle() {
 }
 
 void GCNHazardRecognizer::RecedeCycle() {
-  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
+  // If no instruction was issued this cycle, drop the oldest entry.
+  if (!CurrCycleInstr) {
+    if (!EmittedInstrs.empty())
+      EmittedInstrs.pop_back();
+    return;
+  }
+
+  // If this is a bundle header, handle the entire bundle here.
+  if (CurrCycleInstr->isBundle()) {
+    processBundleBottomUp();
+    return;
+  }
+
+  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
+  if (!NumWaitStates) {
+    CurrCycleInstr = nullptr;
+    return;
+  }
+
+  // Add the current instruction to the emitted list.
+  EmittedInstrs.push_front(CurrCycleInstr);
+
+  // Model the remaining wait states by removing older entries.
+  for (unsigned I = 1, E = std::min(NumWaitStates, getMaxLookAhead()); I < E;
+       ++I) {
+    if (!EmittedInstrs.empty())
+      EmittedInstrs.pop_back();
+  }
+
+  // getMaxLookAhead() is the largest number of wait states we will ever need
+  // to insert, so there is no point in keeping track of more than that many
+  // wait states.
+  EmittedInstrs.resize(getMaxLookAhead());
+
+  CurrCycleInstr = nullptr;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index bbc55851bf967..88c7426be552d 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -69,6 +69,10 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   // Advance over a MachineInstr bundle. Look for hazards in the bundled
   // instructions.
   void processBundle();
+  // Recede over a MachineInstr bundle. Adds bundled instructions to the
+  // EmittedInstrs queue in bottom-up scheduling mode.
+  // TODO: Hazard detection is not yet implemented.
+  void processBundleBottomUp();
 
   // Run on an individual instruction in hazard recognizer mode. This can be
   // used on a newly inserted instruction before returning from PreEmitNoops.
diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir
index 7bdb8f5b35ec5..02ebffca84bda 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=postmisched -verify-misched -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=postmisched -verify-misched -o - %s | FileCheck -check-prefix=CHECK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=postmisched -misched-postra-direction=bottomup -verify-misched -o - %s | FileCheck -check-prefix=CHECK-BOTTOMUP %s
 
 --- |
   define amdgpu_kernel void @no_sched_barrier(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
@@ -29,6 +30,21 @@ body: |
     ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
     ; CHECK-NEXT: }
     ; CHECK-NEXT: S_ENDPGM 0
+    ;
+    ; CHECK-BOTTOMUP-LABEL: name: no_sched_barrier
+    ; CHECK-BOTTOMUP: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-BOTTOMUP-NEXT: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+    ; CHECK-BOTTOMUP-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT: }
+    ; CHECK-BOTTOMUP-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; CHECK-BOTTOMUP-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+    ; CHECK-BOTTOMUP-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT: }
+    ; CHECK-BOTTOMUP-NEXT: S_ENDPGM 0
     renamable $sgpr0_sgpr1 = IMPLICIT_DEF
     renamable $vgpr0 = IMPLICIT_DEF
     BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
@@ -66,6 +82,22 @@ body: |
     ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
     ; CHECK-NEXT: }
     ; CHECK-NEXT: S_ENDPGM 0
+    ;
+    ; CHECK-BOTTOMUP-LABEL: name: sched_barrier_mask_0
+    ; CHECK-BOTTOMUP: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-BOTTOMUP-NEXT: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+    ; CHECK-BOTTOMUP-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT: }
+    ; CHECK-BOTTOMUP-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; CHECK-BOTTOMUP-NEXT: SCHED_BARRIER 0
+    ; CHECK-BOTTOMUP-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+    ; CHECK-BOTTOMUP-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT: }
+    ; CHECK-BOTTOMUP-NEXT: S_ENDPGM 0
     renamable $sgpr0_sgpr1 = IMPLICIT_DEF
     renamable $vgpr0 = IMPLICIT_DEF
     BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
@@ -105,6 +137,22 @@ body: |
     ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
     ; CHECK-NEXT: }
     ; CHECK-NEXT: S_ENDPGM 0
+    ;
+    ; CHECK-BOTTOMUP-LABEL: name: sched_barrier_mask_1
+    ; CHECK-BOTTOMUP: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-BOTTOMUP-NEXT: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+    ; CHECK-BOTTOMUP-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT: }
+    ; CHECK-BOTTOMUP-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; CHECK-BOTTOMUP-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; CHECK-BOTTOMUP-NEXT: SCHED_BARRIER 1
+    ; CHECK-BOTTOMUP-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 {
+    ; CHECK-BOTTOMUP-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-BOTTOMUP-NEXT: }
+    ; CHECK-BOTTOMUP-NEXT: S_ENDPGM 0
     renamable $sgpr0_sgpr1 = IMPLICIT_DEF
     renamable $vgpr0 = IMPLICIT_DEF
     BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
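For readers unfamiliar with the ScheduleHazardRecognizer bookkeeping, here is a minimal standalone sketch (not LLVM code) of the sliding lookahead window that the RecedeCycle() above maintains. Instr, getNumWaitStates(), and the MaxLookAhead value of 5 are hypothetical stand-ins for MachineInstr, TII.getNumWaitStates(), and the recognizer's real window size; the std::list discipline mirrors the patch: newest entry at the front, one older entry popped from the back per remaining wait state, and resize() clamping the window (padding with nullptr wait-state placeholders when it runs short).

#include <algorithm>
#include <cstdio>
#include <list>

struct Instr { const char *Name; };        // hypothetical stand-in for MachineInstr
static constexpr unsigned MaxLookAhead = 5; // assumed window size
static std::list<Instr *> EmittedInstrs;    // front = most recent, back = oldest

// Hypothetical stand-in for TII.getNumWaitStates(MI).
static unsigned getNumWaitStates(Instr *) { return 2; }

static void recedeCycle(Instr *Cur) {
  if (!Cur) {
    // Stall cycle: age the window by dropping the oldest entry.
    if (!EmittedInstrs.empty())
      EmittedInstrs.pop_back();
    return;
  }
  unsigned NumWaitStates = getNumWaitStates(Cur);
  if (!NumWaitStates)
    return;
  EmittedInstrs.push_front(Cur); // newest at the front
  // Each remaining wait state consumes one older entry.
  for (unsigned I = 1, E = std::min(NumWaitStates, MaxLookAhead); I < E; ++I)
    if (!EmittedInstrs.empty())
      EmittedInstrs.pop_back();
  // Clamp to the lookahead window; std::list::resize pads with nullptr.
  EmittedInstrs.resize(MaxLookAhead);
}

int main() {
  Instr A{"A"}, B{"B"};
  recedeCycle(&A);      // issue A while receding
  recedeCycle(nullptr); // scheduler stalled for one cycle
  recedeCycle(&B);      // issue B
  for (Instr *I : EmittedInstrs)
    std::printf("%s\n", I ? I->Name : "<wait state>");
}

The new RUN line above exercises this window end to end (llc ... -run-pass=postmisched -misched-postra-direction=bottomup); per the TODOs in the patch, hazard detection on top of this bookkeeping is not implemented yet.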